Generative Chatbot
==================
> CSCK507 Group Project: Group A
> 
This is a generative chatbot that uses a seq2seq model.

# Importing Libraries

In [1]:
import codecs
import csv
import io
import itertools
import json
import math
import os
import random
import re
import tarfile
import unicodedata
import zipfile
from io import open

import numpy as np
import pandas as pd
import polars as pl
import requests
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import optim
from torch.jit import script, trace
import torchtext as tt

In [2]:
# Ask spaCy to use a GPU, then choose the torch device the same way:
# CUDA when torch reports it as available, otherwise the CPU.
spacy.prefer_gpu()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Data Preprocessing

## Importing the dataset

In [3]:
def download_file(url, dir):
    """
    Download an archive from a url and extract it into a directory.

    Only urls ending in ``.tar.gz`` or ``.zip`` are supported. For any other
    url 'Unknown file type' is printed and nothing is downloaded.

    :param url: url of the archive
    :param dir: directory the archive contents are extracted into
    :return: None
    :raises requests.HTTPError: if the server answers with an error status
    """
    is_tar = url.endswith('.tar.gz')
    is_zip = url.endswith('.zip')
    if not (is_tar or is_zip):
        # Decide before touching the network: there is no point in fetching
        # a payload that cannot be extracted afterwards.
        print('Unknown file type')
        return None

    r = requests.get(url)
    # Fail with a clear HTTP error instead of handing an error page to the
    # archive reader below.
    r.raise_for_status()
    payload = io.BytesIO(r.content)

    # NOTE(review): extractall() writes whatever member paths the archive
    # declares; only use this with trusted download sources.
    if is_tar:
        with tarfile.open(fileobj=payload, mode="r:gz") as z:
            z.extractall(dir)
    else:
        with zipfile.ZipFile(payload) as z:
            z.extractall(dir)
    return None

def extract_zip(filename, dir):
    """
    Extract every member of a zip file into a directory.

    :param filename: path of the zip file
    :param dir: directory the contents are extracted into
    :return: None
    """
    # Close the archive as soon as extraction is done so the caller can
    # delete or move the zip file right away (an open handle can prevent
    # that on some platforms, e.g. Windows).
    with zipfile.ZipFile(filename) as z:
        z.extractall(dir)
    return None

In [4]:
# Dataset name -> where its archive comes from (a download url or a local path).
# Disabled sources, kept for reference:
#   'WikiQACorpus': 'https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip'
#   'Question_Answer_Dataset_v1.2': 'https://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz'
datasets = {
    'ubuntu-dialogue': 'data/ubuntu dialogue.zip',
}

In [5]:
# Create the data directory; exist_ok makes this a no-op on re-runs
os.makedirs('data', exist_ok=True)

# Make every configured dataset available, skipping work that is already done
for dataset, source in datasets.items():
    if os.path.exists('data/' + dataset):
        print(dataset + ' already exists')
    elif dataset == 'ubuntu-dialogue':
        # This dataset is not downloaded here: its zip has to be placed at
        # `source` by hand and is unpacked into the folder below.
        ubuntu = 'data/ubuntu dialogue'
        if os.path.exists(source):
            # exist_ok: the folder may be left over from an earlier (partial)
            # extraction; a plain makedirs would raise FileExistsError then.
            os.makedirs(ubuntu, exist_ok=True)
            extract_zip(source, ubuntu)
            os.remove(source)
            print(dataset + ' extracted')
        elif os.path.exists(ubuntu):
            print(dataset + ' already exists')
        else:
            kag = 'https://www.kaggle.com/datasets/rtatman/ubuntu-dialogue-corpus/download?datasetVersionNumber=2'
            print(f'Manually download ubuntu dialogue dataset from {kag} and place in data folder')
    else:
        download_file(source, 'data')
        print(dataset + ' downloaded')

ubuntu-dialogue already exists


## Preparing the dataset

### Read Data

In [6]:
# File-name suffix of each corpus variant; the 'small' one is loaded below
variants = {
    'small': '',
    'medium': '_196',
    'large': '_301',
}
ubuntufile = f'data/ubuntu dialogue/ubuntu-dialogue-corpus/dialogueText{variants["small"]}.csv'
text_df = pd.read_csv(ubuntufile)
# Keep only the part of each dialogueID before the first '.' and store it as an int
text_df['dialogueID'] = text_df['dialogueID'].map(lambda raw_id: int(raw_id.split('.')[0]))
print(text_df.shape)


(1038324, 6)


In [7]:
# Preview the first rows of the Ubuntu dialogue dataset
text_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,3,126125,2008-04-23T14:55:00.000Z,bad_image,,"Hello folks, please help me a bit with the fol..."
1,3,126125,2008-04-23T14:56:00.000Z,bad_image,,Did I choose a bad channel? I ask because you ...
2,3,126125,2008-04-23T14:57:00.000Z,lordleemo,bad_image,the second sentence is better english and we...
3,3,64545,2009-08-01T06:22:00.000Z,mechtech,,Sock Puppe?t
4,3,64545,2009-08-01T06:22:00.000Z,mechtech,,WTF?


### Preprocess Data

In [8]:
def unicodetoascii(text):
    """
    Strip accents from a string by removing Unicode combining marks.

    The input is converted with ``str``, decomposed with NFKD, and every
    character of category ``Mn`` (non-spacing mark) is dropped. Characters
    without a decomposition (for example ``ß``) are returned unchanged, so
    the result is not guaranteed to be pure ASCII.

    :param text: text to be converted (``str(text)`` is used)
    :return: text without combining marks
    """
    normalized_text = unicodedata.normalize('NFKD', str(text))
    return ''.join(char for char in normalized_text if unicodedata.category(char) != 'Mn')

def preprocess_text(text, fn=unicodetoascii):
    """
    Normalise a message into lower-case, space-separated tokens.

    Example: ``"He said 'go.'"`` becomes ``"he said ' go . '"``.

    :param text: raw message
    :param fn: callable applied first; it must return a string
    :return: cleaned text containing only letters, ``. ! ? '`` and single spaces
    """
    text = fn(text)
    text = text.lower()
    # Replace every run of characters other than letters and . ! ? ' with one
    # space. Digits, commas and double quotes are removed here, so the later
    # patterns never see them.
    text = re.sub(r"[^a-zA-Z.!?']+", " ", text)
    # Drop !/? runs glued between two letters ("puppe?t" -> "puppet").
    # Lookarounds leave the letters unconsumed, so chains such as "a?b?c"
    # are cleaned completely.
    text = re.sub(r"(?<=\w)[!?]+(?=\w)", "", text)
    # Put a space in front of sentence punctuation so it becomes its own token
    text = re.sub(r"([.!?])", r" \1", text)
    # Detach an apostrophe that borders whitespace or a period (a quote mark
    # rather than a contraction). Only the apostrophe itself is replaced; the
    # neighbouring character, e.g. the period in ".'", is kept.
    text = re.sub(r"'(?=[\s.])", " ' ", text)
    text = re.sub(r"(?<=[\s.])'", " ' ", text)
    # Collapse repeated spaces and trim both ends
    text = re.sub(r"\s\s+", " ", text).strip()
    return text

def parse_dialogue(data):
    """
    Build (context, response) sentence pairs for every dialogue.

    Rows are grouped by ``dialogueID`` and walked in order. Each message is a
    response whose context is the previous message of the same dialogue.
    Consecutive messages with the same (sender, recipient) are joined with a
    space into one response. Pairs with an empty context (the opening message
    of a dialogue) are dropped.

    :param data: DataFrame with 'dialogueID', 'from', 'to' and 'text' columns;
                 it is not modified
    :return: dict mapping dialogueID to a dict that maps the index label of
             the row that started a response to its (context, response) tuple
    """
    dialogues = {}
    # Group by dialogueID
    for dialogue_id, group in data.groupby('dialogueID'):
        sentence_pairs = {}
        context = ''
        previous_direction = None  # no message seen yet in this dialogue
        last_key = None  # key of the pair written most recently
        rows = zip(group.index, group['from'], group['to'], group['text'])
        for idx, sender, recipient, text in rows:
            response = str(text)
            # Map missing senders/recipients (NaN/None) to None so two
            # messages without a recipient compare equal; NaN != NaN would
            # otherwise make the comparison depend on object identity.
            sender = None if pd.isna(sender) else sender
            recipient = None if pd.isna(recipient) else recipient
            direction = (sender, recipient)

            if direction == previous_direction:
                # Same sender and recipient as the previous message: extend
                # the response of the pair that message belongs to.
                response = context + ' ' + response
                sentence_pairs[last_key] = (sentence_pairs[last_key][0], response)
            else:
                # Any change of direction starts a new pair keyed by this row
                sentence_pairs[idx] = (context, response)
                last_key = idx

            previous_direction = direction
            context = response  # response is the context for the next message
        # remove the sentence pairs that do not have context but only responses
        dialogues[dialogue_id] = {k: v for k, v in sentence_pairs.items() if v[0] != ''}

    return dialogues

In [9]:
# Clean every message with preprocess_text. The 'text' column is overwritten
# in place, so the raw text is no longer available in text_df after this cell.
text_df['text'] = text_df['text'].apply(preprocess_text)
text_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,3,126125,2008-04-23T14:55:00.000Z,bad_image,,hello folks please help me a bit with the foll...
1,3,126125,2008-04-23T14:56:00.000Z,bad_image,,did i choose a bad channel ? i ask because you...
2,3,126125,2008-04-23T14:57:00.000Z,lordleemo,bad_image,the second sentence is better english and we a...
3,3,64545,2009-08-01T06:22:00.000Z,mechtech,,sock puppet
4,3,64545,2009-08-01T06:22:00.000Z,mechtech,,wtf ?


In [10]:
# Group the cleaned messages by dialogueID and build their (context, response) pairs
dialogues = parse_dialogue(text_df)

In [13]:
# Show the pairs extracted for dialogue 126125 (the first dialogue in the preview above)
dialogues[126125]

{2: ("hello folks please help me a bit with the following sentence ' order here your personal photos or videos . ' i think the only allowed version is ' order your personal videos or photos here . ' but i'm not sure are you ? did i choose a bad channel ? i ask because you seem to be dumb like windows user",
  'the second sentence is better english and we are not dumb')}