In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import string
import pickle

from datetime import datetime
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Width of one word vector; tokenEmbed uses it as the column count of the embedding matrix.
EMBED_DIM = 300
# Text file with the pre-trained GloVe vectors (on the mounted Drive), read by tokenEmbed.
PRETRAINED_VEC = "/content/drive/MyDrive/CapstoneProject/NewVersion/glove.6B.300d.txt"

# 1. Utils

### 1.1 Helper Functions

In [None]:
def timer(start_time=None):
  """
  Small stopwatch based on the wall clock.

  - Called without a start mark (or with any falsy value): returns the
    current ``datetime`` so it can be passed back in later.
  - Called with a start mark: prints the time elapsed since that mark as
    hours / minutes / seconds and returns ``None``.
  """
  if start_time:
    elapsed = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
    return None
  return datetime.now()


def saveData(obj, filename_path):
  """
  Write ``obj`` to ``filename_path``; the format is picked from the extension.

  - ``.csv``    : ``obj.to_csv(filename_path, index=False)`` (``obj`` must
                  provide ``to_csv``, e.g. a pandas DataFrame)
  - ``.pickle`` : ``pickle.dump`` with the highest available protocol

  Raises:
    ValueError: for any other extension. Nothing is written in that case,
      so the success message must not be printed.
  """
  file_type = '.' + filename_path.split('.')[-1]
  if file_type == '.csv':
    obj.to_csv(filename_path, index=False)
  elif file_type == '.pickle':
    with open(filename_path, 'wb') as handle:
      pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
  else:
    raise ValueError("Unsupported file type %r: expected '.csv' or '.pickle'." % file_type)
  print('Data saved successully.')


def loadData(filename_path):
  """
  Read an object back from ``filename_path``; the reader is picked from the
  extension.

  - ``.csv``    : returns ``pd.read_csv(filename_path)``
  - ``.pickle`` : returns the unpickled object

  Raises:
    ValueError: for any other extension.
  """
  file_type = '.' + filename_path.split('.')[-1]
  if file_type == '.csv':
    return pd.read_csv(filename_path)
  if file_type == '.pickle':
    # NOTE: unpickling can run arbitrary code; only load files you trust.
    # The with-block closes the handle even if pickle.load raises.
    with open(filename_path, 'rb') as handle:
      return pickle.load(handle)
  raise ValueError("Unsupported file type %r: expected '.csv' or '.pickle'." % file_type)

### 1.2 Functions for Data Cleaning

In [None]:
def extend(text):
  """
  Expand English contractions, e.g. "won't" -> "will not", "I'm" -> "I am".

  The input is coerced with ``str()``. Matching is case-insensitive because
  this runs before the text is lower-cased: a sentence-initial "Won't" or
  "Can't" must hit the dedicated rule instead of falling through to the
  generic "n't" rule (which would yield "Wo not" / "Ca not"). Replacement
  text is always lower case.
  """
  text = str(text)
  # Order matters: the two irregular forms come first, and "n't" must be
  # tried before the bare "'t" rule.
  rules = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
  )
  for pattern, replacement in rules:
    text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
  return text


def removeHTML(text):
  """
  Strip web addresses from the text.

  Despite the name, HTML markup is not touched: only tokens that start with
  "www." or "http://" / "https://" are deleted, up to the next whitespace.
  The input is coerced with ``str()`` first.
  """
  url_pattern = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
  return url_pattern.sub('', str(text))


def removePuncLower(text):
  """
  Replace every character of ``string.punctuation`` with a single space and
  lower-case the result. The input is coerced with ``str()`` first.
  """
  to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
  return str(text).translate(to_space).lower()


# https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
  """
  Delete every character that falls into one of the code-point ranges below.

  The last range (U+24C2 to U+1F251) is very wide: besides emoji it also
  contains many ordinary characters (e.g. CJK ideographs), which are removed
  as well. ``text`` must already be a string.
  """
  code_point_ranges = (
    u"\U0001F600-\U0001F64F",  # emoticons
    u"\U0001F300-\U0001F5FF",  # symbols & pictographs
    u"\U0001F680-\U0001F6FF",  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    u"\U00002702-\U000027B0",
    u"\U000024C2-\U0001F251",
  )
  emoji_pattern = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r'', text)


def preprocess(corpus):
  """
  Clean every message of ``corpus`` and return the cleaned strings as a list.

  Per message, in this order:
  1. literal escape sequences (backslash followed by r, n or a double quote)
     become a space
  2. URLs are removed (removeHTML)
  3. contractions are expanded (extend)
  4. punctuation becomes spaces and the text is lower-cased (removePuncLower)
  5. digit runs become a space
  6. whitespace is collapsed to single spaces and trimmed

  Step 1 has to run before step 4: removePuncLower turns the backslash
  itself into a space, so a later search for backslash+letter can never
  match and would leave stray "r" / "n" tokens in the text.

  Items are coerced with ``str()``, so a missing value ends up as the text
  "nan"; drop missing rows before calling this.
  """
  preprocessed = list()
  for message in corpus:
    msg = str(message)
    for escaped in ('\\r', '\\"', '\\n'):
      msg = msg.replace(escaped, ' ')
    msg = removeHTML(msg)
    msg = extend(msg)
    msg = removePuncLower(msg)
    # clean numbers
    msg = re.sub('[0-9]+', ' ', msg)
    # split/join collapses whitespace runs and trims both ends
    preprocessed.append(' '.join(msg.split()))
  return preprocessed

### 1.3 Functions for word-ID mapping

In [None]:
def get_tokenizer(txt):
  """
  Create a Keras ``Tokenizer`` and fit its vocabulary on the texts in ``txt``.

  ``filters=''`` switches off the tokenizer's own character filtering
  (presumably so markers such as <bos>/<eos> stay intact - the texts are
  already cleaned by preprocess).
  """
  fitted = Tokenizer(filters='')
  fitted.fit_on_texts(txt)
  return fitted


def encode_pad_seq(tokenizer, length, txt):
  """
  Turn the texts into integer id sequences with ``tokenizer`` and bring them
  all to ``length`` entries; zeros are appended at the end (``padding='post'``).
  """
  return pad_sequences(tokenizer.texts_to_sequences(txt), maxlen=length, padding='post')


def wordIDmapping(tokenizer):
  """
  Return ``(word2id, id2word)`` built from ``tokenizer.word_index``.

  ``word2id`` is a fresh copy of the word index; ``id2word`` is its inverse.
  """
  vocab = tokenizer.word_index
  word2id = dict(vocab)
  id2word = {idx: word for word, idx in vocab.items()}
  return word2id, id2word


def tokenEmbed(tokenizer):
  """
  Build the embedding matrix for the tokenizer's vocabulary from the
  pre-trained word vectors stored at PRETRAINED_VEC ("glove.6B.300d.txt").

  Returns:
    Array of shape ``(len(tokenizer.word_index) + 1, EMBED_DIM)``. Row ``i``
    holds the pre-trained vector of the word whose id is ``i``; rows of words
    that do not occur in the vector file stay all-zero.
  """
  vocab = tokenizer.word_index
  embedding_matrix = np.zeros((len(vocab) + 1, EMBED_DIM))
  # The with-block closes the (large) vector file; the encoding is explicit so
  # reading does not depend on the platform default.
  with open(PRETRAINED_VEC, encoding='utf-8') as pretrained_vec:
    for line in pretrained_vec:
      values = line.split()
      if not values:
        continue  # blank line
      # Only keep vectors of words we actually need instead of loading the
      # whole file into a dictionary first.
      row = vocab.get(values[0])
      if row is not None:
        embedding_matrix[row] = np.asarray(values[1:], dtype='float32')
  return embedding_matrix

# 2. Preprocess

In [None]:
# Raw input CSV files on the mounted Drive.
BASELINE_DATA = '/content/drive/MyDrive/CapstoneProject/NewVersion/single_qna.csv'
MODEL_DATA = '/content/drive/MyDrive/CapstoneProject/NewVersion/topical_chat.csv'

# Folders for the processed artefacts of each dataset.
BASELINE_ROOT = '/content/drive/MyDrive/CapstoneProject/NewVersion/Baseline'
MODEL_ROOT = '/content/drive/MyDrive/CapstoneProject/NewVersion/MyModel'

# Longest baseline answer that is kept, in words (passed to restrictLength;
# counted before the <bos>/<eos> markers are added).
ANS_LENGTH = 5

In [None]:
# Read both raw CSV files into DataFrames (loadData picks the reader from the extension).
baseline_data = loadData(BASELINE_DATA)
model_data = loadData(MODEL_DATA)

### 2.1 Preprocess baseline data

In [None]:
baseline_data

Unnamed: 0,QuestionType,Asin,AnswerTime,UnixTime,Question,AnswerType,Answer,Category
0,yes/no,B00004U9JP,"Jun 27, 2014",1.403852e+09,I have a 9 year old Badger 1 that needs replac...,?,I replaced my old one with this without a hitch.,Appliances
1,open-ended,B00004U9JP,"Apr 28, 2014",1.398668e+09,model number,,This may help InSinkErator Model BADGER-1: Bad...,Appliances
2,yes/no,B00004U9JP,"Aug 25, 2014",1.408950e+09,can I replace Badger 1 1/3 with a Badger 5 1/2...,?,Plumbing connections will vary with different ...,Appliances
3,yes/no,B00004U9JP,"Nov 3, 2014",1.415002e+09,Does this come with power cord and dishwasher ...,?,It does not come with a power cord. It does co...,Appliances
4,open-ended,B00004U9JP,"Jun 21, 2014",1.403334e+09,loud noise inside when turned on. sounds like ...,,Check if you dropped something inside.Usually ...,Appliances
...,...,...,...,...,...,...,...,...
1396891,yes/no,B00KGGJPYA,"Nov 26, 2014",1.416989e+09,Does the adaptor cord for the iPhone 5 work wi...,N,No,Video Games
1396892,yes/no,B00KGGJPYA,"Oct 19, 2014",1.413702e+09,will it charge a kidle fire?,Y,"Simply answered, yes. It comes with a 3-in-1 a...",Video Games
1396893,open-ended,B00KGGJPYA,"Oct 15, 2014",1.413356e+09,What are the dimensions of this product?,,4 by 1 1/2 inches,Video Games
1396894,yes/no,B00KGGJPYA,"Jul 15, 2014",1.405408e+09,Does this have connector for 5C? I think 5C is...,?,I was able to charge Gembonics battery with iP...,Video Games


In [None]:
def removeCols(df):
  """
  Return a copy of ``df`` without the columns that are not needed downstream
  ('Asin', 'AnswerTime', 'UnixTime', 'Category', 'AnswerType'), without rows
  that contain missing values, and with a fresh 0..n-1 index.

  ``df`` itself is left untouched, so the cell can be re-run on the same raw
  frame. Rows with missing values are really dropped here: ``dropna()``
  returns a new frame and its result must be kept.

  Raises:
    KeyError: if one of the columns to drop is not present.
  """
  unused = ['Asin', 'AnswerTime', 'UnixTime', 'Category', 'AnswerType']
  return df.drop(columns=unused).dropna().reset_index(drop=True)


def addLengthCol(df):
  """
  Clean the 'Question' / 'Answer' text and add word-count columns.

  Steps:
  - drop rows containing missing values; this happens before cleaning
    because the cleaning coerces every item with str(), which would turn a
    missing value into ordinary text
  - run preprocess on 'Question' and 'Answer'
  - drop duplicate rows
  - add 'QuestionLength' / 'AnswerLength' (number of whitespace-separated words)
  - discard rows whose question or answer is empty after cleaning

  Returns a new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
  """
  cleaned = df.dropna().copy()
  cleaned['Question'] = preprocess(cleaned['Question'])
  cleaned['Answer'] = preprocess(cleaned['Answer'])
  cleaned = cleaned.drop_duplicates().reset_index(drop=True)

  # add QA length
  cleaned['QuestionLength'] = cleaned['Question'].str.split().apply(len)
  cleaned['AnswerLength'] = cleaned['Answer'].str.split().apply(len)

  # remove QA with length 0
  non_empty = (cleaned['AnswerLength'] > 0) & (cleaned['QuestionLength'] > 0)
  return cleaned[non_empty].reset_index(drop=True)


def restrictLength(df, Anslength):
  """
  Restrict the question and answer lengths and build the training text.

  - keeps rows with QuestionLength <= int(mean QuestionLength of ``df``)
  - keeps rows with AnswerLength <= ``Anslength``
  - wraps every kept answer as '<bos> ... <eos>'
  - adds a 'QA' column: question, a space, then the wrapped answer

  Returns a new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
  An empty ``df`` is returned (as a copy, with the 'QA' column added) instead
  of failing on the undefined mean.
  """
  if len(df):
    Qlength = int(df['QuestionLength'].mean())
    keep = (df['QuestionLength'] <= Qlength) & (df['AnswerLength'] <= Anslength)
    df = df[keep]
  # reset_index returns a fresh frame, so the column assignments below do not
  # write into a slice of the caller's data (no SettingWithCopyWarning).
  out = df.reset_index(drop=True)
  # add begin & end indicators to answers, add QA column
  out['Answer'] = '<bos> ' + out['Answer'].astype(str) + ' <eos>'
  out['QA'] = out['Question'].astype(str) + ' ' + out['Answer'].astype(str)
  return out


def processDataframe(df, DATASET_TYPE):
  """
  Run the cleaning pipeline that belongs to the given dataset.

  - 'BASELINE': removeCols -> addLengthCol -> restrictLength(ANS_LENGTH)
  - 'MODEL'   : drop rows without a message, drop the 'sentiment' column,
                clean the 'message' text with preprocess. Missing messages
                are removed before cleaning because the cleaning would
                otherwise turn them into ordinary text. This branch builds a
                new frame and leaves ``df`` untouched.

  Raises:
    ValueError: for any other ``DATASET_TYPE`` (instead of silently handing
      back the unprocessed frame).
  """
  if DATASET_TYPE == 'BASELINE':
    df = removeCols(df)
    df = addLengthCol(df)
    return restrictLength(df, ANS_LENGTH)
  if DATASET_TYPE == 'MODEL':
    cleaned = (
      df.dropna(subset=['message'])
        .drop(columns='sentiment')
        .reset_index(drop=True)
    )
    cleaned['message'] = preprocess(cleaned['message'])
    return cleaned
  raise ValueError("Unknown DATASET_TYPE %r: expected 'BASELINE' or 'MODEL'." % (DATASET_TYPE,))

In [None]:
# Drop unused columns, clean the text, add the length columns and keep only the short Q/A pairs.
baseline_df = processDataframe(baseline_data, 'BASELINE')
baseline_df

Unnamed: 0,QuestionType,Question,Answer,QuestionLength,AnswerLength,QA
0,open-ended,where is the reset button located,<bos> on the bottom <eos>,6,3,where is the reset button located <bos> on the...
1,yes/no,is there a connection for the dish washer,<bos> yes <eos>,8,1,is there a connection for the dish washer <bos...
2,yes/no,does this test forlead,<bos> yes it does <eos>,4,3,does this test forlead <bos> yes it does <eos>
3,yes/no,does it work with kenmore humidifier,<bos> yes it does <eos>,6,3,does it work with kenmore humidifier <bos> yes...
4,yes/no,does this filter fit the holmes hm,<bos> yes <eos>,7,1,does this filter fit the holmes hm <bos> yes <...
...,...,...,...,...,...,...
148399,yes/no,can you play the tomodachi life on a regular n...,<bos> no ds only <eos>,11,3,can you play the tomodachi life on a regular n...
148400,yes/no,is this like sims,<bos> sure is <eos>,4,2,is this like sims <bos> sure is <eos>
148401,open-ended,is this a hard copy or a download,<bos> hard copy <eos>,8,2,is this a hard copy or a download <bos> hard c...
148402,yes/no,does it play blue ray movies,<bos> yes <eos>,6,1,does it play blue ray movies <bos> yes <eos>


In [None]:
# filname = '/processed_clean_single_qna.csv'
# saveData(baseline_df, BASELINE_ROOT + filname)

### 2.2 Preprocess model data

In [None]:
# Re-read the raw conversation CSV (it was already loaded once together with the baseline data).
model_data = loadData(MODEL_DATA)

In [None]:
model_data

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpfu...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper
...,...,...,...
188373,8628,"Wow, it does not seem like that long. Since I...",Surprised
188374,8628,"I havent seen that episode, I might google it...",Curious to dive deeper
188375,8628,I don't think I have either. That's an insane...,Curious to dive deeper
188376,8628,"I did, my little brother used to love Thomas ...",Happy


In [None]:
# Clean the message text and drop the 'sentiment' column.
model_df = processDataframe(model_data, 'MODEL')
model_df

Unnamed: 0,conversation_id,message
0,1,are you a fan of google or microsoft
1,1,both are excellent technology they are helpful...
2,1,i am not a huge fan of google but i use it a l...
3,1,google provides online related services and pr...
4,1,yeah their services are good i am just not a f...
...,...,...
188373,8628,wow it does not seem like that long since i me...
188374,8628,i havent seen that episode i might google it l...
188375,8628,i do not think i have either that is an insane...
188376,8628,i did my little brother used to love thomas th...


In [None]:
# filname = '/cleaned_topical_chat.csv'
# saveData(model_df, MODEL_ROOT + filname)

In [None]:
def vectorization(df, min_input_len=3, max_input_len=29, min_target_len=1, max_target_len=9):
  """
  Build (input, target) text pairs from consecutive messages of the same
  conversation.

  For every row, the previous row's message is the input and the row's own
  message is the target, provided both rows share the same conversation_id
  and their word counts lie within the given bounds (inclusive). Targets are
  wrapped as '<bos> ... <eos>'.

  Args:
    df: frame with 'conversation_id' and 'message' columns, in conversation order.
    min_input_len, max_input_len: allowed number of words in the input message.
    min_target_len, max_target_len: allowed number of words in the target
      message (markers not counted).

  Returns:
    (input_texts, target_texts, input_words_set, target_words_set); the two
    lists are aligned, the sets hold the distinct words of each side (the
    target set includes the two markers once at least one pair exists).
  """
  input_texts = []
  target_texts = []
  input_words_set = set()
  target_words_set = set()
  # Plain lists are much cheaper to walk than df.iloc[i], which builds a
  # Series object for every row.
  conversation_ids = df['conversation_id'].tolist()
  messages = df['message'].tolist()
  consecutive = zip(conversation_ids, messages, conversation_ids[1:], messages[1:])
  for prev_id, input_text, cur_id, target_text in tqdm(consecutive, total=max(len(messages) - 1, 0)):
    if prev_id != cur_id:
      continue
    # Skip anything that is not text (e.g. a missing value)
    if not (isinstance(input_text, str) and isinstance(target_text, str)):
      continue
    input_words = input_text.split()
    if not (min_input_len <= len(input_words) <= max_input_len):
      continue
    if not (min_target_len <= len(target_text.split()) <= max_target_len):
      continue
    # Add <bos> and <eos> indicators to the target_text
    target_text = '<bos> ' + target_text + ' <eos>'
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_words_set.update(input_words)
    target_words_set.update(target_text.split())
  return input_texts, target_texts, input_words_set, target_words_set


def saveVectorizationResult(df, root):
  """
  Run vectorization on ``df``, pickle each of its four results under
  ``root`` and return them unchanged.

  Files written (in this order): input_texts.pickle, target_texts.pickle,
  input_words_set.pickle, target_words_set.pickle.
  """
  results = vectorization(df)
  file_names = ('/input_texts.pickle', '/target_texts.pickle', '/input_words_set.pickle', '/target_words_set.pickle')
  for payload, file_name in zip(results, file_names):
    saveData(payload, root + file_name)
  input_texts, target_texts, input_words_set, target_words_set = results
  return input_texts, target_texts, input_words_set, target_words_set

In [None]:
# Same folder as MODEL_ROOT; reuse the constant instead of repeating the path literal.
root = MODEL_ROOT
input_texts, target_texts, input_words_set, target_words_set = saveVectorizationResult(model_df, root)

100%|██████████| 188378/188378 [00:55<00:00, 3406.20it/s]

Data saved successully.
Data saved successully.
Data saved successully.
Data saved successully.





In [None]:
len(input_texts), len(target_texts), len(input_words_set), len(target_words_set)

(19868, 19868, 11438, 7022)

# 3. Vectorization (Tokenization), Padding

### 3.1 Baseline dataset

In [None]:
SEED = 42
# X = (Question, QA), y = Answer.
# First hold out 0.5% of the rows as the test set, then split the rest train:validation = 8:2.
enc_train, enc_test, qa_train, qa_test, dec_train, dec_test = train_test_split(baseline_df['Question'], baseline_df['QA'], baseline_df['Answer'], test_size=0.005, random_state=SEED)
enc_train, enc_val, qa_train, qa_val, dec_train, dec_val = train_test_split(enc_train, qa_train, dec_train, test_size=0.2, random_state=SEED)
# NOTE(review): the vocabulary is fitted on all rows, including validation and test.
qa_tokenizer = get_tokenizer(baseline_df['QA'])

# Padded widths, computed once instead of once per split.
enc_maxlen = baseline_df['QuestionLength'].max()
# Answer Length + 2 for '<bos>' and '<eos>'
dec_maxlen = baseline_df['AnswerLength'].max() + 2

enc_train, enc_val, enc_test = (
  encode_pad_seq(qa_tokenizer, enc_maxlen, part) for part in (enc_train, enc_val, enc_test)
)
dec_train, dec_val, dec_test = (
  encode_pad_seq(qa_tokenizer, dec_maxlen, part) for part in (dec_train, dec_val, dec_test)
)

In [None]:
# # Save Data
# saveData(qa_tokenizer, BASELINE_ROOT + '/qa_tokenizer.pickle')

# saveData(enc_train, BASELINE_ROOT + '/enc_train.pickle')
# saveData(enc_val, BASELINE_ROOT + '/enc_val.pickle')
# saveData(enc_test, BASELINE_ROOT + '/enc_test.pickle')

# saveData(dec_train, BASELINE_ROOT + '/dec_train.pickle')
# saveData(dec_val, BASELINE_ROOT + '/dec_val.pickle')
# saveData(dec_test, BASELINE_ROOT + '/dec_test.pickle')

In [None]:
# Build the pre-trained embedding matrix for the baseline vocabulary (one row per word id).
embedding_matrix = tokenEmbed(qa_tokenizer)
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.090649  , -0.033193  ,  0.54229999, ..., -0.23192   ,
        -0.57389998,  0.25735   ],
       [ 0.027796  , -0.20241   ,  0.23388   , ..., -0.105     ,
         0.26631001,  0.60374999]])

In [None]:
embedding_matrix.shape

(31973, 300)

### 3.2 Model dataset

In [None]:
input_tokenizer, target_tokenizer = get_tokenizer(input_texts), get_tokenizer(target_texts)
# Pad to the longest text measured in words. The texts are plain strings, so
# len(x) would count characters and make the padded width far larger than any
# encoded sequence.
input_maxlen = max(len(x.split()) for x in input_texts)
target_maxlen = max(len(x.split()) for x in target_texts)
input_sequences = encode_pad_seq(input_tokenizer, input_maxlen, input_texts)
target_sequences = encode_pad_seq(target_tokenizer, target_maxlen, target_texts)

In [None]:
# # Save Data
# saveData(input_sequences, MODEL_ROOT + '/input_sequences.pickle')
# saveData(target_sequences, MODEL_ROOT + '/target_sequences.pickle')