In [1]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import re
import pickle
import nltk
import datasets
import sys
import json
import pprint
import xml.etree.ElementTree as ET
import datetime
from gzip import GzipFile
import warnings
warnings.filterwarnings('ignore')

nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import pairwise_distances_argmin

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wlyip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the Cornell corpus from data/cornell via the imported `datasets` module.
# NOTE(review): max_len=50 presumably caps utterance length — confirm in datasets.readCornellData.
cornell_df = datasets.readCornellData('data/cornell', max_len=50)

100%|█████████████████████████████████████████████████████████████████████████| 83097/83097 [00:03<00:00, 21479.54it/s]


In [3]:
# Load the OpenSubtitles corpus from data/opensubs via the imported `datasets` module.
# NOTE(review): max_len=50 presumably caps utterance length — confirm in datasets.readOpensubsData.
opensubs_df = datasets.readOpensubsData('data/opensubs', max_len=50)

Loading OpenSubtitles conversations in data/opensubs.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Action\2003\602_152466_207871_batoru_rowaiaru_ii_rekuiemu.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Action\2004\59_84873_113518_appurushdo.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Comedy\2003\529_124078_171007_how_to_lose_a_guy_in_10_days.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Comedy\2004\2480_226704_299940_little_black_book.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Drama\2000\179_88528_119102_batoru_rowaiaru.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Drama\2002\3265_149497_204017_unfaithful.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Drama\2003\1723_68784_89159_big_fish.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Drama\2004\146_206647_272090_eternal_sunshine_of_the_spotless_mind.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Family\2001\3935_19508_22105_cats__dogs.xml with errors.


                                                                                                                       

Skipping file data/opensubs\OpenSubtitles\xml\en\Horror\1922\1166_134135_184270_nosferatu_eine_symphonie_des_grauens.xml with errors.


OpenSubtitles data files: 100%|████████████████████████████████████████████████████| 4634/4634 [02:41<00:00, 28.67it/s]
100%|█████████████████████████████████████████████████████████████████████| 1648080/1648080 [00:24<00:00, 68278.90it/s]


In [4]:
# Combine both corpora with `+`.
# NOTE(review): later cells index every element as sent[0] / sent[1], so these look like
# sequences of (question, answer) pairs despite the *_df names — confirm.
dataset = cornell_df + opensubs_df

np.shape(dataset)

(1217316, 2)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(dataset, vectorizer_path):
    """Fit a TF-IDF vectorizer on ``dataset`` and persist it with pickle.

    Args:
        dataset: Iterable of raw text documents passed to ``fit_transform``.
        vectorizer_path: File path where the fitted vectorizer is pickled.

    Returns:
        tuple: ``(features, vocabulary)`` where ``features`` is the result of
        ``fit_transform`` and ``vocabulary`` is the fitted vectorizer's
        ``vocabulary_`` mapping.
    """
    # Raw string: a backslash-S inside a normal literal is an invalid escape sequence.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                                       token_pattern=r'(\S+)')

    features = tfidf_vectorizer.fit_transform(dataset)

    # Context manager guarantees the pickle file is flushed and closed.
    with open(vectorizer_path, 'wb') as vectorizer_file:
        pickle.dump(tfidf_vectorizer, vectorizer_file)

    return features, tfidf_vectorizer.vocabulary_

In [6]:
# Each document is a question and its answer joined by a single space.
all_sentences = [question + ' ' + answer for question, answer in dataset]
all_sentences_tfidf, tfidf_vocab = tfidf_features(all_sentences, 'tfidf_vectorizer.pkl')

In [7]:
# Number of entries in the fitted TF-IDF vocabulary.
len(tfidf_vocab)

281474

In [8]:
def _keep_vocab_words(text, vocab):
    """Return ``text`` with every space-separated word missing from ``vocab`` removed."""
    return ' '.join(word for word in text.split(' ') if word in vocab)


# Reduce both sides of every pair to the words present in the TF-IDF vocabulary.
# A side may become an empty string when none of its words are in the vocabulary.
question_list = [_keep_vocab_words(sent[0], tfidf_vocab) for sent in dataset]
answer_list = [_keep_vocab_words(sent[1], tfidf_vocab) for sent in dataset]

In [9]:
# Positions (shared by question_list / answer_list) where both filtered sides are non-empty.
question_answer_id = [
    pair_index
    for pair_index, (kept_question, kept_answer) in enumerate(zip(question_list, answer_list))
    if kept_question and kept_answer
]

# One "question<TAB>answer" string per kept pair, in the same order as question_answer_id.
prepared_data = [
    question_list[pair_index] + '\t' + answer_list[pair_index]
    for pair_index in question_answer_id
]

In [14]:
# Number of pairs whose filtered question and answer are both non-empty.
len(question_answer_id)

1212689

In [67]:
# Write one prepared pair per line; the context manager closes the file even if a
# write fails part-way through.
with open('data/prepared_data_tfidf.tsv', 'w') as out:
    for line in prepared_data:
        print(line, file=out)

In [46]:
# Train StarSpace on data/prepared_data_tfidf.tsv (one tab-separated question/answer pair per line).
# NOTE(review): -trainMode 3 with -fileFormat labelDoc presumably learns sentence-pair similarity — confirm against the StarSpace README.
# NOTE(review): a later cell reads 'starspace_embedding.tsv', presumably produced from the -model prefix — confirm.
!starspace train -trainFile 'data/prepared_data_tfidf.tsv' -model starspace_embedding \
-trainMode 3 \
-adagrad true \
-ngrams 1 \
-epoch 5 \
-dim 100 \
-similarity "cosine" \
-minCount 2 \
-verbose true \
-fileFormat labelDoc \
-negSearchLimit 10 \
-lr 0.05 \
-thread 4

Arguments: 
lr: 0.05
dim: 100
epoch: 5
maxTrainTime: 8640000
saveEveryEpoch: 0
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 10
thread: 4
minCount: 2
minCountLabel: 1
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 3
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/prepared_data_tfidf.tsv
Read 14M words
Number of words in dictionary:  39554
Number of labels in dictionary: 0
Loading data from file : data/prepared_data_tfidf.tsv
Total number of examples loaded : 1212689
Initialized model weights. Model size :
matrix : 39554 100
Training epoch 0: 0.05 0.01
Epoch: 100.0%  lr: 0.039999  loss: 0.053534  eta: 0h23m  tot: 0h5m48s  (20.0%)  lr: 0.049926  loss: 0.120039  eta: 0h30m  tot: 0h0m2s  (0.1%)0h0m15s  (0.9%)4.6%  lr: 0.049414  loss: 0.087083  eta: 0h29m  tot: 0h0m16s  (0.9%)5.3%  lr: 0.049332  loss: 0.085105  eta: 0h28m  tot: 0h0m18s  (1.1%)%  lr: 0.04

Epoch: 100.0%  lr: 0.019999  loss: 0.024594  eta: 0h11m  tot: 0h17m15s  (60.0%)7%  lr: 0.029175  loss: 0.023886  eta: 0h16m  tot: 0h12m4s  (41.5%)8.0%  lr: 0.029142  loss: 0.023883  eta: 0h16m  tot: 0h12m5s  (41.6%)8.6%  lr: 0.029068  loss: 0.023977  eta: 0h16m  tot: 0h12m8s  (41.7%)10.1%  lr: 0.028985  loss: 0.024096  eta: 0h16m  tot: 0h12m13s  (42.0%)10.4%  lr: 0.028969  loss: 0.024095  eta: 0h16m  tot: 0h12m15s  (42.1%)11.1%  lr: 0.028919  loss: 0.024111  eta: 0h16m  tot: 0h12m17s  (42.2%)12.8%  lr: 0.028688  loss: 0.024223  eta: 0h16m  tot: 0h12m24s  (42.6%)12.9%  lr: 0.028663  loss: 0.024241  eta: 0h16m  tot: 0h12m24s  (42.6%)13.4%  lr: 0.028655  loss: 0.024215  eta: 0h16m  tot: 0h12m26s  (42.7%)h12m27s  (42.7%)m29s  (42.8%)16.0%  lr: 0.028416  loss: 0.024184  eta: 0h16m  tot: 0h12m35s  (43.2%)18.6%  lr: 0.028176  loss: 0.024155  eta: 0h16m  tot: 0h12m43s  (43.7%)18.9%  lr: 0.028143  loss: 0.024136  eta: 0h16m  tot: 0h12m44s  (43.8%)19.0%  lr: 0.028135  loss: 0.024122  eta: 0h16m 

Epoch: 100.0%  lr: -0.000000  loss: 0.019568  eta: <1min   tot: 0h27m59s  (100.0%)lr: 0.009818  loss: 0.019522  eta: 0h4m  tot: 0h22m41s  (80.3%)6.4%  lr: 0.009282  loss: 0.019562  eta: 0h4m  tot: 0h22m56s  (81.3%)9.4%  lr: 0.009035  loss: 0.019569  eta: 0h4m  tot: 0h23m6s  (81.9%)11.4%  lr: 0.008820  loss: 0.019455  eta: 0h4m  tot: 0h23m13s  (82.3%)12.1%  lr: 0.008754  loss: 0.019484  eta: 0h4m  tot: 0h23m16s  (82.4%)12.9%  lr: 0.008696  loss: 0.019463  eta: 0h4m  tot: 0h23m19s  (82.6%)15.2%  lr: 0.008515  loss: 0.019465  eta: 0h4m  tot: 0h23m27s  (83.0%)15.7%  lr: 0.008449  loss: 0.019464  eta: 0h4m  tot: 0h23m28s  (83.1%)17.8%  lr: 0.008193  loss: 0.019536  eta: 0h4m  tot: 0h23m34s  (83.6%)18.2%  lr: 0.008144  loss: 0.019529  eta: 0h4m  tot: 0h23m35s  (83.6%) (83.7%)18.7%  lr: 0.008086  loss: 0.019542  eta: 0h4m  tot: 0h23m37s  (83.7%)19.0%  lr: 0.008028  loss: 0.019516  eta: 0h4m  tot: 0h23m38s  (83.8%)19.1%  lr: 0.008028  loss: 0.019526  eta: 0h4m  tot: 0h23m38s  (83.8%)23.4%  lr:

In [61]:
# Unfiltered answer text (second element) of every pair in dataset.
answer_df = [row[1] for row in dataset]

In [64]:
# Keep only the answers whose pair index is listed in question_answer_id.
answer_pair = [answer_df[pair_index] for pair_index in question_answer_id]

In [69]:
answer_path = 'data/answer_list.txt'

# Write one answer per line; the context manager closes the file even if a write
# fails part-way through.
with open(answer_path, 'w') as out:
    for line in answer_pair:
        print(line, file=out)

In [68]:
# One answer per index in question_answer_id, so the two lengths should match.
len(answer_pair)

1212689

In [16]:
# Unfiltered question text (first element) of every pair in dataset.
question_df = [row[0] for row in dataset]

In [17]:
# Keep only the questions whose pair index is listed in question_answer_id.
question_pair = [question_df[pair_index] for pair_index in question_answer_id]

In [18]:
# One question per index in question_answer_id, so the two lengths should match.
len(question_pair)

1212689

In [19]:
def load_embeddings(embeddings_path):
    """Load word embeddings from a tab-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components, all separated by tabs.

    Args:
        embeddings_path: Path to the embeddings file.

    Returns:
        tuple: ``(embeddings, dim)`` where ``embeddings`` maps each word to a
        ``float32`` NumPy vector and ``dim`` is the length of the last vector
        read (``0`` when the file holds no vectors).
    """
    embeddings = {}
    dim = 0  # stays 0 for an empty file instead of failing on an unbound name

    # Opened with the platform default encoding, exactly as before.
    with open(embeddings_path) as embeddings_file:
        for line in embeddings_file:
            # A blank (e.g. trailing) line would otherwise add a bogus key and reset dim to 0.
            if not line.strip():
                continue
            word, *arr = line.rstrip('\n').split('\t')
            embeddings[word] = np.asarray(arr, dtype='float32')
            dim = len(arr)

    return embeddings, dim

In [20]:
def question_to_vec(question, embeddings, dim):
    """Average the embeddings of the known words in ``question``.

    Args:
        question: Text that is split on whitespace into words.
        embeddings: Mapping from word to its embedding vector.
        dim: Length of the zero vector returned when no word is known.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``embeddings``, or ``np.zeros(dim)`` when none are found.
    """
    tokens = question.split()
    vectors = [embeddings[token] for token in tokens if token in embeddings]

    if vectors:
        return np.array(vectors).mean(axis=0)

    # No word of the question has an embedding.
    return np.zeros(dim)

In [21]:
# Read the word -> vector mapping and the vector length from starspace_embedding.tsv.
starspace_embeddings, embeddings_dim = load_embeddings('starspace_embedding.tsv')

In [22]:
# One float32 row per kept question; a question with no known words gets an all-zero row.
question_matrix = np.zeros((len(question_pair), embeddings_dim), dtype=np.float32)

for row_index, question_text in enumerate(question_pair):
    question_matrix[row_index] = question_to_vec(question_text, starspace_embeddings, embeddings_dim)

In [25]:
# Split the question matrix into consecutive row chunks and pickle each chunk to its own file.
matrix_size = 100000  # maximum number of rows per chunk
matrix_list = [
    question_matrix[start:start + matrix_size]
    for start in range(0, len(question_matrix), matrix_size)
]

# Files are numbered from 1: question_matrix_1.pkl, question_matrix_2.pkl, ...
for chunk_number, chunk in enumerate(matrix_list, start=1):
    file_name = 'question_matrix_{}.pkl'.format(chunk_number)
    # Context manager closes each file instead of leaking one handle per chunk.
    with open(file_name, 'wb') as chunk_file:
        pickle.dump(chunk, chunk_file)