In [1]:
import numpy as np
import pandas as pd
import json

import os
import pickle

import nltk
import gensim
from gensim import corpora, models, similarities

In [30]:
# Setup nltk corpora path and Google Word2Vec location.
# expanduser('~') resolves the home directory even when $HOME is unset
# (e.g. on Windows), where os.environ['HOME'] would raise KeyError.
nltk_path = os.path.join(os.path.expanduser('~'), 'nltk_data')

# Override the vector file location with the GOOGLE_W2V_PATH environment
# variable; the default is the original hard-coded local path.
google_vec_file = os.environ.get(
    'GOOGLE_W2V_PATH',
    '/Users/warren/Data_Science/Metis/github/project-kojak/GoogleNews-vectors-negative300.bin.gz',
)

# Put this directory at the front of nltk's data search path.
nltk.data.path.insert(0, nltk_path)

### Load the Kramer DataFrame

In [2]:
# Load the pickled DataFrame of prompt/response pairs.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('kramer_responses.pkl', mode='rb') as pickle_in:
    df_kramer_01 = pickle.load(pickle_in)

### Further Processing

In [None]:
# Steps below: tokenize the text, encode tokens as word2vec vectors, pad to a fixed length

In [8]:
# Preview the first rows of the loaded data (head() defaults to five rows)
df_kramer_01.head()

Unnamed: 0,ep_num,scene_num,line_num,char_prompt,prompt,char_response,response
137,1,6,30,GEORGE,did you need something .,KRAMER,do you handle any of that commercial .
437,2,9,6,JERRY,"you cant look in there , were playing !",KRAMER,hi .
440,2,9,9,MORTY,kramer !,KRAMER,hey morty !
446,2,9,15,JERRY,"dad , shes cheating !",KRAMER,quo ?
470,2,9,39,HELEN,32 .,KRAMER,"no , you dont have to challenge that ."


In [6]:
# (rows, columns) of the loaded DataFrame
df_kramer_01.shape

(6239, 7)

In [20]:
# Flag rows whose response tokenizes to at most 27 tokens, then show how many
# rows (and columns) survive that filter.
response_token_counts = df_kramer_01['response'].map(
    lambda text: len(nltk.word_tokenize(text))
)
mask = response_token_counts <= 27
df_kramer_01.loc[mask].shape

(6208, 7)

In [22]:
# Keep only the rows selected by the response-length mask.
df_kramer_02 = df_kramer_01.loc[mask]

In [23]:
%%time

# Split every prompt and every response into a list of tokens with nltk.
prompts_tokenized = list(map(nltk.word_tokenize, df_kramer_02['prompt']))
responses_tokenized = list(map(nltk.word_tokenize, df_kramer_02['response']))

CPU times: user 1.81 s, sys: 48.3 ms, total: 1.85 s
Wall time: 2.4 s


In [24]:
# Longest tokenized prompt and longest tokenized response, in tokens.
print(max(map(len, prompts_tokenized)))
print(max(map(len, responses_tokenized)))

27
27


In [32]:
%%time

# Load the pretrained vectors from the binary word2vec file (this takes minutes).
word2vec = models.KeyedVectors.load_word2vec_format(fname=google_vec_file, binary=True)

CPU times: user 3min 32s, sys: 29 s, total: 4min 1s
Wall time: 6min 5s


In [50]:
%%time
# Encode the data using word2vec.
# The sentence-end marker must have the same length as a word vector (300 for
# this model), so read the size from the model instead of hard-coding it.
sentend = np.ones((word2vec.vector_size,), dtype=np.float32)


def encode_tokens(tokenized_lines, vectors):
    """Map each tokenized line to a list of word vectors.

    Tokens that ``vectors`` does not contain are dropped, so an encoded line
    can be shorter than its token list (possibly empty).

    Membership is tested with ``token in vectors`` rather than
    ``token in vectors.vocab``: the ``vocab`` attribute was removed in
    gensim 4, while ``in`` works on both old and new KeyedVectors.
    """
    return [
        [vectors[token] for token in line if token in vectors]
        for line in tokenized_lines
    ]


prompts_encoded = encode_tokens(prompts_tokenized, word2vec)
responses_encoded = encode_tokens(responses_tokenized, word2vec)

CPU times: user 191 ms, sys: 174 ms, total: 365 ms
Wall time: 704 ms


In [51]:
# First five components of the first word vector in the first encoded prompt
prompts_encoded[0][0][:5]

array([ 0.20019531,  0.15429688,  0.10302734,  0.00866699,  0.00118256], dtype=float32)

In [52]:
# Number of word vectors in the first encoded prompt
len(prompts_encoded[0])

4

In [44]:
# Scratch check of the truncate-then-terminate step on the first prompt.
# Slice into a new list: `test = prompts_encoded[0]` would alias the real data,
# so the truncation and append would silently modify prompts_encoded[0].
test = prompts_encoded[0][:14]
test.append(sentend)

In [56]:
# Longest encoded prompt and longest encoded response, in word vectors.
print(max(len(vectors) for vectors in prompts_encoded))
print(max(len(vectors) for vectors in responses_encoded))

23
23


In [57]:
%%time

# Padding / adding sentence end tokens
MAX_VECTORS = 23  # longest encoded sentence reported by the length check above


def pad_with_sentend(encoded_lines, max_vectors, end_vector):
    """Truncate and pad every encoded line, in place, to ``max_vectors + 1`` entries.

    Each line keeps at most its first ``max_vectors`` word vectors and is then
    filled with ``end_vector`` up to ``max_vectors + 1`` entries, so every line
    ends with at least one sentence-end marker.

    Parameters
    ----------
    encoded_lines : list of list
        One list of word vectors per sentence; the inner lists are modified.
    max_vectors : int
        Maximum number of word vectors kept per sentence.
    end_vector : object
        Marker appended as sentence end / padding (the same object is reused).

    Returns
    -------
    None
        The lists are modified in place; nothing is returned so the notebook
        does not echo the data as cell output.
    """
    padded_len = max_vectors + 1
    for line in encoded_lines:
        del line[max_vectors:]
        line.extend([end_vector] * (padded_len - len(line)))


pad_with_sentend(prompts_encoded, MAX_VECTORS, sentend)
pad_with_sentend(responses_encoded, MAX_VECTORS, sentend)

CPU times: user 76.4 ms, sys: 4.86 ms, total: 81.3 ms
Wall time: 87 ms


### Save pickle files

In [58]:
# Persist the padded, word2vec-encoded prompts to disk.
with open('kramer_source_list_w2v.pickle', mode='wb') as prompts_out:
    pickle.dump(prompts_encoded, prompts_out)

In [60]:
# Persist the padded, word2vec-encoded responses to disk.
with open('kramer_response_list_w2v.pickle', mode='wb') as responses_out:
    pickle.dump(responses_encoded, responses_out)