In [1]:
from dataset import download_tfds_imdb_as_text
from collections import Counter
from gensim.models import Word2Vec, KeyedVectors
from nlp_utils import spacy_tokenizer_lower_lemma_remove_stop, preprocess_remove_html_non_ascii, spacy_tokenizer,spacy_tokenizer_remove_stop

import spacy
# Small English spaCy pipeline, loaded with no components disabled; the
# tokenization cells below call it on every review.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Pretrained GoogleNews vectors, read from the local binary word2vec file.
model_word2vec = KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [3]:
# Fetch the IMDB data as text (helper from the local `dataset` module).
X_train, X_test, y_train, y_test = download_tfds_imdb_as_text()

# Clean every training review; presumably strips HTML and non-ASCII
# characters, going by the helper's name.
X_train_preprocessed = list(map(preprocess_remove_html_non_ascii, X_train))

# One list of spaCy tokens per review, with tagger and parser disabled.
X_train_tokenized = [
    list(nlp(review, disable=["tagger", "parser"]))
    for review in X_train_preprocessed
]

# Same reviews with nothing disabled in the call; these Docs are the ones
# whose `.sents` is read elsewhere in this notebook.
X_train_tokenized_v2 = list(map(nlp, X_train_preprocessed))

number of training samples 25000
number of testing samples 25000


In [12]:
def filter_spacy(spacy_tok):
    """Tell whether a spaCy token should be kept.

    The token is rejected (``False``) as soon as one of these holds, checked
    in this order: bracket, quote, non-ASCII, currency symbol, digit,
    whitespace, e-mail-like, number-like, URL-like. Any other token is kept
    (``True``). Punctuation is not tested here, so e.g. "." passes.

    Meant to be used as a predicate over the tokens of a Doc or Span.
    """
    if spacy_tok.is_bracket or spacy_tok.is_quote:
        return False
    if not spacy_tok.is_ascii:
        return False
    if spacy_tok.is_currency or spacy_tok.is_digit or spacy_tok.is_space:
        return False
    if spacy_tok.like_email or spacy_tok.like_num or spacy_tok.like_url:
        return False
    return True


def filter_spacy2(spacy_tok):
    """Placeholder predicate: rejects every token, whatever it is."""
    return False
    

In [18]:
# Surface strings of the tokens that pass filter_spacy, one list per review.
X_train_tokenized_filtered = [
    [tok.orth_ for tok in review if filter_spacy(tok)]
    for review in X_train_tokenized
]


In [49]:
# Token frequencies over the whole filtered training set.
counter = Counter(
    tok for review in X_train_tokenized_filtered for tok in review
)
# Vocabulary = words that occur at least twice.
imdb_vocab = {word for word, freq in counter.items() if freq > 1}

In [117]:
# Merged vocabulary: IMDB words seen more than once, plus the first 300,000
# keys of the GoogleNews word2vec vocab.
# NOTE(review): the original note here read "lower + imdb", but the IMDB
# tokens come from `orth_` without lowercasing in the cells shown — confirm.

vocab_1 = imdb_vocab.union(set( list(model_word2vec.vocab)[:300000]))

In [118]:
len(vocab_1)

309387

In [119]:
"followable" in vocab_1

True

In [202]:

# Empty skip-gram (sg=1) model; no corpus is passed yet.
model = Word2Vec(
    sg=1,
    size=300,
    window=7,
    min_count=1,
    workers=4,
)
# Register every word of vocab_1 by feeding the whole vocabulary as a single
# pseudo-sentence; each word then has count 1, which min_count=1 retains.
model.build_vocab([list(vocab_1)])

In [203]:
# Overwrite the vectors of words that also exist in the GoogleNews file with
# their pretrained values. Per the gensim documentation, lockf=0 keeps those
# imported vectors fixed during later training.
model.intersect_word2vec_format(
    fname='./GoogleNews-vectors-negative300.bin',
    binary=True,
    lockf=0,
)

In [204]:
# Membership of three probe words, first in the pretrained GoogleNews vectors
# and then in the merged model; one boolean is printed per line.
print(
    *(
        word in vectors
        for vectors in (model_word2vec, model.wv)
        for word in ("riemann", "Dishum", "followable")
    ),
    sep="\n",
)

False
False
False
True
True
True


In [205]:
model.wv["cat"][:10]

array([ 0.0123291 ,  0.20410156, -0.28515625,  0.21679688,  0.11816406,
        0.08300781,  0.04980469, -0.00952148,  0.22070312, -0.12597656],
      dtype=float32)

In [206]:
model.wv["Dishum"][:10]  # looks like random init: "Dishum" is not in the GoogleNews vectors (membership check printed False)

array([-0.00020234,  0.00144157,  0.00119393, -0.0010093 ,  0.00090943,
        0.00082218, -0.00018011,  0.00114797,  0.00020639, -0.00142701],
      dtype=float32)

In [207]:
model.wv["followable"][:10]  # looks like random init: "followable" is not in the GoogleNews vectors (membership check printed False)

array([ 0.00011482, -0.00020896, -0.00050605, -0.00028058, -0.00147494,
       -0.00062779, -0.00077607, -0.00078202,  0.00148383, -0.00016087],
      dtype=float32)

In [208]:
# One training pass over the sentence-level token lists.
# NOTE: X_train_tokenized_filtered_v2 is built in a cell that appears further
# down in this notebook, so that cell has to be run before this one.
model.train(
    X_train_tokenized_filtered_v2,
    total_examples=len(X_train_tokenized_filtered_v2),
    epochs=1,
)

(6536044, 6564848)

In [209]:
# Vector after training. NOTE(review): this comment originally said
# "after train 2 epoch", but the train call shown passes epochs=1 — confirm
# how many passes were actually run.
model.wv["followable"][:10]

array([ 0.05079229,  0.03941049,  0.02097265,  0.07971535, -0.05137871,
       -0.0041139 ,  0.03097997, -0.07551193,  0.0709124 ,  0.06988747],
      dtype=float32)

In [210]:
model.wv["cat"][:10]

array([ 0.0123291 ,  0.20410156, -0.28515625,  0.21679688,  0.11816406,
        0.08300781,  0.04980469, -0.00952148,  0.22070312, -0.12597656],
      dtype=float32)

In [211]:
# Write the model (imported GoogleNews vectors plus the vectors trained above) to disk.
model.save("word2vec.model.tf.300k.1.case.v2")

In [145]:
dir(X_train_tokenized[0][0])


['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_extension',
 'has_vector',
 'head',
 'i',
 'idx',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'left_edge',
 'lefts',
 'lemma',
 'lemma_',
 'lex_id',
 'like_email',
 'li

In [166]:
# Sentences spaCy found in the first training review.
list(X_train_tokenized_v2[0].sents)

[This was an absolutely terrible movie.,
 Don't be lured in by Christopher Walken or Michael Ironside.,
 Both are great actors, but this must simply be their worst role in history.,
 Even their great acting could not redeem this movie's ridiculous storyline.,
 This movie is an early nineties US propaganda piece.,
 The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions.,
 Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning.,
 I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name.,
 I could barely sit through it.]

In [165]:
# Run every cleaned review through spaCy with nothing disabled in the call.
X_train_tokenized_v2 = list(map(nlp, X_train_preprocessed))

In [167]:
# Flatten the per-review Docs into one list of sentence spans.
sentenized_corpus = [
    sentence
    for review_doc in X_train_tokenized_v2
    for sentence in review_doc.sents
]

In [176]:
# Surface strings of the tokens that pass filter_spacy, one list per sentence.
X_train_tokenized_filtered_v2 = [
    [tok.orth_ for tok in sentence if filter_spacy(tok)]
    for sentence in sentenized_corpus
]

In [186]:
# Same sentences with every "." token removed.
X_train_tokenized_filtered_v2_no_fulstop = [
    [tok for tok in sentence if tok != "."]
    for sentence in X_train_tokenized_filtered_v2
]

In [221]:
# Train a fresh skip-gram model directly on the sentence-level token lists
# (the variant that still contains "."), then save it. This rebinds `model`.
model = Word2Vec(
    X_train_tokenized_filtered_v2,
    sg=1,
    size=240,
    window=5,
    min_count=1,
    workers=4,
    iter=10,
)
model.save("word2vec.model.240.5.10.filtered.v2")

In [216]:
X_train_tokenized_filtered_v2_no_fulstop[1]

['Do',
 "n't",
 'be',
 'lured',
 'in',
 'by',
 'Christopher',
 'Walken',
 'or',
 'Michael',
 'Ironside']

In [222]:
# Debugging aid: print the name of every thread currently alive in the kernel.
import threading
for thread in threading.enumerate():
    print(thread.name)

MainThread
Thread-2
Thread-3
IPythonHistorySavingThread
Thread-1
Thread-4
Thread-215
Thread-216
Thread-217
Thread-218
Thread-219


In [28]:
# Flat list of lower-cased surface forms for every token of every review.
X_train_new = [
    tok.orth_.lower() for review in X_train_tokenized for tok in review
]
# Fetch the dataset again, rebinding X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = download_tfds_imdb_as_text()

In [29]:
len(X_train_new)

6797828

# 5/31 (2020-05-31): start here — the cells below restart from a fresh kernel

In [1]:
from dataset import download_tfds_imdb_as_text
from collections import Counter
from gensim.models import Word2Vec, KeyedVectors
import pickle
import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])


%load_ext autoreload
%autoreload
from wvtf2 import MyWord2Vec


In [3]:
# Kept for reference (disabled): how corpus-2020-05-31.pkl was produced — a
# flat list of lower-cased tokens, skipping digits, punctuation, URLs,
# number-like and e-mail-like tokens. The pickle is loaded in the next cell.
# X_train, X_test, y_train, y_test = download_tfds_imdb_as_text()
# corpus = list()

# for s in nlp.pipe(X_train, disable=["tagger", "parser"], n_threads=4):
#     for tok in s:
#         if tok.is_digit or tok.is_punct or tok.like_url or tok.like_num or tok.like_email:
#             continue
#         corpus.append(tok.orth_.lower())

# pickle.dump(corpus, open("corpus-2020-05-31.pkl", "wb"))
    

In [2]:
# Load the pre-tokenized corpus; the context manager closes the file handle
# as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code — only load files you created.
with open("corpus-2020-05-31.pkl", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

In [3]:
corpus[:10]

['this',
 'was',
 'an',
 'absolutely',
 'terrible',
 'movie',
 'do',
 "n't",
 'be',
 'lured']

In [4]:
# Hyper-parameters handed to MyWord2Vec (imported from the local wvtf2 module).
embedding_size = 300
max_vocabulary_size = 100000
min_occurrence = 1
skip_window = 10
batch_size = 1000
epoch = 50

# Positional order is kept exactly as MyWord2Vec is called in this notebook;
# its parameter names are not visible here.
myWV = MyWord2Vec(
    corpus,
    embedding_size,
    max_vocabulary_size,
    min_occurrence,
    skip_window,
    batch_size,
    epoch,
    num_skips=2,
    num_sampled=5,
)



Words count: 5944282
Unique words: 100161
Vocabulary size: 100000
Most common words: [('UNK', 162), ('the', 328822), ('and', 162887), ('a', 161730), ('of', 145595), ('to', 135355), ('is', 110135), ('it', 93487), ('in', 92823), ('i', 82706)]


In [None]:
# Train the embeddings. `emb` is indexed by row and `idx` maps a row index to
# its word — that is how the two are combined below.
emb, idx = myWV.train()

# Build the word -> vector lookup.
wv = {}
for i, vocab in idx.items():
    wv[vocab] = emb[i, :]

# Persist the lookup; the context manager guarantees the file is flushed and
# closed once the dump completes.
with open("wv_20200531-remove-punct-digit.pkl", "wb") as wv_file:
    pickle.dump(wv, wv_file)

epoch 0 loss 126.508642442594
epoch 1 loss 52.73447093952393
epoch 2 loss 34.810731032887546
epoch 3 loss 26.856889772058206
epoch 4 loss 22.242692825300697
epoch 5 loss 19.37683467911515
epoch 6 loss 17.347419358230297
epoch 7 loss 15.849497171755404
epoch 8 loss 14.703792633947346
epoch 9 loss 13.774224072672219
epoch 10 loss 13.003493250063084
epoch 11 loss 12.3770410148036
epoch 12 loss 11.824668548658424
epoch 13 loss 11.389033455294811
epoch 14 loss 10.982524550004205
epoch 15 loss 10.625477726259568
epoch 16 loss 10.303933783329128
epoch 17 loss 10.019241131508117
