# Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from gensim.models import Word2Vec, KeyedVectors
from gensim.models.phrases import Phrases, Phraser

import logging  
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, Dropout, SimpleRNN, LSTM, GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint

import multiprocessing

In [None]:
# Number of CPU cores reported for this runtime; used as the `workers`
# argument of both Word2Vec models created further down.
cores = multiprocessing.cpu_count()

In [None]:
# Colab-only: mount Google Drive at /content/drive so that the dataset,
# the saved Word2Vec models and the pre-trained vectors referenced by the
# absolute paths in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the cleaned training set; the `processed` column holds the
# preprocessed text that the Word2Vec models are trained on.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# written out when the CSV was saved -- consider index_col=0; confirm first.
train = pd.read_csv('/content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/datasets/train_cleaned.csv')

In [None]:
# Preview the first rows to check the columns loaded as expected.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,num_labels,merged,num_words_original,processed,num_words_processed
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,1,Reconstructing Subject-Specific Effect Maps ...,269,reconstruct subject specific effect map predic...,180
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,1,Rotation Invariance Neural Network Rotation ...,80,rotation invariance neural network rotation in...,49
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,1,Spherical polyharmonics and Poisson kernels fo...,107,spherical polyharmonic poisson kernel polyharm...,62
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,1,A finite element approximation for the stochas...,119,finite element approximation stochastic maxwel...,72
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,2,Comparative study of Discrete Wavelet Transfor...,145,comparative study discrete wavelet transforms ...,95


# Training Word2Vec Models

## Continuous Bag of Words (CBOW)

In [None]:
# Tokenise every article: whitespace-split its preprocessed text so that
# `sent` ends up as one list of words per article.
sent = []
for text in train['processed']:
    sent.append(text.split())
# Number of tokenised articles.
len(sent)

20972

In [None]:
# Collect unigram/bigram co-occurrence statistics over the tokenised articles.
# min_count=1 lets even a pair seen once be scored as a candidate; the score
# threshold stays at the default (10.0 according to the log output).
phrases = Phrases(sent, min_count=1, progress_per=10000)
# generate bigrams
# Phraser wraps the fitted Phrases model for applying the learned bigrams.
bigram = Phraser(phrases)
# Corpus with detected bigrams merged into single tokens (e.g. 'machine_translation').
# NOTE(review): presumably this is a lazily transformed corpus that is
# re-evaluated on every pass (build_vocab and each training epoch) -- verify.
sentences = bigram[sent]

INFO - 07:12:46: collecting all words and their counts
INFO - 07:12:46: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 07:12:48: PROGRESS: at sentence #10000, processed 919494 words and 577204 word types
INFO - 07:12:50: PROGRESS: at sentence #20000, processed 1835016 words and 997570 word types
INFO - 07:12:50: collected 1034660 word types from a corpus of 1923943 words (unigram + bigrams) and 20972 sentences
INFO - 07:12:50: using 1034660 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 07:12:50: source_vocab length 1034660
INFO - 07:12:57: Phraser added 50000 phrasegrams
INFO - 07:13:02: Phraser built with 67312 phrasegrams


In [None]:
# Hyper-parameters of the CBOW model, gathered in one mapping.
cbow_params = {
    'window': 5,          # context window
    'size': 300,          # embedding dimensionality
    'sample': 6e-5,       # down-sampling threshold for frequent words
    'alpha': 0.03,        # initial learning rate
    'min_alpha': 0.0007,  # final learning rate
    'negative': 20,       # negative samples per positive example
    'sg': 0,              # <-- CBOW
    'workers': cores,     # one worker thread per CPU core
}

# No corpus is passed here, so the model is only configured; the vocabulary
# is built and the weights are trained in the following cells.
w2v_cbow = Word2Vec(**cbow_params)

In [None]:
# Build the CBOW vocabulary from the bigram-merged corpus, logging progress
# every 10000 sentences. min_count is not set on the model, so the default
# applies (the log reports effective_min_count=5).
w2v_cbow.build_vocab(sentences, progress_per=10000)

INFO - 07:13:02: collecting all words and their counts
INFO - 07:13:02: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 07:13:04: PROGRESS: at sentence #10000, processed 764703 words, keeping 69579 word types
INFO - 07:13:07: PROGRESS: at sentence #20000, processed 1525128 words, keeping 99671 word types
INFO - 07:13:07: collected 101873 word types from a corpus of 1598894 raw words and 20972 sentences
INFO - 07:13:07: Loading a fresh vocabulary
INFO - 07:13:07: effective_min_count=5 retains 24585 unique words (24% of original 101873, drops 77288)
INFO - 07:13:07: effective_min_count=5 leaves 1440996 word corpus (90% of original 1598894, drops 157898)
INFO - 07:13:08: deleting the raw counts dictionary of 101873 items
INFO - 07:13:08: sample=6e-05 downsamples 1086 most-common words
INFO - 07:13:08: downsampling leaves estimated 850281 word corpus (59.0% of prior 1440996)
INFO - 07:13:08: estimated required memory for 24585 words and 300 dimensions: 71296500 byt

In [None]:
# Train the CBOW model for 30 epochs over the bigram-merged corpus.
# total_examples reuses the sentence count stored on the model.
# The returned tuple appears to be (effective words trained, raw words seen)
# summed over all epochs -- TODO confirm against the gensim docs.
w2v_cbow.train(sentences, total_examples=w2v_cbow.corpus_count, epochs=30, report_delay=1)

INFO - 07:13:13: training model with 2 workers on 24585 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=5
INFO - 07:13:14: EPOCH 1 - PROGRESS: at 8.05% examples, 68606 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:15: EPOCH 1 - PROGRESS: at 16.75% examples, 69989 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:16: EPOCH 1 - PROGRESS: at 25.51% examples, 70769 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:17: EPOCH 1 - PROGRESS: at 34.26% examples, 70528 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:18: EPOCH 1 - PROGRESS: at 42.91% examples, 71198 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:19: EPOCH 1 - PROGRESS: at 51.44% examples, 71668 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:20: EPOCH 1 - PROGRESS: at 60.16% examples, 71952 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:21: EPOCH 1 - PROGRESS: at 69.00% examples, 71800 words/s, in_qsize 0, out_qsize 0
INFO - 07:13:22: EPOCH 1 - PROGRESS: at 77.65% examples, 71886 words/s, in_qsize 0, out_qsize 0
IN

(25507811, 47966820)

In [None]:
# Persist the full CBOW model to Drive. `filepath` is read again by the next
# cell, which reloads the model from the same location.
# NOTE(review): despite the .bin extension this is written with save(), so it
# must be read back with Word2Vec.load() (as done below), presumably not with
# load_word2vec_format() -- confirm.
filepath='/content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_cbow.bin'
w2v_cbow.save(filepath)

INFO - 07:19:14: saving Word2Vec object under /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_cbow.bin, separately None
INFO - 07:19:14: not storing attribute vectors_norm
INFO - 07:19:14: not storing attribute cum_table
INFO - 07:19:15: saved /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_cbow.bin


In [None]:
# Reload the CBOW model from the path it was just saved to. This depends on
# `filepath` still holding the CBOW path set in the previous cell.
w2v_cbow = Word2Vec.load(filepath)

INFO - 07:19:15: loading Word2Vec object from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_cbow.bin
INFO - 07:19:15: loading wv recursively from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_cbow.bin.wv.* with mmap=None
INFO - 07:19:15: setting ignored attribute vectors_norm to None
INFO - 07:19:15: loading vocabulary recursively from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_cbow.bin.vocabulary.* with mmap=None
INFO - 07:19:15: loading trainables recursively from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_cbow.bin.trainables.* with mmap=None
INFO - 07:19:15: setting ignored attribute cum_table to None
INFO - 07:19:15: loaded /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained mod

In [None]:
# Sanity check: show the trained CBOW embedding vector for the word 'neural'.
w2v_cbow.wv['neural']

array([ 0.9243051 ,  0.33688238,  0.07044522,  1.8102928 , -0.2515519 ,
       -0.6152202 , -0.9010733 , -0.19954833,  0.3346904 ,  0.77733386,
       -1.5004901 ,  0.41397956,  0.6738624 , -1.4820195 , -0.24897403,
        0.66830343,  1.3471195 , -0.88333553, -0.22689655,  1.6027219 ,
        1.6821423 ,  1.270086  ,  0.5901932 , -1.0434482 , -0.982463  ,
        0.2386729 , -0.34063798, -0.6455318 ,  0.111669  , -0.84072244,
        0.08247399, -0.23655242, -0.19669159, -0.5697638 ,  1.7348073 ,
       -1.3351154 , -0.00754811,  0.8906611 ,  0.6929577 ,  1.3391168 ,
        1.694624  , -0.5208038 , -0.5192902 ,  0.8866572 , -0.00449319,
       -0.44216248,  0.53584254, -0.199277  ,  0.11983273,  0.20994326,
        0.6804729 , -1.4486474 , -0.59087044,  0.04199345,  0.7143    ,
        0.57560533,  0.54569674, -0.5174372 ,  0.19062795, -0.7537732 ,
        0.2005631 , -1.6498071 , -0.11766499, -0.7643135 , -1.1143699 ,
        0.54714257,  0.8764769 ,  0.10639992,  0.6280645 ,  0.04

## Skipgram

In [None]:
# Hyper-parameters of the skip-gram model; identical to the CBOW settings
# apart from `sg`.
skip_params = {
    'window': 5,          # context window
    'size': 300,          # embedding dimensionality
    'sample': 6e-5,       # down-sampling threshold for frequent words
    'alpha': 0.03,        # initial learning rate
    'min_alpha': 0.0007,  # final learning rate
    'negative': 20,       # negative samples per positive example
    'sg': 1,              # <-- skipgram
    'workers': cores,     # one worker thread per CPU core
}

# No corpus is passed here, so the model is only configured; the vocabulary
# is built and the weights are trained in the following cells.
w2v_skip = Word2Vec(**skip_params)

In [None]:
# Build the skip-gram vocabulary from the same bigram-merged corpus used for
# CBOW, logging progress every 10000 sentences.
w2v_skip.build_vocab(sentences, progress_per=10000)

INFO - 07:19:15: collecting all words and their counts
INFO - 07:19:15: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 07:19:18: PROGRESS: at sentence #10000, processed 764703 words, keeping 69579 word types
INFO - 07:19:21: PROGRESS: at sentence #20000, processed 1525128 words, keeping 99671 word types
INFO - 07:19:21: collected 101873 word types from a corpus of 1598894 raw words and 20972 sentences
INFO - 07:19:21: Loading a fresh vocabulary
INFO - 07:19:21: effective_min_count=5 retains 24585 unique words (24% of original 101873, drops 77288)
INFO - 07:19:21: effective_min_count=5 leaves 1440996 word corpus (90% of original 1598894, drops 157898)
INFO - 07:19:21: deleting the raw counts dictionary of 101873 items
INFO - 07:19:21: sample=6e-05 downsamples 1086 most-common words
INFO - 07:19:21: downsampling leaves estimated 850281 word corpus (59.0% of prior 1440996)
INFO - 07:19:21: estimated required memory for 24585 words and 300 dimensions: 71296500 byt

In [None]:
# Train the skip-gram model for 30 epochs over the bigram-merged corpus.
# total_examples reuses the sentence count stored on the model.
# Per the log timestamps this is markedly slower than the CBOW run.
w2v_skip.train(sentences, total_examples=w2v_skip.corpus_count, epochs=30, report_delay=1)

INFO - 07:19:29: training model with 2 workers on 24585 vocabulary and 300 features, using sg=1 hs=0 sample=6e-05 negative=20 window=5
INFO - 07:19:30: EPOCH 1 - PROGRESS: at 2.48% examples, 19008 words/s, in_qsize 3, out_qsize 0
INFO - 07:19:31: EPOCH 1 - PROGRESS: at 5.62% examples, 21653 words/s, in_qsize 3, out_qsize 0
INFO - 07:19:32: EPOCH 1 - PROGRESS: at 8.66% examples, 22502 words/s, in_qsize 3, out_qsize 0
INFO - 07:19:33: EPOCH 1 - PROGRESS: at 11.83% examples, 22748 words/s, in_qsize 3, out_qsize 0
INFO - 07:19:35: EPOCH 1 - PROGRESS: at 15.51% examples, 23102 words/s, in_qsize 2, out_qsize 1
INFO - 07:19:36: EPOCH 1 - PROGRESS: at 19.22% examples, 23577 words/s, in_qsize 3, out_qsize 0
INFO - 07:19:37: EPOCH 1 - PROGRESS: at 22.36% examples, 23838 words/s, in_qsize 3, out_qsize 0
INFO - 07:19:38: EPOCH 1 - PROGRESS: at 25.51% examples, 23966 words/s, in_qsize 3, out_qsize 0
INFO - 07:19:39: EPOCH 1 - PROGRESS: at 28.56% examples, 23974 words/s, in_qsize 3, out_qsize 0
INFO

(25509597, 47966820)

In [None]:
# Persist the full skip-gram model to Drive. This overwrites `filepath`
# (previously the CBOW path); the next cell reloads from this new value.
filepath='/content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_skip.bin'
w2v_skip.save(filepath)

INFO - 07:36:15: saving Word2Vec object under /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_skip.bin, separately None
INFO - 07:36:15: not storing attribute vectors_norm
INFO - 07:36:15: not storing attribute cum_table
INFO - 07:36:16: saved /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_skip.bin


In [None]:
# Reload the skip-gram model from the path it was just saved to. This depends
# on `filepath` holding the skip-gram path set in the previous cell.
w2v_skip = Word2Vec.load(filepath)

INFO - 07:36:16: loading Word2Vec object from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_skip.bin
INFO - 07:36:17: loading wv recursively from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_skip.bin.wv.* with mmap=None
INFO - 07:36:17: setting ignored attribute vectors_norm to None
INFO - 07:36:17: loading vocabulary recursively from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_skip.bin.vocabulary.* with mmap=None
INFO - 07:36:17: loading trainables recursively from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained models/w2v_skip.bin.trainables.* with mmap=None
INFO - 07:36:17: setting ignored attribute cum_table to None
INFO - 07:36:17: loaded /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/word2vec trained mod

## Load LexVec

In [None]:
# Location of the pre-trained LexVec vectors (enwiki + newscrawl, 300d) on Drive.
lexvec_path = '/content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/pre-trained word embeddings/lexvec.enwiki+newscrawl.300d.W.pos.vectors'

# Load the vectors as a KeyedVectors object (word2vec format, default settings).
# Per the log this loads a (368999, 300) matrix and takes roughly two minutes.
lexvec = KeyedVectors.load_word2vec_format(lexvec_path)

INFO - 07:36:17: loading projection weights from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/pre-trained word embeddings/lexvec.enwiki+newscrawl.300d.W.pos.vectors
INFO - 07:38:13: loaded (368999, 300) matrix from /content/drive/MyDrive/Capstone - Multi Label Classification on Research Articles/pre-trained word embeddings/lexvec.enwiki+newscrawl.300d.W.pos.vectors



# Comparing Word Similarities

In [None]:
# First probe word: an in-domain term from the research-article corpus.
word = 'neural'
# Ten nearest neighbours of `word` in the CBOW embedding space.
# Query through `.wv` (the model's KeyedVectors): calling most_similar()
# directly on the Word2Vec model is deprecated in gensim 3.x (it emits a
# DeprecationWarning) and was removed in gensim 4.
w2v_cbow.wv.most_similar(word)

  
INFO - 07:38:13: precomputing L2-norms of word weight vectors


[('primary_visual', 0.6881042718887329),
 ('architecture_search', 0.6720964312553406),
 ('cortex', 0.6583304405212402),
 ('layer_normalization', 0.6485791206359863),
 ('machine_translation', 0.6460709571838379),
 ('semantic_parsing', 0.6372041702270508),
 ('nmt', 0.6358315944671631),
 ('multisensory', 0.6336055397987366),
 ('neuron_circuit', 0.6272149085998535),
 ('nas', 0.6232646703720093)]

In [None]:
# Ten nearest neighbours of the current probe `word` in the skip-gram space.
# Query through `.wv` (the model's KeyedVectors): calling most_similar()
# directly on the Word2Vec model is deprecated in gensim 3.x (it emits a
# DeprecationWarning) and was removed in gensim 4.
w2v_skip.wv.most_similar(word)

  """Entry point for launching an IPython kernel.
INFO - 07:38:13: precomputing L2-norms of word weight vectors


[('primary_visual', 0.5433772802352905),
 ('nas', 0.4985634982585907),
 ('multisensory', 0.4941956400871277),
 ('sensory_motor', 0.4814603328704834),
 ('dpca', 0.4742458760738373),
 ('nmt', 0.47084712982177734),
 ('machine_translation', 0.46738892793655396),
 ('sockeye', 0.46322304010391235),
 ('phrase_base', 0.4533829092979431),
 ('integrate_fire', 0.4525911211967468)]

In [None]:
# Same probe `word`, now against the pre-trained LexVec vectors. `lexvec` is
# already a KeyedVectors object, so most_similar() is called on it directly.
lexvec.most_similar(word)

INFO - 07:38:13: precomputing L2-norms of word weight vectors


[('neuronal', 0.7093234658241272),
 ('brain', 0.6016504764556885),
 ('cortical', 0.5880976319313049),
 ('sensory', 0.56494140625),
 ('synaptic', 0.5620172023773193),
 ('neurons', 0.560059666633606),
 ('auditory', 0.5547546744346619),
 ('prefrontal', 0.542811930179596),
 ('orbitofrontal', 0.537427544593811),
 ('dendritic', 0.5215905904769897)]

In [None]:
# Second probe word: a statistics term.
word = 'bayes'
# Ten nearest neighbours of `word` in the CBOW embedding space.
# Query through `.wv` (the model's KeyedVectors): calling most_similar()
# directly on the Word2Vec model is deprecated in gensim 3.x and was removed
# in gensim 4.
w2v_cbow.wv.most_similar(word)

  


[('dirichlet_distribution', 0.7402399182319641),
 ('selection_procedure', 0.7217991948127747),
 ('posterior_sampling', 0.7216918468475342),
 ('sure_convergence', 0.7105766534805298),
 ('frequentist', 0.7099422812461853),
 ('estimators', 0.7035677433013916),
 ('confidence_set', 0.7003422975540161),
 ('rejection_sample', 0.6969581246376038),
 ('likelihood_bootstrap', 0.6873930096626282),
 ('likelihood_estimation', 0.6768184900283813)]

In [None]:
# Ten nearest neighbours of the current probe `word` in the skip-gram space.
# Query through `.wv` (the model's KeyedVectors): calling most_similar()
# directly on the Word2Vec model is deprecated in gensim 3.x (it emits a
# DeprecationWarning) and was removed in gensim 4.
w2v_skip.wv.most_similar(word)

  """Entry point for launching an IPython kernel.


[('gp_sum', 0.47594118118286133),
 ('sure_convergence', 0.4364965558052063),
 ('hellinger_distance', 0.42843273282051086),
 ('preliminary_test', 0.4276887774467468),
 ('entropy_sgd', 0.4150734841823578),
 ('likelihood_bootstrap', 0.4140958786010742),
 ('gibbs_posterior', 0.4108899235725403),
 ('asymptotically_efficient', 0.40491783618927),
 ('hypernetwork', 0.38725072145462036),
 ('bayes_estimator', 0.387126624584198)]

In [None]:
# Same probe `word`, now against the pre-trained LexVec vectors. `lexvec` is
# already a KeyedVectors object, so most_similar() is called on it directly.
lexvec.most_similar(word)

[('bayesian', 0.5732742547988892),
 ('regression', 0.5125405788421631),
 ('inference', 0.48300325870513916),
 ('probabilistic', 0.4808858036994934),
 ('probability', 0.4732035994529724),
 ('frequentist', 0.45971786975860596),
 ('dirichlet', 0.45899203419685364),
 ('propositional', 0.4536210298538208),
 ('multinomial', 0.44903528690338135),
 ('classifier', 0.4484688639640808)]

In [None]:
# Third probe word: an everyday term with little connection to research
# articles, used to see how the models behave on out-of-domain vocabulary.
word = 'cat'
# Ten nearest neighbours of `word` in the CBOW embedding space.
# Query through `.wv` (the model's KeyedVectors): calling most_similar()
# directly on the Word2Vec model is deprecated in gensim 3.x and was removed
# in gensim 4.
w2v_cbow.wv.most_similar(word)

  


[('dog', 0.6721174120903015),
 ('traffic_sign', 0.6483354568481445),
 ('robot_assist', 0.6048993468284607),
 ('large_annotate', 0.602262020111084),
 ('handwritten_character', 0.5857182741165161),
 ('reading_comprehension', 0.5851730108261108),
 ('performance_degrade', 0.584370493888855),
 ('potent', 0.5836292505264282),
 ('unlabele_datum', 0.5818756818771362),
 ('skilled', 0.5802294015884399)]

In [None]:
# Ten nearest neighbours of the current probe `word` in the skip-gram space.
# Query through `.wv` (the model's KeyedVectors): calling most_similar()
# directly on the Word2Vec model is deprecated in gensim 3.x (it emits a
# DeprecationWarning) and was removed in gensim 4.
w2v_skip.wv.most_similar(word)

  """Entry point for launching an IPython kernel.


[('defence', 0.5010182857513428),
 ('debugger', 0.4963228702545166),
 ('curvature_bound', 0.4953463673591614),
 ('dog', 0.48540619015693665),
 ('fully_autonomous', 0.45742371678352356),
 ('lesion_detection', 0.448682963848114),
 ('tumor_segmentation', 0.44609957933425903),
 ('thwart', 0.4378761053085327),
 ('robot_assist', 0.43498682975769043),
 ('core_decompression', 0.43464064598083496)]

In [None]:
# Same probe `word`, now against the pre-trained LexVec vectors. `lexvec` is
# already a KeyedVectors object, so most_similar() is called on it directly.
lexvec.most_similar(word)

[('dog', 0.6504852771759033),
 ('cats', 0.6381483674049377),
 ('feline', 0.5630845427513123),
 ('puppy', 0.5600976943969727),
 ('kitten', 0.5556704998016357),
 ('pet', 0.5491912364959717),
 ('kittens', 0.539549708366394),
 ('dogs', 0.5145684480667114),
 ('puppies', 0.5118380784988403),
 ('hamster', 0.4843354821205139)]

Observations:

As expected, the two word2vec models trained on this corpus produce similarities of sufficient quality for in-domain words, but they fail when fed a word that is not closely related to scientific research (e.g. 'cat').

The pretrained LexVec model, on the other hand, performs better on such words. However, later on in notebook 2a), 18554 words are singled out as skipped during the embedding process. No matter how large its training corpus is, a pretrained model cannot possibly cover all words.