# Capstone: Philosophical Factors for NLP
**_Measuring Similarity to Philosophical Concepts in Text Data_**

## Thomas W. Ludlow, Jr.
**General Assembly Data Science Immersive DSI-NY-6**

**February 12, 2019**

# Notebook 3 - Document Vectors

### Table of Contents

[**3.1 Gensim Doc2Vec**](#3.1-Gensim-Doc2Vec)
- [3.1.1 Create Tagged Documents](#3.1.1-Create-Tagged-Documents)
- [3.1.2 Build Doc2Vec Models](#3.1.2-Build-Doc2Vec-Models)

[**3.2 Doc2Vec Features for Corpora**](#3.2-Doc2Vec-Features-for-Corpora)

**Libraries**

In [2]:
# Python Data Science
import re
import ast
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# Natural Language Processing
import spacy
from nltk.stem import PorterStemmer

# Gensim
import gensim
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, ldamulticore, CoherenceModel
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pyLDAvis.gensim

# Modeling Prep
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Override deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## 3.1 Gensim Doc2Vec

**Load Preprocessed Text Data**

In [3]:
nlp_df = pd.read_csv('../ga_dsi_capstone_ec2only/data_eda/nlp_df.csv')
t_nlp_df = pd.read_csv('../ga_dsi_capstone_ec2only/data_eda/t_nlp_df.csv')

In [4]:
text8_corpus = api.load('text8')

In [6]:
text8_data = [d for d in text8_corpus]

In [13]:
wiki_corpus = api.load('wiki-english-20171001')

In [14]:
wiki_data = [d for d in tqdm(wiki_corpus)]

### 3.1.1 Create Tagged Documents

In [7]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

In [15]:
wiki_tag = list(create_tagged_document(wiki_data))

In [16]:
len(wiki_tag)

4924894

**Sentence Vectors**

In [17]:
d2v_s = gensim.models.doc2vec.Doc2Vec(vector_size=32, min_count=2, epochs=5)

In [18]:
d2v_s.build_vocab(wiki_tag)

**Paragraph Vectors**

In [23]:
d2v_p = gensim.models.doc2vec.Doc2Vec(vector_size=16, min_count=2, epochs=5)

In [24]:
d2v_p.build_vocab(wiki_tag)

### 3.1.2 Build Doc2Vec Models

**Sentence Model**

In [19]:
d2v_s.train(wiki_tag, total_examples=d2v_s.corpus_count, epochs=d2v_s.epochs)

In [20]:
d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in nlp_df.sent_lemma]:
    d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [21]:
len(d2v_s_vecs)

70922

In [22]:
len(d2v_s_vecs[0])

32

In [26]:
d2v_s_vecs[0]

array([ 0.00124884,  0.00621537,  0.00584457,  0.00590125, -0.00437186,
        0.01183683,  0.01327725, -0.0145961 ,  0.00701683,  0.00531668,
       -0.00293273,  0.00560657, -0.01105577, -0.00042761, -0.0018066 ,
       -0.00269347,  0.00098871,  0.00815305,  0.01341378,  0.01553244,
       -0.00085134,  0.00450879,  0.01245038,  0.01457626,  0.00062826,
       -0.00885144,  0.0100125 ,  0.00058006, -0.0074619 ,  0.00418023,
        0.01027068,  0.01424398], dtype=float32)

Testing Text

In [42]:
t_d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in t_nlp_df.sent_lemma]:
    t_d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [43]:
len(t_d2v_s_vecs)

8395

In [44]:
len(t_d2v_s_vecs[0])

32

**Paragraph Model**

In [25]:
d2v_p.train(wiki_tag, total_examples=d2v_p.corpus_count, epochs=d2v_p.epochs)

In [27]:
d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in nlp_df.par_lemma]:
    d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [28]:
len(d2v_p_vecs)

70922

In [29]:
len(d2v_p_vecs[0])

16

Testing Text

In [39]:
t_d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in t_nlp_df.par_lemma]:
    t_d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [40]:
len(t_d2v_p_vecs)

8395

In [41]:
len(t_d2v_p_vecs[0])

16

## 3.2 Doc2Vec Features for Corpora

In [30]:
# Feature names
s_vec_cols = ['s_vec_'+str(i) for i in range(len(d2v_s_vecs[0]))]
p_vec_cols = ['p_vec_'+str(j) for j in range(len(d2v_p_vecs[0]))]

In [33]:
vec_df = pd.DataFrame(d2v_s_vecs, columns=s_vec_cols)
vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,0.001249,0.006215,0.005845,0.005901,-0.004372,0.011837,0.013277,-0.014596,0.007017,0.005317,...,0.01245,0.014576,0.000628,-0.008851,0.010013,0.00058,-0.007462,0.00418,0.010271,0.014244
1,0.003242,-0.000672,0.01326,0.00794,0.002701,-0.003546,-0.000863,-0.013519,0.000382,-0.002663,...,0.000376,-0.013639,0.013421,0.001214,0.00062,0.00725,3.4e-05,0.005984,0.010108,-0.010874
2,-0.013009,0.013532,0.004053,0.011666,0.013006,-0.001418,-0.00827,0.013281,-0.015108,0.013458,...,-0.000713,-0.014569,0.011763,0.012487,-0.007188,-0.008616,0.004941,0.011405,-0.001922,0.006328
3,-0.010679,-0.00256,-0.004422,0.012446,0.008916,0.005347,0.011207,0.010528,0.010795,0.012482,...,0.005158,-0.012275,-0.015438,-0.005364,0.007497,0.015009,0.005901,-0.002879,0.006861,0.008418
4,0.006584,0.011894,-0.002948,-0.002465,-0.007765,0.002303,0.014071,0.005065,0.006366,-0.013893,...,-0.012635,-0.011083,0.01131,-0.008306,-0.010435,0.001199,0.000483,-0.005632,-0.001226,-0.014103


In [34]:
p_vec_df = pd.DataFrame(d2v_p_vecs, columns=p_vec_cols)
p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15
0,-0.00952,0.004242,-0.010557,-0.001216,0.031085,-0.00083,0.007159,-0.020161,-0.016229,0.008747,-0.028436,-0.023167,-0.028918,0.000462,0.014013,0.022496
1,-0.00952,0.004242,-0.010557,-0.001216,0.031085,-0.00083,0.007159,-0.020161,-0.016229,0.008747,-0.028436,-0.023167,-0.028918,0.000462,0.014013,0.022496
2,-0.00952,0.004242,-0.010557,-0.001216,0.031085,-0.00083,0.007159,-0.020161,-0.016229,0.008747,-0.028436,-0.023167,-0.028918,0.000462,0.014013,0.022496
3,0.020753,-0.009114,0.014544,-0.005492,0.009235,-0.004078,-0.029228,0.004044,0.018803,0.002261,-0.010738,0.019206,0.022171,-0.008174,-0.021034,0.014731
4,0.020753,-0.009114,0.014544,-0.005492,0.009235,-0.004078,-0.029228,0.004044,0.018803,0.002261,-0.010738,0.019206,0.022171,-0.008174,-0.021034,0.014731


In [35]:
for col_name in p_vec_cols:
    vec_df[col_name] = p_vec_df[col_name]

In [36]:
vec_df.shape

(70922, 48)

Testing Texts

In [45]:
t_vec_df = pd.DataFrame(t_d2v_s_vecs, columns=s_vec_cols)
t_vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,0.004359,-0.000571,-0.004839,0.007535,-0.003274,0.000763,-0.013759,0.010369,0.013478,-0.014073,...,0.005352,0.013327,0.010591,-0.002382,0.003813,-0.008778,0.006432,0.001156,0.008181,0.001244
1,-0.009332,-0.002228,-0.011491,0.003014,-0.00476,0.012227,0.00854,0.007374,0.007985,0.010571,...,-0.012453,0.000977,-0.00213,-0.002443,-0.002815,0.011934,0.01362,-0.008817,0.007094,0.006024
2,0.003458,-0.011696,0.009825,-0.014435,0.004025,0.00182,-0.012778,0.001241,0.000223,-0.013823,...,0.003278,-0.004106,0.009378,-0.000505,-0.008823,-0.014553,-0.001184,0.014174,-0.001671,-0.002993
3,-0.009786,0.011872,0.010536,-0.005208,-0.01415,-0.008749,-0.011454,0.007486,-0.001776,0.011128,...,0.005409,-0.006618,-0.007612,-0.012557,0.014054,0.009215,-0.005416,0.009297,0.004702,0.005317
4,-0.015545,-0.013406,0.003756,0.014538,0.002095,-0.006507,-0.000138,-0.005231,0.001181,-0.012158,...,-0.011356,-0.013038,-0.013039,-0.002307,-0.001795,-0.01479,-0.006229,-0.013867,-0.003037,-0.005436


In [46]:
t_p_vec_df = pd.DataFrame(t_d2v_p_vecs, columns=p_vec_cols)
t_p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15
0,0.008718,-0.001142,-0.009677,0.01507,-0.006549,0.001527,-0.027518,0.020738,0.026956,-0.028146,0.030955,-0.015057,0.010595,0.016444,-0.029222,0.006347
1,0.014704,0.020234,-0.008556,-0.01937,0.012501,-0.000461,-0.020978,-0.000837,0.003654,0.031146,-0.008485,-0.00537,0.030877,-0.021883,0.024861,-0.021846
2,0.014704,0.020234,-0.008556,-0.01937,0.012501,-0.000461,-0.020978,-0.000837,0.003654,0.031146,-0.008485,-0.00537,0.030877,-0.021883,0.024861,-0.021846
3,-0.019571,0.023745,0.021073,-0.010417,-0.0283,-0.017498,-0.022909,0.014972,-0.003553,0.022256,-0.015488,0.023517,0.016039,-0.013329,0.00869,0.009893
4,-0.031089,-0.026812,0.007512,0.029076,0.004189,-0.013014,-0.000277,-0.010462,0.002362,-0.024317,0.009738,-0.029609,-0.028948,-0.006355,-0.01441,-0.026552


In [47]:
for col_name in p_vec_cols:
    t_vec_df[col_name] = t_p_vec_df[col_name]

In [49]:
t_vec_df.shape

(8395, 48)

**Save Vector DataFrames to Disk**

In [50]:
vec_df.to_csv('../ga_dsi_capstone_ec2only/data_vec/vec_df.csv', index=False)

In [51]:
t_vec_df.to_csv('../ga_dsi_capstone_ec2only/data_vec/t_vec_df.csv', index=False)

## Continue to Notebook 4: Models