# Capstone: Philosophical Factors for NLP
**_Measuring Similarity to Philosophical Concepts in Text Data_**

## Thomas W. Ludlow, Jr.
**General Assembly Data Science Immersive DSI-NY-6**

**February 12, 2019**

# Notebook 3 - Document Vectors

### Table of Contents

[**3.1 Gensim Doc2Vec**](#3.1-Gensim-Doc2Vec)
- [3.1.1 Create Tagged Documents](#3.1.1-Create-Tagged-Documents)
- [3.1.2 Build Doc2Vec Models](#3.1.2-Build-Doc2Vec-Models)

[**3.2 Doc2Vec Features for Corpora**](#3.2-Doc2Vec-Features-for-Corpora)
- [3.2.1 3.2.1 Combine Sentence and Paragraph Vectors](#3.2.1-Combine-Sentence-and-Paragraph-Vectors)
- [3.2.2 Combine Vectors with LDA Features](#3.2.2-Combine-Vectors-with-LDA-Features)

**Libraries**

In [1]:
# Python Data Science
import re
import ast
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

# Natural Language Processing
import spacy
from nltk.stem import PorterStemmer

# Gensim
import gensim
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, ldamulticore, CoherenceModel
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
import pyLDAvis.gensim

# Modeling Prep
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Override deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## 3.1 Gensim Doc2Vec

**Load Preprocessed Text Data**

In [2]:
nlp_df = pd.read_csv('../data_eda/nlp_df.csv')
t_nlp_df = pd.read_csv('../data_eda/t_nlp_df.csv')

In [3]:
wiki_corpus = api.load('wiki-english-20171001')

In [4]:
wiki_data = [d for d in tqdm(wiki_corpus)]

4924894it [08:23, 9787.73it/s] 


### 3.1.1 Create Tagged Documents

In [5]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

In [6]:
wiki_tag = list(create_tagged_document(wiki_data))

In [7]:
len(wiki_tag)

4924894

**Sentence Vectors**

In [8]:
d2v_s = gensim.models.doc2vec.Doc2Vec(vector_size=32, min_count=2, epochs=5)

In [9]:
d2v_s.build_vocab(wiki_tag)

**Paragraph Vectors**

In [10]:
d2v_p = gensim.models.doc2vec.Doc2Vec(vector_size=16, min_count=2, epochs=5)

In [11]:
d2v_p.build_vocab(wiki_tag)

### 3.1.2 Build Doc2Vec Models

**Sentence Model**

In [12]:
d2v_s.train(wiki_tag, total_examples=d2v_s.corpus_count, epochs=d2v_s.epochs)

In [13]:
d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in nlp_df.sent_lemma]:
    d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [14]:
len(d2v_s_vecs)

63932

In [15]:
len(d2v_s_vecs[0])

32

In [16]:
d2v_s_vecs[0]

array([ 0.00514079,  0.00756313, -0.00511922,  0.00348715,  0.01475863,
       -0.00040127, -0.01314587, -0.01083631,  0.01318629,  0.00035891,
       -0.00045937,  0.00500179, -0.00523515,  0.00804647, -0.00362201,
       -0.01548954, -0.01027478, -0.01155843,  0.01347735,  0.00225375,
       -0.00243251, -0.00711353, -0.00331062,  0.00680516, -0.00090451,
       -0.01197521, -0.00331335,  0.00697181, -0.00226184,  0.00126615,
       -0.0099524 ,  0.00706321], dtype=float32)

Testing Text

In [17]:
t_d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in t_nlp_df.sent_lemma]:
    t_d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [18]:
len(t_d2v_s_vecs)

7935

In [19]:
len(t_d2v_s_vecs[0])

32

**Paragraph Model**

In [20]:
d2v_p.train(wiki_tag, total_examples=d2v_p.corpus_count, epochs=d2v_p.epochs)

In [21]:
d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in nlp_df.par_lemma]:
    d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [22]:
len(d2v_p_vecs)

63932

In [23]:
len(d2v_p_vecs[0])

16

Testing Text

In [24]:
t_d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in t_nlp_df.par_lemma]:
    t_d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [25]:
len(t_d2v_p_vecs)

7935

In [26]:
len(t_d2v_p_vecs[0])

16

**Save to Disk**

In [27]:
d2v_s_file = open('../models/d2v_s.pkl','wb')
pickle.dump(d2v_s, d2v_s_file)
d2v_s_file.close()

In [28]:
d2v_p_file = open('../models/d2v_p.pkl','wb')
pickle.dump(d2v_p, d2v_p_file)
d2v_p_file.close()

## 3.2 Doc2Vec Features for Corpora

### 3.2.1 Combine Sentence and Paragraph Vectors

In [29]:
# Feature names
s_vec_cols = ['s_vec_'+str(i) for i in range(len(d2v_s_vecs[0]))]
p_vec_cols = ['p_vec_'+str(j) for j in range(len(d2v_p_vecs[0]))]

In [30]:
vec_df = pd.DataFrame(d2v_s_vecs, columns=s_vec_cols)
vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,0.005141,0.007563,-0.005119,0.003487,0.014759,-0.000401,-0.013146,-0.010836,0.013186,0.000359,...,-0.003311,0.006805,-0.000905,-0.011975,-0.003313,0.006972,-0.002262,0.001266,-0.009952,0.007063
1,0.000672,-0.008255,0.004202,0.012191,-0.008325,0.010428,-0.010309,0.007246,0.000948,0.013675,...,-0.001058,0.004864,-0.010299,0.013718,0.003485,0.012364,0.007462,-0.000469,-9.7e-05,0.014184
2,0.009192,0.0038,0.004282,0.003132,0.00984,0.002502,-0.015351,-0.012835,0.005541,-0.002684,...,0.000799,-0.002421,0.000578,-0.009566,-0.00256,0.00978,-0.009652,-0.000443,-0.006498,0.004068
3,0.012862,0.005095,0.007222,-0.000288,0.005181,0.011893,-0.000838,-0.004347,0.000892,-0.013354,...,0.006634,0.007733,0.000416,-0.008731,-0.006828,0.012731,-0.00645,0.007383,-0.013415,0.003337
4,0.005489,-0.004147,-0.00346,-0.008962,0.004228,-0.00099,-0.011614,-0.008368,0.003744,0.005757,...,-0.014279,-0.006709,0.008639,-0.003609,0.014656,-0.008735,-0.00722,0.001693,-0.001438,0.001204


In [31]:
p_vec_df = pd.DataFrame(d2v_p_vecs, columns=p_vec_cols)
p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15
0,0.015278,0.005593,-0.007604,-0.002626,7.5e-05,-0.010657,0.009646,-0.01744,-0.01992,-0.012895,0.026869,-0.008289,0.013262,0.026985,0.005105,-0.026231
1,0.015278,0.005593,-0.007604,-0.002626,7.5e-05,-0.010657,0.009646,-0.01744,-0.01992,-0.012895,0.026869,-0.008289,0.013262,0.026985,0.005105,-0.026231
2,0.015278,0.005593,-0.007604,-0.002626,7.5e-05,-0.010657,0.009646,-0.01744,-0.01992,-0.012895,0.026869,-0.008289,0.013262,0.026985,0.005105,-0.026231
3,-0.00248,0.015847,0.009259,-0.011517,0.004692,0.00419,-0.013091,-0.021624,-0.021936,0.027087,-0.018606,0.017486,-0.029226,-0.012035,0.017496,0.009983
4,-0.00248,0.015847,0.009259,-0.011517,0.004692,0.00419,-0.013091,-0.021624,-0.021936,0.027087,-0.018606,0.017486,-0.029226,-0.012035,0.017496,0.009983


In [32]:
for col_name in p_vec_cols:
    vec_df[col_name] = p_vec_df[col_name]

In [33]:
vec_df.shape

(63932, 48)

Testing Texts

In [34]:
t_vec_df = pd.DataFrame(t_d2v_s_vecs, columns=s_vec_cols)
t_vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,0.012852,-0.000564,0.000924,0.015578,-0.00946,-0.001825,0.006729,0.014028,-0.012019,-0.005941,...,-0.008684,-0.0054,-6e-05,-0.014494,0.008053,-0.01484,-0.005764,0.015105,-0.000736,0.01127
1,0.014496,-0.015362,-0.008466,0.012413,0.012671,-0.000903,0.01553,-0.00583,-0.007259,0.014899,...,-0.007146,-0.009037,-0.001791,-0.005722,0.00626,-0.007856,-0.015266,-0.008989,0.009174,0.00829
2,-0.005569,0.010911,0.007,0.004631,0.000567,0.012376,0.001706,-0.012309,0.000371,0.012049,...,0.009573,0.009243,-0.013573,-0.01391,-0.014415,-0.004039,-0.004773,0.006494,-0.014571,0.007027
3,0.01232,-0.007913,-0.014627,-0.005808,-0.004552,0.002174,-0.012969,0.000185,-0.009987,-0.009841,...,0.000689,-0.01339,-0.001545,-0.007197,-0.008211,-0.007027,-0.004479,0.010525,-0.005323,0.01
4,0.001876,-0.003734,-0.006766,-0.008286,0.006277,-0.012269,-0.00385,-0.011094,0.013328,0.01174,...,0.012417,-0.003519,0.00817,0.013473,-0.015407,-0.014435,0.003849,0.003812,0.005256,-0.013906


In [35]:
t_p_vec_df = pd.DataFrame(t_d2v_p_vecs, columns=p_vec_cols)
t_p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15
0,0.025705,-0.001128,0.001848,0.031156,-0.018919,-0.003649,0.013457,0.028057,-0.024037,-0.011881,0.018372,0.013929,-0.028593,0.030767,0.003707,-0.001779
1,0.013783,0.018774,0.024949,0.012319,-0.005295,-0.016047,-0.009274,-0.00765,-0.001096,0.030937,-0.027963,0.006029,0.011325,0.028024,0.029917,0.030057
2,0.013783,0.018774,0.024949,0.012319,-0.005295,-0.016047,-0.009274,-0.00765,-0.001096,0.030937,-0.027963,0.006029,0.011325,0.028024,0.029917,0.030057
3,0.02464,-0.015825,-0.029253,-0.011617,-0.009103,0.004347,-0.025937,0.00037,-0.019974,-0.019681,0.001572,-0.006201,0.016997,-0.001124,-0.003639,-0.022111
4,0.003752,-0.007468,-0.013531,-0.016572,0.012554,-0.024539,-0.007701,-0.022187,0.026657,0.02348,-0.014577,0.012544,0.027701,-0.005998,-0.000147,0.010758


In [36]:
for col_name in p_vec_cols:
    t_vec_df[col_name] = t_p_vec_df[col_name]

In [37]:
t_vec_df.shape

(7935, 48)

**Save Vector DataFrames to Disk**

In [38]:
vec_df.to_csv('../data_vec/vec_df.csv', index=False)

In [39]:
t_vec_df.to_csv('../data_vec/t_vec_df.csv', index=False)

In [40]:
vec_df = pd.read_csv('../data_vec/vec_df.csv')

In [41]:
t_vec_df = pd.read_csv('../data_vec/t_vec_df.csv')

### 3.2.2 Combine Vectors with LDA Features

**Load LDA Features**

In [42]:
lda_train = pd.read_csv('../data_vec/lda_train.csv')
lda_test = pd.read_csv('../data_vec/lda_test.csv')

In [43]:
prep_train = pd.merge(vec_df, lda_train, right_index=True, left_index=True)
prep_test = pd.merge(t_vec_df, lda_test, right_index=True, left_index=True)

In [44]:
prep_train.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,p9_lda_civil,p10_lda_deadites,p11_lda_idea,p12_lda_love,p13_lda_sense,p14_lda_biomolecular,p15_lda_thee,a_num,p_num,s_num
0,0.005141,0.007563,-0.005119,0.003487,0.014759,-0.000401,-0.013146,-0.010836,0.013186,0.000359,...,0.012991,0.012991,0.80513,0.012991,0.012991,0.012991,0.012991,0,0,0
1,0.000672,-0.008255,0.004202,0.012191,-0.008325,0.010428,-0.010309,0.007246,0.000948,0.013675,...,0.012991,0.012991,0.80513,0.012991,0.012991,0.012991,0.012991,0,0,1
2,0.009192,0.0038,0.004282,0.003132,0.00984,0.002502,-0.015351,-0.012835,0.005541,-0.002684,...,0.012991,0.012991,0.80513,0.012991,0.012991,0.012991,0.012991,0,0,2
3,0.012862,0.005095,0.007222,-0.000288,0.005181,0.011893,-0.000838,-0.004347,0.000892,-0.013354,...,0.013657,0.013657,0.013657,0.013657,0.736205,0.013657,0.013657,0,1,0
4,0.005489,-0.004147,-0.00346,-0.008962,0.004228,-0.00099,-0.011614,-0.008368,0.003744,0.005757,...,0.013657,0.013657,0.013657,0.013657,0.73583,0.013657,0.013657,0,1,1


In [45]:
prep_train.shape

(63932, 99)

In [46]:
prep_test.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,p9_lda_civil,p10_lda_deadites,p11_lda_idea,p12_lda_love,p13_lda_sense,p14_lda_biomolecular,p15_lda_thee,a_num,p_num,s_num
0,0.012852,-0.000564,0.000924,0.015578,-0.00946,-0.001825,0.006729,0.014028,-0.012019,-0.005941,...,0.012556,0.012556,0.012556,0.012556,0.267877,0.012556,0.012556,0,0,0
1,0.014496,-0.015362,-0.008466,0.012413,0.012671,-0.000903,0.01553,-0.00583,-0.007259,0.014899,...,0.011455,0.011455,0.147549,0.011455,0.011455,0.011455,0.011455,0,1,0
2,-0.005569,0.010911,0.007,0.004631,0.000567,0.012376,0.001706,-0.012309,0.000371,0.012049,...,0.011455,0.011455,0.147652,0.011455,0.011455,0.011455,0.011455,0,1,1
3,0.01232,-0.007913,-0.014627,-0.005808,-0.004552,0.002174,-0.012969,0.000185,-0.009987,-0.009841,...,0.012644,0.012644,0.012644,0.012644,0.012644,0.012644,0.012644,0,2,0
4,0.001876,-0.003734,-0.006766,-0.008286,0.006277,-0.012269,-0.00385,-0.011094,0.013328,0.01174,...,0.015147,0.015147,0.015147,0.015147,0.015148,0.015147,0.015147,0,3,0


In [47]:
prep_test.shape

(7935, 99)

**Save Train and Test Features to Disk**

In [48]:
prep_train.to_csv('../data_vec/prep_train.csv', index=False)
prep_test.to_csv('../data_vec/prep_test.csv', index=False)

## Continue to Notebook 4: Multiclass Classification Models