# Capstone: Philosophical Factors for NLP
**_Measuring Similarity to Philosophical Concepts in Text Data_**

## Thomas W. Ludlow, Jr.
**General Assembly Data Science Immersive DSI-NY-6**

**February 12, 2019**

# Notebook 3 - Document Vectors

### Table of Contents

[**3.1 Gensim Doc2Vec**](#3.1-Gensim-Doc2Vec)
- [3.1.1 Create Tagged Documents](#3.1.1-Create-Tagged-Documents)
- [3.1.2 Build Doc2Vec Models](#3.1.2-Build-Doc2Vec-Models)

[**3.2 Doc2Vec Features for Corpora**](#3.2-Doc2Vec-Features-for-Corpora)
- [3.2.1 3.2.1 Combine Sentence and Paragraph Vectors](#3.2.1-Combine-Sentence-and-Paragraph-Vectors)
- [3.2.2 Combine Vectors with LDA Features](#3.2.2-Combine-Vectors-with-LDA-Features)

**Libraries**

In [1]:
# Python Data Science
import re
import ast
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

# Natural Language Processing
import spacy
from nltk.stem import PorterStemmer

# Gensim
import gensim
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, ldamulticore, CoherenceModel
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
import pyLDAvis.gensim

# Modeling Prep
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Override deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## 3.1 Gensim Doc2Vec

**Load Preprocessed Text Data**

In [2]:
nlp_df = pd.read_csv('../data_eda/nlp_df.csv')
t_nlp_df = pd.read_csv('../data_eda/t_nlp_df.csv')

In [3]:
text8_corpus = api.load('text8')

In [4]:
text8_data = [d for d in text8_corpus]

In [5]:
wiki_corpus = api.load('wiki-english-20171001')

In [6]:
wiki_data = [d for d in tqdm(wiki_corpus)]

4924894it [08:41, 9438.02it/s] 


### 3.1.1 Create Tagged Documents

In [7]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

In [8]:
wiki_tag = list(create_tagged_document(wiki_data))

In [9]:
len(wiki_tag)

4924894

**Sentence Vectors**

In [10]:
d2v_s = gensim.models.doc2vec.Doc2Vec(vector_size=32, min_count=2, epochs=5)

In [11]:
d2v_s.build_vocab(wiki_tag)

**Paragraph Vectors**

In [12]:
d2v_p = gensim.models.doc2vec.Doc2Vec(vector_size=18, min_count=2, epochs=5)

In [13]:
d2v_p.build_vocab(wiki_tag)

### 3.1.2 Build Doc2Vec Models

**Sentence Model**

In [14]:
d2v_s.train(wiki_tag, total_examples=d2v_s.corpus_count, epochs=d2v_s.epochs)

In [15]:
d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in nlp_df.sent_lemma]:
    d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [16]:
len(d2v_s_vecs)

74115

In [17]:
len(d2v_s_vecs[0])

32

In [18]:
d2v_s_vecs[0]

array([ 0.01387677, -0.0121483 ,  0.00700643, -0.01539578, -0.00265582,
       -0.00348972,  0.01171596,  0.00422984,  0.00690689,  0.00254729,
        0.01471115, -0.00755273, -0.00990111, -0.00768929,  0.00917837,
        0.00609836,  0.01111884, -0.00650711,  0.00322653,  0.00478539,
        0.00688467,  0.00633643, -0.00989119, -0.01206473, -0.0112613 ,
        0.00740096,  0.00755055, -0.01261828,  0.01227136, -0.0049176 ,
       -0.00609248, -0.00477354], dtype=float32)

Testing Text

In [19]:
t_d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in t_nlp_df.sent_lemma]:
    t_d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [20]:
len(t_d2v_s_vecs)

8870

In [21]:
len(t_d2v_s_vecs[0])

32

**Paragraph Model**

In [15]:
d2v_p.train(wiki_tag, total_examples=d2v_p.corpus_count, epochs=d2v_p.epochs)

In [23]:
d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in nlp_df.par_lemma]:
    d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [24]:
len(d2v_p_vecs)

74115

In [25]:
len(d2v_p_vecs[0])

18

Testing Text

In [26]:
t_d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in t_nlp_df.par_lemma]:
    t_d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [27]:
len(t_d2v_p_vecs)

8870

In [28]:
len(t_d2v_p_vecs[0])

18

**Save to Disk**

In [17]:
d2v_s_file = open('../models/d2v_s','wb')
pickle.dump(d2v_s, d2v_s_file)
d2v_s_file.close()

In [18]:
d2v_p_file = open('../models/d2v_p','wb')
pickle.dump(d2v_p, d2v_p_file)
d2v_p_file.close()

## 3.2 Doc2Vec Features for Corpora

### 3.2.1 Combine Sentence and Paragraph Vectors

In [29]:
# Feature names
s_vec_cols = ['s_vec_'+str(i) for i in range(len(d2v_s_vecs[0]))]
p_vec_cols = ['p_vec_'+str(j) for j in range(len(d2v_p_vecs[0]))]

In [30]:
vec_df = pd.DataFrame(d2v_s_vecs, columns=s_vec_cols)
vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,0.013877,-0.012148,0.007006,-0.015396,-0.002656,-0.00349,0.011716,0.00423,0.006907,0.002547,...,-0.009891,-0.012065,-0.011261,0.007401,0.007551,-0.012618,0.012271,-0.004918,-0.006092,-0.004774
1,-0.015537,-0.014925,-0.003562,0.007443,-0.003987,-0.000591,0.015054,0.01224,-0.015296,0.011213,...,0.000181,0.004094,0.000516,-0.004944,-0.003514,-0.000628,-0.007596,0.013407,0.015258,-0.012898
2,0.006117,0.000151,0.008452,-0.000479,-0.014821,0.008976,-0.01349,-0.006188,-0.013559,-0.002593,...,-0.012294,-0.000628,0.007572,0.002907,-0.010185,-0.006701,-0.006174,0.005899,-0.014035,0.007012
3,0.003889,0.011431,0.007139,0.012332,-0.00044,-0.009316,-0.000799,0.000181,-0.012577,-0.013329,...,-0.014215,-0.001203,0.009691,-0.006956,0.000836,-0.008287,0.011692,-0.008896,-0.015057,-0.000123
4,-0.00619,0.011925,0.012945,-0.004754,-0.004691,0.004129,-0.015233,-0.010919,0.009673,-0.005051,...,0.008736,-0.004417,-0.011493,0.008998,0.004071,-0.002987,-0.000368,0.012635,0.015053,-0.004915


In [31]:
p_vec_df = pd.DataFrame(d2v_p_vecs, columns=p_vec_cols)
p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15,p_vec_16,p_vec_17
0,0.007508,-0.008221,-0.019116,-0.00525,0.001566,0.012003,-0.008436,-0.019355,-0.009274,0.018805,-0.014826,0.024534,-0.026125,-0.018985,-0.012861,0.023923,0.015246,0.018545
1,0.007508,-0.008221,-0.019116,-0.00525,0.001566,0.012003,-0.008436,-0.019355,-0.009274,0.018805,-0.014826,0.024534,-0.026125,-0.018985,-0.012861,0.023923,0.015246,0.018545
2,0.007508,-0.008221,-0.019116,-0.00525,0.001566,0.012003,-0.008436,-0.019355,-0.009274,0.018805,-0.014826,0.024534,-0.026125,-0.018985,-0.012861,0.023923,0.015246,0.018545
3,-0.012694,0.006602,-0.012751,0.012468,0.02046,-0.008088,-0.026354,-0.027765,0.025904,0.016994,0.006162,0.019634,0.017726,0.020254,0.01865,0.026383,0.023289,-0.010726
4,-0.012694,0.006602,-0.012751,0.012468,0.02046,-0.008088,-0.026354,-0.027765,0.025904,0.016994,0.006162,0.019634,0.017726,0.020254,0.01865,0.026383,0.023289,-0.010726


In [32]:
for col_name in p_vec_cols:
    vec_df[col_name] = p_vec_df[col_name]

In [33]:
vec_df.shape

(74115, 50)

Testing Texts

In [34]:
t_vec_df = pd.DataFrame(t_d2v_s_vecs, columns=s_vec_cols)
t_vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,8e-06,-0.008943,-0.008056,-0.015117,0.001085,0.000351,0.007255,-0.013254,-0.003751,-0.013496,...,0.015456,-0.011774,0.003357,0.006641,0.003017,-0.011106,-0.008946,-0.010291,0.014659,-0.010295
1,0.001525,0.006725,0.003211,0.001403,-0.002386,0.004559,-0.00195,0.012243,0.014489,-0.003642,...,-0.001204,0.008767,-0.011929,0.004373,-0.011145,0.013896,0.000683,-0.002667,-0.007358,0.00857
2,-0.00548,0.0137,0.010534,-0.001587,-0.010978,0.004592,-0.012572,0.008551,0.012623,-0.000427,...,0.006316,-0.009491,-0.004088,-0.015113,-0.002937,-0.013021,-0.004485,-0.001282,0.015381,-0.012993
3,0.008601,0.012692,0.002467,-0.011486,-0.014021,-0.013896,0.003447,0.004098,0.011823,0.002959,...,0.001026,-0.001139,0.015386,-6.4e-05,0.011331,0.005836,-0.002353,0.011362,-0.001275,-0.014506
4,0.013156,0.008079,0.006109,0.010705,0.007149,0.015479,0.013234,-0.003879,-0.004706,-0.006246,...,-0.002134,0.004156,-0.013784,0.002654,0.013832,-0.001058,-0.007675,0.010734,-0.012753,0.009998


In [35]:
t_p_vec_df = pd.DataFrame(t_d2v_p_vecs, columns=p_vec_cols)
t_p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15,p_vec_16,p_vec_17
0,1.5e-05,-0.015899,-0.014323,-0.026875,0.001928,0.000625,0.012898,-0.023562,-0.006669,-0.023993,-0.005121,0.00044,0.026449,0.026704,-0.026971,0.015114,-0.014864,0.00792
1,1.5e-05,-0.015899,-0.014323,-0.026875,0.001928,0.000625,0.012898,-0.023562,-0.006669,-0.023993,-0.005121,0.00044,0.026449,0.026704,-0.026971,0.015114,-0.014864,0.00792
2,0.00124,0.020633,-0.003893,0.00228,-0.003181,-0.018474,-0.013665,0.00641,-0.019669,1.7e-05,0.003914,-0.004053,0.01785,-0.017501,-0.026088,0.003198,-0.004491,-0.01951
3,0.00124,0.020633,-0.003893,0.00228,-0.003181,-0.018474,-0.013665,0.00641,-0.019669,1.7e-05,0.003914,-0.004053,0.01785,-0.017501,-0.026088,0.003198,-0.004491,-0.01951
4,0.023388,0.014362,0.01086,0.019032,0.01271,0.027518,0.023526,-0.006895,-0.008366,-0.011105,0.007186,0.018796,0.019055,0.001408,-0.019603,0.018545,0.017376,0.014727


In [36]:
for col_name in p_vec_cols:
    t_vec_df[col_name] = t_p_vec_df[col_name]

In [37]:
t_vec_df.shape

(8870, 50)

**Save Vector DataFrames to Disk**

In [38]:
vec_df.to_csv('../data_vec/vec_df.csv', index=False)

In [39]:
t_vec_df.to_csv('../data_vec/t_vec_df.csv', index=False)

In [40]:
vec_df = pd.read_csv('../data_vec/vec_df.csv')

In [41]:
t_vec_df = pd.read_csv('../data_vec/t_vec_df.csv')

### 3.2.2 Combine Vectors with LDA Features

**Load LDA Features**

In [42]:
lda_train = pd.read_csv('../data_vec/lda_train.csv')
lda_test = pd.read_csv('../data_vec/lda_test.csv')

In [43]:
prep_train = pd.merge(vec_df, lda_train, right_index=True, left_index=True)
prep_test = pd.merge(t_vec_df, lda_test, right_index=True, left_index=True)

In [44]:
prep_train.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,p11_lda_consciousness,p12_lda_god,p13_lda_mankind,p14_lda_weekend,p15_lda_pleasure,p16_lda_opacity,p17_lda_downside,a_num,p_num,s_num
0,0.013877,-0.012148,0.007006,-0.015396,-0.002656,-0.00349,0.011716,0.00423,0.006907,0.002547,...,0.011534,0.011534,0.011534,0.011534,0.011534,0.011534,0.011534,0,0,0
1,-0.015537,-0.014925,-0.003562,0.007443,-0.003987,-0.000591,0.015054,0.01224,-0.015296,0.011213,...,0.011534,0.011534,0.011534,0.011534,0.011534,0.011534,0.011534,0,0,1
2,0.006117,0.000151,0.008452,-0.000479,-0.014821,0.008976,-0.01349,-0.006188,-0.013559,-0.002593,...,0.011534,0.011534,0.011534,0.011534,0.011534,0.011534,0.011534,0,0,2
3,0.003889,0.011431,0.007139,0.012332,-0.00044,-0.009316,-0.000799,0.000181,-0.012577,-0.013329,...,0.012096,0.012096,0.012096,0.012096,0.012096,0.012096,0.012096,0,1,0
4,-0.00619,0.011925,0.012945,-0.004754,-0.004691,0.004129,-0.015233,-0.010919,0.009673,-0.005051,...,0.012096,0.012096,0.012096,0.012096,0.012096,0.012096,0.012096,0,1,1


In [45]:
prep_train.shape

(74115, 103)

In [46]:
prep_test.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,p11_lda_consciousness,p12_lda_god,p13_lda_mankind,p14_lda_weekend,p15_lda_pleasure,p16_lda_opacity,p17_lda_downside,a_num,p_num,s_num
0,8e-06,-0.008943,-0.008056,-0.015117,0.001085,0.000351,0.007255,-0.013254,-0.003751,-0.013496,...,0.011146,0.011146,0.011146,0.011146,0.011146,0.011146,0.011146,0,0,0
1,0.001525,0.006725,0.003211,0.001403,-0.002386,0.004559,-0.00195,0.012243,0.014489,-0.003642,...,0.011146,0.011146,0.011146,0.011146,0.011146,0.011146,0.011146,0,0,1
2,-0.00548,0.0137,0.010534,-0.001587,-0.010978,0.004592,-0.012572,0.008551,0.012623,-0.000427,...,0.010188,0.010188,0.010188,0.010188,0.010188,0.010188,0.054689,0,1,0
3,0.008601,0.012692,0.002467,-0.011486,-0.014021,-0.013896,0.003447,0.004098,0.011823,0.002959,...,0.010188,0.010188,0.010188,0.010188,0.010188,0.010188,0.055025,0,1,1
4,0.013156,0.008079,0.006109,0.010705,0.007149,0.015479,0.013234,-0.003879,-0.004706,-0.006246,...,0.011233,0.011233,0.011233,0.011233,0.011233,0.011233,0.011233,0,2,0


In [47]:
prep_test.shape

(8870, 103)

**Save Train and Test Features to Disk**

In [48]:
prep_train.to_csv('../data_vec/prep_train.csv', index=False)
prep_test.to_csv('../data_vec/prep_test.csv', index=False)

## Continue to Notebook 4: Multiclass Classification Models