# Capstone: Philosophical Factors for NLP
**_Measuring Similarity to Philosophical Concepts in Text Data_**

## Thomas W. Ludlow, Jr.
**General Assembly Data Science Immersive DSI-NY-6**

**February 12, 2019**

# Notebook 3 - Document Vectors

### Table of Contents

[**3.1 Gensim Doc2Vec**](#3.1-Gensim-Doc2Vec)
- [3.1.1 Create Tagged Documents](#3.1.1-Create-Tagged-Documents)
- [3.1.2 Build Doc2Vec Models](#3.1.2-Build-Doc2Vec-Models)

[**3.2 Doc2Vec Features for Corpora**](#3.2-Doc2Vec-Features-for-Corpora)
- [3.2.1 3.2.1 Combine Sentence and Paragraph Vectors](#3.2.1-Combine-Sentence-and-Paragraph-Vectors)
- [3.2.2 Combine Vectors with LDA Features](#3.2.2-Combine-Vectors-with-LDA-Features)

**Libraries**

In [1]:
# Python Data Science
import re
import ast
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# Natural Language Processing
import spacy
from nltk.stem import PorterStemmer

# Gensim
import gensim
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, ldamulticore, CoherenceModel
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pyLDAvis.gensim

# Modeling Prep
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Override deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## 3.1 Gensim Doc2Vec

**Load Preprocessed Text Data**

In [2]:
nlp_df = pd.read_csv('../data_eda/nlp_df.csv')
t_nlp_df = pd.read_csv('../data_eda/t_nlp_df.csv')

In [3]:
text8_corpus = api.load('text8')

In [4]:
text8_data = [d for d in text8_corpus]

In [5]:
wiki_corpus = api.load('wiki-english-20171001')

In [6]:
wiki_data = [d for d in tqdm(wiki_corpus)]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### 3.1.1 Create Tagged Documents

In [7]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

In [8]:
wiki_tag = list(create_tagged_document(wiki_data))

In [9]:
len(wiki_tag)

4924894

**Sentence Vectors**

In [10]:
d2v_s = gensim.models.doc2vec.Doc2Vec(vector_size=32, min_count=2, epochs=5)

In [11]:
d2v_s.build_vocab(wiki_tag)

**Paragraph Vectors**

In [12]:
d2v_p = gensim.models.doc2vec.Doc2Vec(vector_size=16, min_count=2, epochs=5)

In [13]:
d2v_p.build_vocab(wiki_tag)

### 3.1.2 Build Doc2Vec Models

**Sentence Model**

In [14]:
d2v_s.train(wiki_tag, total_examples=d2v_s.corpus_count, epochs=d2v_s.epochs)

In [15]:
d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in nlp_df.sent_lemma]:
    d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [16]:
len(d2v_s_vecs)

70922

In [17]:
len(d2v_s_vecs[0])

32

In [18]:
d2v_s_vecs[0]

array([ 0.00385751, -0.00105863,  0.01090039,  0.00760083, -0.00016888,
        0.01177285,  0.00322402,  0.00500347, -0.00305098, -0.00792306,
       -0.01459706, -0.00332637, -0.00256728,  0.01545913,  0.00960032,
        0.00344929,  0.01180919, -0.00051465, -0.01484104, -0.01062763,
        0.00904354,  0.00088135,  0.00616374,  0.00067539, -0.0051318 ,
       -0.00227692,  0.00019254,  0.01195077, -0.00568404,  0.00389648,
        0.00915742, -0.00359113], dtype=float32)

Testing Text

In [19]:
t_d2v_s_vecs = []

for sent_vec in [ast.literal_eval(sent) for sent in t_nlp_df.sent_lemma]:
    t_d2v_s_vecs.append(d2v_s.infer_vector(sent_vec))

In [20]:
len(t_d2v_s_vecs)

8395

In [21]:
len(t_d2v_s_vecs[0])

32

**Paragraph Model**

In [22]:
d2v_p.train(wiki_tag, total_examples=d2v_p.corpus_count, epochs=d2v_p.epochs)

In [23]:
d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in nlp_df.par_lemma]:
    d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [24]:
len(d2v_p_vecs)

70922

In [25]:
len(d2v_p_vecs[0])

16

Testing Text

In [26]:
t_d2v_p_vecs = []

for par_vec in [ast.literal_eval(par) for par in t_nlp_df.par_lemma]:
    t_d2v_p_vecs.append(d2v_p.infer_vector(par_vec))

In [27]:
len(t_d2v_p_vecs)

8395

In [28]:
len(t_d2v_p_vecs[0])

16

## 3.2 Doc2Vec Features for Corpora

### 3.2.1 Combine Sentence and Paragraph Vectors

In [29]:
# Feature names
s_vec_cols = ['s_vec_'+str(i) for i in range(len(d2v_s_vecs[0]))]
p_vec_cols = ['p_vec_'+str(j) for j in range(len(d2v_p_vecs[0]))]

In [30]:
vec_df = pd.DataFrame(d2v_s_vecs, columns=s_vec_cols)
vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,0.003858,-0.001059,0.0109,0.007601,-0.000169,0.011773,0.003224,0.005003,-0.003051,-0.007923,...,0.006164,0.000675,-0.005132,-0.002277,0.000193,0.011951,-0.005684,0.003896,0.009157,-0.003591
1,-0.013389,0.01257,0.008184,-0.002391,-0.010678,0.010813,-0.007665,0.002444,0.003449,-0.014223,...,0.01318,0.001904,0.005881,0.001613,0.012513,0.001844,-0.011092,-0.006049,-0.004544,0.001458
2,-0.000618,0.006237,0.005052,0.010632,-0.007403,-0.006426,-0.005638,-0.002501,0.008501,0.015035,...,-0.001421,-0.012025,-0.008904,-0.001158,0.011842,-0.006222,0.011366,-0.000743,-0.012555,0.001841
3,-0.003668,-0.013479,0.004037,0.015057,0.009767,-0.004177,-0.010813,-0.002221,0.007194,-0.007292,...,0.014247,-0.005851,0.01426,0.004263,0.001678,0.01279,-0.007219,-0.014648,-0.002402,0.001446
4,-0.011028,0.015561,-0.015301,-0.000413,0.000686,-0.007448,-0.004838,0.002938,0.001321,0.01545,...,0.009706,-0.00414,-3.2e-05,-0.006838,0.004089,0.011523,-0.01544,0.008162,0.002061,-0.014123


In [31]:
p_vec_df = pd.DataFrame(d2v_p_vecs, columns=p_vec_cols)
p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15
0,-0.023799,-0.030376,-0.009521,-0.025856,0.004975,0.019548,0.005511,0.021429,0.000847,0.01574,0.008186,0.011306,-0.028358,-0.026734,0.013352,-0.003708
1,-0.023799,-0.030376,-0.009521,-0.025856,0.004975,0.019548,0.005511,0.021429,0.000847,0.01574,0.008186,0.011306,-0.028358,-0.026734,0.013352,-0.003708
2,-0.023799,-0.030376,-0.009521,-0.025856,0.004975,0.019548,0.005511,0.021429,0.000847,0.01574,0.008186,0.011306,-0.028358,-0.026734,0.013352,-0.003708
3,-0.006215,0.014121,0.001989,0.005874,0.014431,0.016977,-0.010235,0.004929,-0.010111,0.002403,0.005654,-0.024702,0.012832,-0.001034,-0.022984,0.030666
4,-0.006215,0.014121,0.001989,0.005874,0.014431,0.016977,-0.010235,0.004929,-0.010111,0.002403,0.005654,-0.024702,0.012832,-0.001034,-0.022984,0.030666


In [32]:
for col_name in p_vec_cols:
    vec_df[col_name] = p_vec_df[col_name]

In [33]:
vec_df.shape

(70922, 48)

Testing Texts

In [34]:
t_vec_df = pd.DataFrame(t_d2v_s_vecs, columns=s_vec_cols)
t_vec_df.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,s_vec_22,s_vec_23,s_vec_24,s_vec_25,s_vec_26,s_vec_27,s_vec_28,s_vec_29,s_vec_30,s_vec_31
0,-0.006376,-0.014168,-0.01064,-0.004532,0.013724,0.005481,0.0003,0.010631,0.000568,-0.00753,...,-0.009673,0.00499,0.014329,0.009878,0.015131,-0.006123,0.007423,-0.010709,0.001111,0.005326
1,-0.005234,0.014281,-0.012951,-0.007159,-0.004458,0.013147,-0.004553,0.003022,0.006354,0.001301,...,0.005348,0.015044,0.005215,0.015615,0.009286,-0.010057,0.002151,0.007745,-0.00922,-0.00922
2,0.007182,0.011417,-0.003972,0.00176,-0.007267,0.010004,0.008006,0.006922,-0.010596,0.008708,...,-0.01233,-0.005333,0.00867,-0.001814,-0.002962,0.013703,0.005386,-0.005223,0.010753,-0.012451
3,-0.009829,0.010973,-0.0148,-0.004037,0.00342,0.003011,-0.013762,0.015622,0.00829,0.010655,...,-0.011648,-0.002401,0.013344,-0.010347,0.014583,-0.012783,0.006187,0.003055,0.006385,0.002331
4,-0.015248,0.003,-0.003895,0.01044,0.007598,-0.015034,0.008169,-0.003571,-0.001063,-0.005591,...,0.009227,0.00874,0.011451,0.000257,0.003004,-0.006843,-0.009474,0.00547,0.007503,0.011693


In [35]:
t_p_vec_df = pd.DataFrame(t_d2v_p_vecs, columns=p_vec_cols)
t_p_vec_df.head()

Unnamed: 0,p_vec_0,p_vec_1,p_vec_2,p_vec_3,p_vec_4,p_vec_5,p_vec_6,p_vec_7,p_vec_8,p_vec_9,p_vec_10,p_vec_11,p_vec_12,p_vec_13,p_vec_14,p_vec_15
0,-0.012751,-0.028335,-0.02128,-0.009065,0.027448,0.010962,0.000601,0.021261,0.001135,-0.01506,-0.011794,0.007805,-0.030081,0.019921,-0.008623,-0.005035
1,-0.025024,-0.020902,-0.016591,-0.013759,-0.01382,0.009578,0.020207,0.023601,-0.019704,-0.030108,-0.029877,0.003285,0.020205,-0.026307,0.004863,-0.019466
2,-0.025024,-0.020902,-0.016591,-0.013759,-0.01382,0.009578,0.020207,0.023601,-0.019704,-0.030108,-0.029877,0.003285,0.020205,-0.026307,0.004863,-0.019466
3,-0.019657,0.021945,-0.029601,-0.008074,0.00684,0.006022,-0.027525,0.031244,0.01658,0.02131,-0.023342,-0.003345,0.003277,-0.000798,0.017349,-0.028373
4,-0.030497,0.006001,-0.007789,0.02088,0.015195,-0.030068,0.016337,-0.007143,-0.002125,-0.011182,-0.030776,0.008701,0.010304,0.028061,0.002584,-0.004973


In [36]:
for col_name in p_vec_cols:
    t_vec_df[col_name] = t_p_vec_df[col_name]

In [37]:
t_vec_df.shape

(8395, 48)

**Save Vector DataFrames to Disk**

In [38]:
vec_df.to_csv('../data_vec/vec_df.csv', index=False)

In [39]:
t_vec_df.to_csv('../data_vec/t_vec_df.csv', index=False)

In [3]:
vec_df = pd.read_csv('../data_vec/vec_df.csv')

In [4]:
t_vec_df = pd.read_csv('../data_vec/t_vec_df.csv')

### 3.2.2 Combine Vectors with LDA Features

**Load LDA Features**

In [12]:
lda_train = pd.read_csv('../data_vec/lda_train.csv')
lda_test = pd.read_csv('../data_vec/lda_test.csv')

In [13]:
prep_train = pd.merge(vec_df, lda_train, right_index=True, left_index=True)
prep_test = pd.merge(t_vec_df, lda_test, right_index=True, left_index=True)

In [14]:
prep_train.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,lda_p_13,lda_p_14,lda_p_15,lda_p_16,lda_p_17,lda_p_18,lda_p_19,a_num,p_num,s_num
0,0.003858,-0.001059,0.0109,0.007601,-0.000169,0.011773,0.003224,0.005003,-0.003051,-0.007923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,-0.013389,0.01257,0.008184,-0.002391,-0.010678,0.010813,-0.007665,0.002444,0.003449,-0.014223,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
2,-0.000618,0.006237,0.005052,0.010632,-0.007403,-0.006426,-0.005638,-0.002501,0.008501,0.015035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2
3,-0.003668,-0.013479,0.004037,0.015057,0.009767,-0.004177,-0.010813,-0.002221,0.007194,-0.007292,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,-0.011028,0.015561,-0.015301,-0.000413,0.000686,-0.007448,-0.004838,0.002938,0.001321,0.01545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1


In [15]:
prep_train.shape

(70922, 103)

In [16]:
prep_test.head()

Unnamed: 0,s_vec_0,s_vec_1,s_vec_2,s_vec_3,s_vec_4,s_vec_5,s_vec_6,s_vec_7,s_vec_8,s_vec_9,...,lda_p_13,lda_p_14,lda_p_15,lda_p_16,lda_p_17,lda_p_18,lda_p_19,a_num,p_num,s_num
0,-0.006376,-0.014168,-0.01064,-0.004532,0.013724,0.005481,0.0003,0.010631,0.000568,-0.00753,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,-0.005234,0.014281,-0.012951,-0.007159,-0.004458,0.013147,-0.004553,0.003022,0.006354,0.001301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
2,0.007182,0.011417,-0.003972,0.00176,-0.007267,0.010004,0.008006,0.006922,-0.010596,0.008708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
3,-0.009829,0.010973,-0.0148,-0.004037,0.00342,0.003011,-0.013762,0.015622,0.00829,0.010655,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0
4,-0.015248,0.003,-0.003895,0.01044,0.007598,-0.015034,0.008169,-0.003571,-0.001063,-0.005591,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,0


In [17]:
prep_test.shape

(8395, 103)

**Save Train and Test Features to Disk**

In [18]:
prep_train.to_csv('../data_vec/prep_train.csv', index=False)
prep_test.to_csv('../data_vec/prep_test.csv', index=False)

## Continue to Notebook 4: Multiclass Classification Models