# Capstone: Text Factorizing with NLP
## Thomas Ludlow

# 07 - Topic Vector Labeling

This notebook contains a method of automatically labeling LDA topic results using Word2Vec values, with each value manually averaged to determine central theme and apply to LDA topics.

**Libraries**

In [1]:
# Python Data Science
import re
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

# Natural Language Processing
import spacy
import pyLDAvis.gensim
from nltk.stem import PorterStemmer

import gensim
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, ldamodel, ldamulticore, CoherenceModel
from gensim.models.word2vec import Word2Vec

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Override deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


## Gensim Word2Vec

**`text8` Dataset**

In [2]:
text8_corpus = api.load('text8')

**Wikipedia English Dataset**

In [None]:
wiki_corpus = api.load('wiki-en')

In [3]:
wv_vecsize = 100

In [4]:
wv_model = Word2Vec(api_corpus, 
                    size=wv_vecsize, 
                    window=10, 
                    min_count=2, 
                    sg=1,
                    workers=4
)

In [5]:
cat = wv_model['cat']

In [6]:
cat

array([ 0.26472232, -0.0142126 , -0.00818526,  0.13113968,  0.18849553,
        0.20811017,  0.01688647, -0.45183334,  0.15992552,  0.29908857,
       -0.13427413,  0.00185853,  0.24724905,  0.00573835, -0.1929899 ,
        0.00212355,  0.12863968,  0.01991918,  0.22128783, -0.46156868,
       -0.19631663, -0.04326332,  0.5155393 , -0.18770577, -0.24224909,
       -0.24819106, -0.21459174, -0.23786674,  0.4772656 ,  0.00635874,
        0.13242227, -0.10513268,  0.2804414 , -0.72400445,  0.28935522,
       -0.06758695,  0.06273188,  0.11229169,  0.21561798,  0.38711905,
       -0.14327276,  0.37157035, -0.21790624, -0.19659127,  0.24513212,
       -0.27070078, -0.10880417,  0.50693566,  0.18778369,  0.28313178,
       -0.04191535, -0.3631849 ,  0.512073  ,  0.08967022,  0.08440157,
       -0.09960833,  0.19115895,  0.28187528,  0.19137958,  0.9573726 ,
        0.6206389 ,  0.49687356,  0.4938762 ,  0.00150871, -0.04593959,
       -0.06677195,  0.01408095,  0.3265416 , -0.8067601 ,  0.02

**Identify Most Similar to New Vector**

In [7]:
wv_model.wv.similar_by_vector(cat)

[('cat', 0.9999998807907104),
 ('kitten', 0.7112815380096436),
 ('meow', 0.6988402605056763),
 ('guppy', 0.6874876618385315),
 ('cats', 0.6847944259643555),
 ('sighthound', 0.6840217113494873),
 ('felis', 0.6832777857780457),
 ('dog', 0.6819274425506592),
 ('mammal', 0.6763498187065125),
 ('wolfhound', 0.6715655326843262)]

**Identify Vector Elements for Term Specificity**

In [8]:
tiger = wv_model['tiger']
cat = wv_model['cat']
mammal = wv_model['mammal']
animal = wv_model['animal']
organism = wv_model['organism']

In [9]:
tiger[0]

0.08551708

In [10]:
testlist = [tiger[0],cat[0],mammal[0],animal[0],organism[0]]
testlist

[0.08551708, 0.26472232, -0.019330576, -0.23395371, -0.6178287]

In [12]:
for i in range(wv_vecsize):
    x_vals = [1,2,3,4,5]
    y_vals = [tiger[i],cat[i],mammal[i],animal[i],organism[i]]
    i_min_dif = None
    min_dif = 999999

    # Check for elements with similarity
    dif = (max(y_vals) - min(y_vals))
    if dif < min_dif:
        i_min_dif = i
        min_dif = dif
    
    # Check for elements with unidirectionality
    if (tiger[i]<cat[i]) & (cat[i]<mammal[i]) & (mammal[i]<animal[i]) & (animal[i]<organism[i]):
        print(i, 'ascending')
        print(y_vals)
    elif (tiger[i]>cat[i]) & (cat[i]>mammal[i]) & (mammal[i]>animal[i]) & (animal[i]>organism[i]):
        print(i, 'descending')
        print(y_vals)
        
print('')
print(i_min_dif)
print(min_dif)
print([tiger[i_min_dif],cat[i_min_dif],mammal[i_min_dif],animal[i_min_dif],organism[i_min_dif]])
                      

8 ascending
[0.106050074, 0.15992552, 0.19117576, 0.6694827, 1.1557617]
47 descending
[0.68303746, 0.50693566, 0.48589846, 0.39318812, -0.018538954]
67 descending
[0.38492838, 0.3265416, 0.2855884, 0.20494707, 0.09016398]
70 ascending
[-0.2317701, -0.0776944, -0.05784192, 0.03954233, 0.27571255]
89 ascending
[0.032903757, 0.115257375, 0.1456704, 0.24947196, 0.377151]

99
0.69529223
[0.52951413, 0.050805602, -0.16577813, -0.00017925011, 0.021101555]


The vector elements pertaining to specificity for the word "tiger" are elements **59** and **64**.

## Topic Label Test

**Test 0**

(0,
  '0.053*"great" + 0.037*"good" + 0.027*"govern" + 0.023*"tax" + 0.019*"right" + 0.017*"pay" + 0.013*"valu" + 0.013*"peopl" + 0.013*"high" + 0.012*"revenu"'),
 

In [50]:
test_0 = ['great','good','govern','tax','right','pay','value','people','high','revenue']

In [51]:
t0_vec_list = [wv_model[term] for term in test_0]

In [66]:
t0_vec_list[0][0]

-0.17579739

In [67]:
t0_mean_vec = [np.mean([t0_vec_list[i][element] for i in range(len(t0_vec_list))]) for element in range(wv_vecsize)]

In [68]:
len(t0_mean_vec)

100

In [78]:
t0_mean_vec[:5]

[-0.4231813, -0.23181407, 0.17925094, 0.2683042, -0.25428733]

**Convert to Numpy Float32 Array**

In [76]:
t0_vec_in = np.array(t0_mean_vec, dtype='float32')

In [77]:
wv_model.wv.similar_by_vector(t0_vec_in)

[('pigovian', 0.8742243051528931),
 ('payers', 0.8705583810806274),
 ('stipends', 0.8670148253440857),
 ('perquisites', 0.8661175966262817),
 ('equalise', 0.8609489798545837),
 ('amortisation', 0.8589038848876953),
 ('externalities', 0.8583351373672485),
 ('irrevocable', 0.8572245836257935),
 ('issuer', 0.8557246923446655),
 ('mortgages', 0.8549227118492126)]

In [83]:
t0_vec_in[64]

0.12283288

In [84]:
t0_vec_in[64] *= -1

In [85]:
t0_vec_in[64]

-0.12283288

In [86]:
wv_model.wv.similar_by_vector(t0_vec_in)

[('payers', 0.8726302981376648),
 ('pigovian', 0.8708720207214355),
 ('perquisites', 0.8632028102874756),
 ('stipends', 0.8630079030990601),
 ('equalise', 0.8586211800575256),
 ('externalities', 0.8577253222465515),
 ('irrevocable', 0.8554208278656006),
 ('moneylending', 0.8549354076385498),
 ('amortisation', 0.852045476436615),
 ('remuneration', 0.851937472820282)]

In [87]:
t0_vec_in[59]

0.020428197

In [88]:
t0_vec_in[59] *= -1

In [90]:
t0_titles = wv_model.wv.similar_by_vector(t0_vec_in)

In [92]:
t0_titles[0][0].title()

'Payers'

**Test 1**

 (1,
  '0.034*"price" + 0.029*"labour" + 0.027*"land" + 0.026*"year" + 0.025*"money" + 0.022*"capit" + 0.017*"work" + 0.016*"silver" + 0.016*"time" + 0.015*"gold"'),
 


In [95]:
test_1 = ['price','labour','land','year','money','capital','work','silver','time','gold']

In [96]:
t1_vec_list = [wv_model[term] for term in test_1]

In [97]:
t1_vec_list[0][0]

-0.59992534

In [98]:
t1_mean_vec = [np.mean([t1_vec_list[i][element] for i in range(len(t1_vec_list))]) for element in range(wv_vecsize)]


In [99]:
len(t1_mean_vec)

100

In [100]:
t1_mean_vec[:5]

[-0.2977167, -0.27249134, 0.1448909, 0.17232029, -0.30534294]

In [101]:
t1_vec_in = np.array(t1_mean_vec, dtype='float32')

In [102]:
wv_model.wv.similar_by_vector(t1_vec_in)

[('overspending', 0.849362850189209),
 ('krugerrands', 0.8462162017822266),
 ('depreciated', 0.8456106185913086),
 ('depreciations', 0.8368104100227356),
 ('shekels', 0.8284364938735962),
 ('greenbacks', 0.8281338214874268),
 ('revaluation', 0.8269621133804321),
 ('dollarization', 0.8243699669837952),
 ('colones', 0.822127103805542),
 ('bullion', 0.8220473527908325)]

In [103]:
t1_vec_in[64]

0.14847198

In [104]:
t1_vec_in[64] *= -1

In [105]:
t1_vec_in[64]

-0.14847198

In [106]:
wv_model.wv.similar_by_vector(t1_vec_in)

[('overspending', 0.841806948184967),
 ('krugerrands', 0.8384612798690796),
 ('depreciated', 0.8359653949737549),
 ('shekels', 0.8287818431854248),
 ('depreciations', 0.8264131546020508),
 ('reinvested', 0.8207769989967346),
 ('surplus', 0.8203780651092529),
 ('saleable', 0.8196335434913635),
 ('dollarization', 0.8184707164764404),
 ('bullion', 0.8171533346176147)]

In [107]:
t1_vec_in[59]

-0.063191585

In [108]:
t1_vec_in[59] *= 2

In [109]:
t1_titles = wv_model.wv.similar_by_vector(t1_vec_in)

In [110]:
t1_titles[0][0].title()

'Overspending'

**Test 2**

 (2,
  '0.028*"natur" + 0.018*"state" + 0.017*"differ" + 0.013*"in" + 0.012*"mean" + 0.010*"certain" + 0.009*"mind" + 0.009*"note" + 0.009*"knowledg" + 0.009*"particular"')

In [111]:
test_2 = ['nature','state','differ','in','mean','certain','mind','note','knowledge','particular']

In [112]:
t2_vec_list = [wv_model[term] for term in test_2]

In [113]:
t2_vec_list[0][0]

0.031927023

In [114]:
t2_mean_vec = [np.mean([t2_vec_list[i][element] for i in range(len(t2_vec_list))]) for element in range(wv_vecsize)]


In [115]:
len(t2_mean_vec)

100

In [116]:
t2_mean_vec[:5]

[-0.27046198, -0.15278067, 0.050503295, 0.11848924, -0.2573365]

In [117]:
t2_vec_in = np.array(t2_mean_vec, dtype='float32')

In [118]:
wv_model.wv.similar_by_vector(t2_vec_in)

[('desirability', 0.8907757997512817),
 ('arbitrariness', 0.8871997594833374),
 ('universalizing', 0.8836926221847534),
 ('unmeasurable', 0.8811575174331665),
 ('contiguity', 0.8760883808135986),
 ('endoxa', 0.8739012479782104),
 ('contributive', 0.8738775849342346),
 ('absolutes', 0.8707197904586792),
 ('construal', 0.8687305450439453),
 ('dichotomies', 0.8683440685272217)]

In [119]:
t2_vec_in[64]

-0.122068

In [120]:
t2_vec_in[64] *= 2

In [121]:
t2_vec_in[64]

-0.244136

In [122]:
wv_model.wv.similar_by_vector(t2_vec_in)

[('desirability', 0.8851734399795532),
 ('arbitrariness', 0.8836332559585571),
 ('universalizing', 0.8809947967529297),
 ('unmeasurable', 0.8789762854576111),
 ('contiguity', 0.8750284910202026),
 ('endoxa', 0.8702093362808228),
 ('contributive', 0.8671209216117859),
 ('envisage', 0.866086483001709),
 ('absolutes', 0.8656156063079834),
 ('construal', 0.8654384613037109)]

In [123]:
t2_vec_in[59]

0.31779212

In [124]:
t2_vec_in[59] *= -1

In [125]:
t2_titles = wv_model.wv.similar_by_vector(t2_vec_in)

In [126]:
t2_titles[0][0].title()

'Arbitrariness'