# Capstone: Text Factorizing with NLP
## Thomas Ludlow

# 06 - Model Interface

This notebook contains the functions that process input text with the same EDA/preprocessing steps used during model training. This interface will be used to optimize the LDA and FFRNN models.

**Libraries**

In [15]:
# Python Data Science
import re
import time
import numpy as np
import pandas as pd

# Natural Language Processing
import spacy
import gensim
import pyLDAvis.gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, ldamodel, ldamulticore, CoherenceModel
from nltk.stem import PorterStemmer

# Modeling Prep
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

# Neural Net
import keras
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Override deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

**Preprocessing Function**

In [51]:
def preprocess(text_list, sw=['the','a','but','like','for'], to_stem=False):
    """Lemmatize and filter raw texts into one token list per text.

    Each text is run through the spaCy `en_core_web_md` pipeline.  A token's
    lemma is kept only if it is not the '-PRON-' placeholder, the token is not
    tagged as punctuation, is purely alphabetic, is not a spaCy stop word, and
    the lemma is not listed in `sw`.

    Parameters
    ----------
    text_list : iterable of str
        Raw texts; each element is processed as one document.
    sw : iterable of str or None, optional
        Extra stop words, compared case-sensitively against each lemma.
        The default list is only read, never modified.  None means no
        extra stop words.
    to_stem : bool, optional
        If True, each kept lemma is stemmed with NLTK's PorterStemmer.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    texts = list(text_list)
    if not texts:
        # Nothing to process: avoid the expensive spaCy model load entirely.
        return []

    # A set gives constant-time membership tests instead of scanning a list
    # for every token.
    extra_stop = set(sw) if sw is not None else set()
    stemmer = PorterStemmer() if to_stem else None

    # Load the spaCy medium English pipeline.
    # NOTE(review): this reloads the model on every call; load it once at
    # notebook level if preprocess() is called repeatedly.
    nlp = spacy.load('en_core_web_md')

    processed = []
    for text in texts:
        kept = []
        for token in nlp(text):
            lemma = token.lemma_
            if (lemma == '-PRON-'            # Pronoun placeholder lemma is excluded
                    or token.pos_ == 'PUNCT'     # Punctuation is excluded
                    or not token.is_alpha        # Numbers / non-alphabetic tokens are excluded
                    or token.is_stop             # spaCy stop words are excluded
                    or lemma in extra_stop):     # Caller-supplied stop words are excluded
                continue
            kept.append(stemmer.stem(lemma) if to_stem else lemma)
        processed.append(kept)

    return processed

**Load Bumper Stickers from Text Data**

In [4]:
# Load the per-text metadata table; its 'Bumper Sticker' and 'Author' columns
# are used by the cells below.
text_data = pd.read_csv('./data/text_data.csv')
# Preview the first rows
text_data.head()

Unnamed: 0,Title,Author,Filename,Start Key,End Key,Category,Bumper Sticker,Original Language,Country,Year,Year Val,Wiki Link,Wiki Text
0,Book of the Dead: The Papyrus of Ani,Ani,ani_papyrus.txt,THE PAPYRUS OF ANI,***END***,Polytheism,Magic spells will assist the dead in journey t...,Heiroglyphic,Egypt,2400-1250 BC,-1250,https://en.wikipedia.org/wiki/Book_of_the_Dead,The Book of the Dead is an ancient Egyptian fu...
1,The Categories,Aristotle,aristotle_categories.txt,*** START OF THIS PROJECT GUTENBERG EBOOK THE ...,End of the Project Gutenberg EBook of The Cate...,Hylomorphism,Being is a compound of matter and form,Greek,Greece,~335 BC,-335,https://en.wikipedia.org/wiki/Categories_(Aris...,The Categories (Greek Κατηγορίαι Katēgoriai; L...
2,The Poetics,Aristotle,aristotle_poetics.txt,ARISTOTLE ON THE ART OF POETRY,End of the Project Gutenberg EBook of The Poet...,Dramatic and Literary Theory,"Dramatic works imitate but vary in music, char...",Greek,Greece,335 BC,-335,https://en.wikipedia.org/wiki/Poetics_(Aristotle),Aristotle's Poetics (Greek: Περὶ ποιητικῆς; La...
3,The Gospel,"Buddha, Siddhartha Guatama",buddha_gospel.txt,500 BC,***END***,Buddhism,Human suffering and the cycle of death and reb...,English,India,~500 BC,-500,https://en.wikipedia.org/wiki/The_Gospel_of_Bu...,The Gospel of Buddha was an 1894 book by Paul ...
4,The Word,"Buddha, Siddhartha Guatama",buddha_word.txt,"BUDDHA, THE WORD",THE END,Buddhism,Four noble truths are understood by the enligh...,English,India,~500 BC,-500,https://en.wikipedia.org/wiki/Noble_Eightfold_...,The Noble Eightfold Path (Pali: ariyo aṭṭhaṅgi...


**Define Stop Words**
 - Roman Numerals
 - Additional stop words

In [5]:
# Custom stop words passed to preprocess(): lower-case Roman numerals i-xxii
# followed by the same five extra words preprocess() uses as its default.
sw = ['i','ii','iii','iv','v','vi','vii','viii',
      'ix','x','xi','xii','xiii','xiv','xv','xvi',
      'xvii','xviii','xix','xx','xxi','xxii',
      'the','a','but','like','for']

**Preprocess Bumper Stickers**

In [7]:
# Tokenize every bumper sticker with the custom stop-word list.
# NOTE(review): the saved output of bs_vecs[:5] shows stemmed tokens
# ('afterlif', 'dramat') and common stop words ('and', 'of'), which the
# preprocess() defined above (to_stem=False, spaCy stop words removed) would
# not produce -- this cell was presumably run with an earlier version of the
# function; re-run from a fresh kernel to confirm.
bs_vecs = preprocess(text_data['Bumper Sticker'].tolist(), sw)

In [8]:
# Number of token lists returned (preprocess yields one per input text)
len(bs_vecs)

45

In [9]:
# Inspect the token lists of the first five bumper stickers
bs_vecs[:5]

[['magic',
  'spell',
  'will',
  'assist',
  'dead',
  'in',
  'journey',
  'through',
  'underworld',
  'and',
  'into',
  'afterlif'],
 ['be', 'be', 'compound', 'of', 'matter', 'and', 'form'],
 ['dramat',
  'work',
  'imit',
  'vari',
  'in',
  'music',
  'charact',
  'good',
  'and',
  'narr',
  'present'],
 ['human',
  'suffer',
  'and',
  'cycl',
  'of',
  'death',
  'and',
  'rebirth',
  'can',
  'be',
  'overcom',
  'with',
  'nirvana',
  'through',
  'enlighten'],
 ['four',
  'nobl',
  'truth',
  'be',
  'understand',
  'by',
  'enlighten',
  'where',
  'nirvana',
  'can',
  'be',
  'achiev',
  'through',
  'follow',
  'eightfold',
  'path']]

## Get LDA Topic Values for Input Text

**Load LDA Dictionary and Model**

In [10]:
# Gensim dictionary saved during training; used to convert tokens to
# bag-of-words vectors.
g_dict = Dictionary.load('./models/g_dict')
# Load the topic model through LdaModel.load rather than Dictionary.load:
# Dictionary.load only unpickles the object, whereas LdaModel.load also
# restores the parts of an LDA model that gensim stores separately
# (the model state and id2word mapping).
lda_model = ldamodel.LdaModel.load('./models/lda_model')

**LDA Values Function**

In [11]:
def lda_values_df(pp_vectors, g_dict=g_dict, lda_model=lda_model):
    """Return the LDA topic probabilities of each document as a DataFrame.

    Parameters
    ----------
    pp_vectors : iterable of list of str
        Preprocessed token lists, one per document.
    g_dict : gensim Dictionary, optional
        Dictionary used to convert tokens to bag-of-words vectors.
    lda_model : gensim LDA model, optional
        Model queried with `get_document_topics`.

    Returns
    -------
    pandas.DataFrame
        Float frame with one row per document (index 0..n-1) and one column
        per topic (0..num_topics-1).  Topics the model does not report for a
        document are 0.0.
    """
    docs = list(pp_vectors)
    n_topics = lda_model.num_topics

    # Pre-allocating a numeric matrix guarantees a row for every document and
    # a float dtype, and avoids growing the frame one cell at a time.
    topic_matrix = np.zeros((len(docs), n_topics), dtype=float)

    for row, tokens in enumerate(docs):
        bow = g_dict.doc2bow(tokens)
        # minimum_probability=0 requests every topic; with the default
        # threshold, low-probability topics are discarded and would
        # otherwise surface as missing values in the feature matrix.
        for topic, proba in lda_model.get_document_topics(bow, minimum_probability=0):
            topic_matrix[row, topic] = proba

    return pd.DataFrame(topic_matrix, columns=list(range(n_topics)))

In [17]:
# Topic probabilities for each preprocessed bumper sticker
lda_df = lda_values_df(bs_vecs)

In [18]:
# Preview the topic values of the first documents (one column per topic)
lda_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.076627,0.142051,0.156698,0.11229,0.221816,0.179438,0.0819595,0.0291192
1,0.0851372,0.0407786,0.300815,0.0784416,0.264091,0.102458,0.0439741,0.084305
2,0.118225,0.070841,0.392167,0.0695351,0.142963,0.0896406,0.0442353,0.0723928
3,0.140233,0.0309666,0.111486,0.11209,0.374928,0.097772,0.0636337,0.0688906
4,0.173145,0.0319661,0.152119,0.0984919,0.394654,0.0510391,0.0314807,0.0671046


## Factorize Topic Values for Input Text

**Load Scaler and FFRNN**

In [16]:
# Load the fitted scaler whose .transform() is applied to the topic values
# before prediction (presumably a StandardScaler -- confirm against training).
# NOTE(review): joblib.load unpickles the file, so only load trusted model
# files.  `sklearn.externals.joblib` was removed in newer scikit-learn
# releases; a plain `import joblib` is the replacement there.
ss = joblib.load('./models/ss_full')

In [14]:
# Load the trained Keras network used by nlp_factorize()
model_full = load_model('./models/model_full')

**Factorizing Function**

In [19]:
def nlp_factorize(lda_df, model=model_full, ss=ss):
    """Scale LDA topic values and return the model's predicted probabilities.

    Parameters
    ----------
    lda_df : pandas.DataFrame
        Topic values, one row per document (as built by `lda_values_df`).
    model : optional
        Fitted model exposing `predict_proba` or, failing that, `predict`.
    ss : optional
        Fitted scaler exposing `transform`.

    Returns
    -------
    The model's predictions for the scaled topic values, one row per
    document.
    """
    lda_sc = ss.transform(lda_df.values)

    # `predict_proba` was removed from Keras Sequential models in newer
    # releases, where `predict` returns the same probabilities.  Prefer
    # `predict_proba` when it exists so older Keras versions and
    # scikit-learn-style models keep their current behaviour.
    predict_fn = getattr(model, 'predict_proba', None)
    if predict_fn is None:
        predict_fn = model.predict

    return predict_fn(lda_sc)

In [29]:
# Predicted probabilities for every bumper sticker
preds = nlp_factorize(lda_df)



In [36]:
# Number of values predicted for the first document
# (presumably one per author class -- confirm against the training labels)
len(preds[0])

33

## Function `display_nlp_factors`

In [49]:
def display_nlp_factors(preds, raw_text, doc_number=0, thresh=0.01, target_authors=None, labels=None):
    """Print the authors whose predicted score for one document meets `thresh`.

    Parameters
    ----------
    preds : sequence of sequences of float
        Predicted scores; `preds[doc_number][i]` is the score for `labels[i]`.
    raw_text : sequence of str
        Original texts; `raw_text[doc_number]` is printed.
    doc_number : int, optional
        Index of the document to display.
    thresh : float, optional
        Only scores greater than or equal to this value are printed.
    target_authors : sequence of str, optional
        If given (and non-empty), `target_authors[doc_number]` is printed as
        the known author of the document.
    labels : sequence of str, optional
        Author name for each prediction column.  Defaults to the unique
        authors of the global `text_data`, in order of first appearance.

    Returns
    -------
    None.  Output is written with `print`, highest score first.
    """
    if labels is None:
        # NOTE(review): this assumes the model's output columns follow the
        # order in which authors first appear in text_data -- confirm that it
        # matches the label encoding used at training time.
        labels = text_data.Author.unique()  # computed once instead of per use

    print('-'*80)
    print('Doc #:', doc_number)
    if target_authors: print('Target Author:', target_authors[doc_number])
    print('Text:\n', raw_text[doc_number])
    print('\nPhilosophical Factors:\n')

    doc_scores = preds[doc_number]
    # Pair each label with its score and rank from highest to lowest; the
    # sort is stable, so ties keep label order.
    ranked = sorted(((labels[i], doc_scores[i]) for i in range(len(labels))),
                    key=lambda pair: pair[1], reverse=True)

    for name, score in ranked:
        if score >= thresh:
            # Pad names to a 35-character column so the scores line up
            padding = ' ' * (35 - len(name))
            print('\t{}{}{}'.format(name, padding, str(round(score, 3))))
    print('-'*80)

In [50]:
# Convert the columns to lists once; they do not change between iterations.
bs_texts = text_data['Bumper Sticker'].tolist()
bs_authors = text_data['Author'].tolist()

# Print, for every document, the authors scoring at least 0.05
for k in range(len(preds)):
    display_nlp_factors(preds, bs_texts, doc_number=k, target_authors=bs_authors, thresh=.05)

--------------------------------------------------------------------------------
Doc #: 0
Target Author: Ani
Text:
 Magic spells will assist the dead in journey through the underworld and into the afterlife

Philosophical Factors:

	Muhammad                           0.265
	Nietzsche, Friedrich               0.238
	Moses                              0.181
	Matthew, Mark, Luke, John          0.15
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Doc #: 1
Target Author: Aristotle
Text:
 Being is a compound of matter and form

Philosophical Factors:

	Plato                              0.37
	Nietzsche, Friedrich               0.127
	Sun Tzu                            0.096
	Hobbes, Thomas                     0.083
	Muhammad                           0.061
	Spinoza, Baruch                    0.057
--------------------------------------------------------------------------------
--