# Topic 39+: Deeper NLP

1. Word vectors
    - https://github.com/learn-co-students/nyc-ds-033020-lectures/blob/master/Mod_5/Deep_NLP/nlp_lecture_matt.pdf
    - Word vectors with Gensim
    - Word vectors with SpaCy
2. Topic Modeling

In [None]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA

import gensim.downloader as api
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

### Gensim Documentation

* Pretrained vectors: https://github.com/RaRe-Technologies/gensim-data
* Vector methods: https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format

In [None]:
word_vectors = api.load("glove-wiki-gigaword-100")

## Vector Lookup

In [None]:
word_vectors['caffeine']

## Word similarity 

In [None]:
word_vectors.most_similar('coffee')

In [None]:
word_vectors.most_similar('hilton')

In [None]:
# "king" - "man" + "woman": query the vectors and print the first hit with its score.
result = word_vectors.most_similar(negative=['man'], positive=['woman', 'king'])
print("{}: {:.4f}".format(result[0][0], result[0][1]))

## Analogies

In [None]:
def analogy(x1, x2, y1):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    Queries the notebook-level ``word_vectors`` with ``y1`` and ``x2`` as
    positive terms and ``x1`` as the negative term, and returns the result
    of ``most_similar`` for that query unchanged.
    """
    return word_vectors.most_similar(negative=[x1], positive=[y1, x2])

In [None]:
analogy('japan', 'japanese', 'australia')

In [None]:
analogy('australia', 'beer', 'france')

In [None]:
analogy('obama', 'clinton', 'reagan')

In [None]:
analogy('tall', 'tallest', 'long')

In [None]:
analogy('particular', 'fussy', 'subservient')

## Odd One Out?

In [None]:
word_vectors.doesnt_match("england france germany russia".split())

## Sentences/Documents

In [None]:
sentence = 'I like my coffee hot'

In [None]:
# Collect one vector per word of the sentence.
# The GloVe wiki-gigaword vocabulary is uncased (all lowercase), so the sentence
# is lowercased first; otherwise capitalised tokens such as "I" are silently lost.
# Words that are still absent from the vocabulary are skipped on purpose.
vectors = [word_vectors[w] for w in sentence.lower().split() if w in word_vectors]

In [None]:
sum(vectors)

## Graphical Representation

In [None]:
def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two principal components.

    Args:
        model: Word-vector container that supports ``model[word]`` and exposes
            its vocabulary as ``key_to_index`` (Gensim 4+) or ``vocab``
            (Gensim 3.x).
        words: Words to plot. When ``None`` the words come from the model's
            vocabulary (see ``sample``).
        sample: Only used when ``words`` is ``None``. If positive, that many
            distinct vocabulary words are drawn at random (capped at the
            vocabulary size); otherwise the whole vocabulary is plotted.

    Returns:
        None. The points and labels are drawn on a new 6x6 matplotlib figure.
    """
    # `is None` instead of `== None`: with an array of words `==` compares
    # element-wise and the result cannot be used as an `if` condition.
    if words is None:
        # Gensim 4 replaced `KeyedVectors.vocab` with `key_to_index`;
        # fall back to `vocab` so Gensim 3.x installs keep working.
        vocabulary = getattr(model, 'key_to_index', None)
        if vocabulary is None:
            vocabulary = model.vocab
        vocabulary = list(vocabulary)

        if sample > 0:
            # Draw without replacement so no word is plotted twice.
            words = np.random.choice(vocabulary,
                                     min(sample, len(vocabulary)),
                                     replace=False)
        else:
            words = vocabulary

    # Named `vectors` so the notebook-level `word_vectors` is not shadowed.
    vectors = np.array([model[w] for w in words])

    # Keep only the first two principal components for the 2-D plot.
    twodim = PCA().fit_transform(vectors)[:, :2]

    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x + 0.05, y + 0.05, word)

In [None]:
# Plot a hand-picked set of words (drinks, foods, animals, countries, school terms).
# Every word appears once: a repeated entry would be drawn twice at the same spot.
display_pca_scatterplot(word_vectors, 
                        ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
                         'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
                         'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
                         'frog', 'toad', 'ape', 'kangaroo', 'wombat', 'wolf',
                         'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
                         'homework', 'assignment', 'problem', 'exam', 'test', 'class',
                         'school', 'college', 'university', 'institute'])

## SpaCy

* Available SpaCy libraries: https://spacy.io/usage/models
* Documentation: https://spacy.io/usage/processing-pipelines

In [None]:
import spacy
from tqdm import tqdm
import pandas as pd
tqdm.pandas()

In [None]:
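### Download the pretrained spaCy models used in this notebook (uncomment and run once).
### en_core_web_md is loaded below for its word vectors; en_core_web_sm is loaded by process_words.

# !python3 -m spacy download en_core_web_md
# !python3 -m spacy download en_core_web_sm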

In [None]:
raw = pd.read_csv("resources/nlp_classification.csv")
raw

In [None]:
raw.shape

In [None]:
# Load the medium English pipeline and run it over every entry of the "body"
# column (with a tqdm progress bar), storing the results in a new "spacy" column.
nlp = spacy.load('en_core_web_md')
raw['spacy'] = raw['body'].progress_apply(nlp)

In [None]:
raw

In [None]:
# Each entry in the "spacy" column is now a spaCy Doc; indexing a Doc gives a Token.
first_spacy = raw.spacy[0]
print(type(first_spacy))
print(type(first_spacy[0]))

* https://spacy.io/api/token
* https://spacy.io/api/doc

In [None]:
print(len(first_spacy.vector))
first_spacy.vector

In [None]:
print(len(first_spacy[0].vector))
first_spacy[0].vector

In [None]:
[w.pos_ for w in first_spacy]

In [None]:
df = pd.DataFrame(np.vstack([x.vector for x in raw.spacy]))

In [None]:
df

# Topic Modeling

In [None]:
import gensim

from nltk.corpus import stopwords
import gensim.corpora as corpora

import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [None]:
def process_words(texts, stop_words=stopwords.words("english"), allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Turn raw documents into lists of lemmas for topic modelling.

    Each document is split on whitespace and stripped of stop words, then the
    remaining words are re-joined and run through spaCy. The lemma of every
    token whose ``pos_`` tag is in ``allowed_postags`` is kept, unless the
    lemma itself is a stop word (lemmatization can produce stop words again).

    Args:
        texts: Iterable of document strings.
        stop_words: Iterable of words to discard (exact, case-sensitive match).
        allowed_postags: Iterable of ``token.pos_`` values to keep.

    Returns:
        A list containing one list of lemma strings per input document.
    """
    # Membership is tested for every token, so build sets once instead of
    # scanning lists. This also keeps one-shot iterables usable for all
    # documents, and the default lists are never mutated.
    stop_set = set(stop_words)
    postag_set = set(allowed_postags)

    # NOTE(review): this needs the `en_core_web_sm` model to be installed.
    # The parser and NER components are disabled; only tags and lemmas are read.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    texts_out = []
    for text in texts:
        kept_words = " ".join(word for word in text.split() if word not in stop_set)
        doc = nlp(kept_words)
        texts_out.append([
            token.lemma_
            for token in doc
            # second stop-word check: applied to the lemma, not the surface form
            if token.pos_ in postag_set and token.lemma_ not in stop_set
        ])
    return texts_out

data_ready = process_words(raw.body) 

In [None]:
data_ready

In [None]:
# Dictionary built from the processed documents (token <-> integer id)
id2word = corpora.Dictionary(data_ready)

# Corpus: one doc2bow() result per processed document
corpus = list(map(id2word.doc2bow, data_ready))

# LDA model with 4 topics; random_state is fixed so reruns use the same seed
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

In [None]:
print(lda_model.print_topics())

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis