# Natural Language Processing with Neural Networks

Using Krohn (2020)

In [1]:
# import dependencies
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')
import nltk.stem as stemmer

import string

import gensim
from gensim.models.phrases import Phraser, Phrases
from gensim.models.word2vec import Word2Vec

from sklearn.manifold import TSNE

import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/tessmonks/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /Users/tessmonks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tessmonks/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ImportError: cannot import name 'TypeGuard' from 'typing_extensions' (/Users/tessmonks/opt/anaconda3/lib/python3.7/site-packages/typing_extensions.py)

In [4]:
# get the dataset
from nltk.corpus import gutenberg

In [5]:
# tokenize the corpus into a list of sentences
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [6]:
# tokenize further into sentences as list of lists
gberg_sents = gutenberg.sents()

In [7]:
# convert all to lowercase
[w.lower() for w in gberg_sents[4]]

['she',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [8]:
# remove stop words and punctuation
stpwrds = stopwords.words('english') + list(string.punctuation)
[w.lower() for w in gberg_sents[4] if w.lower() not in stpwrds]

['youngest',
 'two',
 'daughters',
 'affectionate',
 'indulgent',
 'father',
 'consequence',
 'sister',
 'marriage',
 'mistress',
 'house',
 'early',
 'period']

In [9]:
# stemming words using the Porter algorithm built in to NLTK
[stemmer.stem(w.lower()) for w in gberg_sents[4]
if w.lower() not in stpwrds]

AttributeError: module 'nltk.stem' has no attribute 'stem'

In [10]:
#handling n-grams so that something like New York is counted as one word and not two 
phrases = Phrases(gberg_sents)
bigram = Phraser(phrases)

In [12]:
# tokenizing a sentence by using split method
tokenized_sentence = 'John lives in New York City'.split()
print(tokenized_sentence)

['John', 'lives', 'in', 'New', 'York', 'City']


Now that we've gone through some basic preprocessing requests, we can easily preprocess the entire corpus in one step rather than so many tedious steps

In [13]:
lower_sents = []
for s in gberg_sents:
    lower_sents.append([w.lower() for w in s if w.lower() 
                       not in list(string.punctuation)])

In [14]:
# too low of a threshold in the traditional way of scoring bigrams, so we need to cap what is considered a bigram
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold = 64))

In [15]:
# Now we can create a clean corpus that includes bigrams 
clean_sents = []
for s in lower_sents:
    clean_sents.append(lower_bigram[s])

In [16]:
clean_sents

[['emma', 'by', 'jane', 'austen', '1816'],
 ['volume', 'i'],
 ['chapter', 'i'],
 ['emma',
  'woodhouse',
  'handsome',
  'clever',
  'and',
  'rich',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her'],
 ['she',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  'indulgent',
  'father',
  'and',
  'had',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  's',
  'marriage',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period'],
 ['her',
  'mother',
  'had',
  'died',
  'too',
  'long',
  'ago',
  'for',
  'her',
  'to',
  'have',
  'more',
  'than',
  'an',
 

Now that we have a clean corpus of data, we can embed words from the corpus into the word-vector space. word2vec uses predictive models, GloVe uses count models and is good across multiple NLP applications, fastText is good for subword level, so good for rare words and out of vocabulary words

In [21]:
# we will use word2vec here
# sentences is a list of lists as a corpus
# size is the number of dimensions that the word vector space will result
# sg set to 1 to choose the skip-gram architecture or leave at 0 for CBOW (this is a small corpus, so using SG)
# window is good to be 10 for sg and 5 for CBOW, means the context words considered
# iter by default is 5, multiple iters is like multiple epochs
# min_count is the number of times a word must occur in order to fit into the word vector space
# workers is the number of processing cores committed to this task
model = Word2Vec(sentences=clean_sents,
                sg=1, window=10,
                min_count=10, workers=4)

In [22]:
model.save('clean_gutenberg_model.w2v')

In [23]:
# can load up prebuilt vectors instead too
model = gensim.models.Word2Vec.load('clean_gutenberg_model.w2v')

In [24]:
# most similar method checks to make sure the word vectors are quality
model.wv.most_similar('father', topn=3)

[('mother', 0.800160825252533),
 ('brother', 0.690881073474884),
 ('sister', 0.6901630759239197)]

In [25]:
# check which word does not belong
model.wv.doesnt_match("mother father sister brother dog".split())

'dog'

In [26]:
# check familiarity score
model.wv.similarity('father', 'dog')

0.43870568

In [29]:
# compute vfather - vman + vwoman to test the word vector analogies
model.wv.most_similar(positive=['father', 'woman'], negative =['man'])

[('husband', 0.7378063797950745),
 ('sister', 0.7107278108596802),
 ('mother', 0.7068012952804565),
 ('daughter', 0.6761484146118164),
 ('wife', 0.6636595129966736),
 ('tamar', 0.6290584802627563),
 ('sarah', 0.6178107857704163),
 ('daughters', 0.605286717414856),
 ('maid', 0.6042471528053284),
 ('rachel', 0.601134181022644)]

In [28]:
model.wv.most_similar(positive=['husband', 'woman'], negative =['man'])

[('wife', 0.6442449688911438),
 ('sister', 0.6429240107536316),
 ('daughter', 0.6196812987327576),
 ('maid', 0.6077455282211304),
 ('harlot', 0.5861538052558899),
 ('nurse', 0.5856027007102966),
 ('child', 0.5683725476264954),
 ('married', 0.5658974647521973),
 ('mother', 0.5624232292175293),
 ('conceived', 0.561240017414093)]

t-distributed stochastic neighbor embedding (t-SNE) means we can compress vectors to two or three dimension

In [39]:
# n_components is the number of dimensions that should be returned
# n_iter is the number of iterations over the input data, analogous to epochs associated with training neural network

tsne = TSNE(n_components=2, n_iter=1000)
X_2d = tsne.fit_transform(model.wv[model.wv.vocab])
coords_df = pd.DataFrame(X_2d, columns=['x','y'])
coords_df['token']=model.wv.vocab.keys()

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [42]:
coords_df = pd.read_csv('clean_gutenberg_tsne.csv')
coords_df.head()
_ = coords_df.plot.scatter('x','y', figsize=(12,12), marker='.', s=10, alpha=0.2)

FileNotFoundError: [Errno 2] No such file or directory: 'clean_gutenberg_tsne.csv'

In [43]:
# can also have interactive bokeh plot
output_notebook()
subset_df=coords_df.sample(n=5000)
p=figure(plot_width=800, plot_height=800)
_=p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)
show(p)

NameError: name 'output_notebook' is not defined

The area under the curve of the receiver operating characteristic ROC AUC is a metric for assessing performance. It uses the true positive rate and the false positive rate and it lets us evaluate a binary classifer across the full range 0-1 rather than just a threshold of 0.5. An ROC AUC should be as close to 1 as possible and definitely above 0.5. 