# Best Practices for Preprocessing Natural Language Data

#### Load dependencies

In [1]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/suraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models.phrases import Phraser, Phrases
from keras.preprocessing.text import one_hot

Using TensorFlow backend.


In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/suraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Load data

In [7]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/suraj/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [8]:
from nltk.corpus import gutenberg

In [10]:
gberg_sents = gutenberg.sents()

#### Preprocess the corpus

#### To lowercase and remove stop words and punctuations

In [11]:
stpwrds = stopwords.words('english') + list(string.punctuation)

In [12]:
[w.lower() for w in gberg_sents[4] if w not in stpwrds]

[u'she',
 u'youngest',
 u'two',
 u'daughters',
 u'affectionate',
 u'indulgent',
 u'father',
 u'consequence',
 u'sister',
 u'marriage',
 u'mistress',
 u'house',
 u'early',
 u'period']

#### Stem Words

In [13]:
stemmer = PorterStemmer()

In [14]:
[stemmer.stem(w.lower()) for w in gberg_sents[4] if w not in stpwrds]

[u'she',
 u'youngest',
 u'two',
 u'daughter',
 u'affection',
 u'indulg',
 u'father',
 u'consequ',
 u'sister',
 u'marriag',
 u'mistress',
 u'hous',
 u'earli',
 u'period']

#### Handle bigrams collections

In [15]:
phrases = Phrases(gberg_sents) # train detector
bigram = Phraser(phrases) # create a more efficient Phraser object for transforming sentences
bigram.phrasegrams # output count and score of each bigram

{('surprised', 'at'): (27, 14.056416752074586),
 ('any', 'rate'): (47, 83.92034351736973),
 ('next', 'week'): (12, 51.500749500333114),
 ('stretch', 'out'): (24, 25.357046669918123),
 ('three', 'minutes'): (10, 10.882367432840969),
 ('Wilt', 'thou'): (30, 76.70737576407082),
 ('Grandfather', 'Frog'): (19, 21644.734999999997),
 ('hundred', 'twenty'): (24, 33.41922501521641),
 ('Am', 'I'): (43, 15.552098209854075),
 ('could', 'possibly'): (21, 30.820114126236575),
 ('m', 'sorry'): (14, 54.9859615498607),
 ('Free', '-'): (11, 19.965165456012915),
 ('walked', 'along'): (8, 11.422274896105895),
 ('offer', 'burnt'): (12, 32.483506670868785),
 ('border', 'went'): (14, 14.558770906465638),
 ('very', 'pleasant'): (19, 11.64578255559322),
 ('two', 'cubits'): (13, 10.241216848415597),
 ('.--', 'Poor'): (10, 33.94933025911286),
 ('Leicester', 'Square'): (8, 5984.71935483871),
 ('She', 'wished'): (10, 10.481853991240559),
 ('They', 'were'): (188, 10.653371553453791),
 ('Now', 'therefore'): (108, 43

#### Preprocess the corpus now

In [16]:
lower_sents = []
for s in gberg_sents:
    lower_sents.append([w.lower() for w in s if w not in list(string.punctuation)])

In [17]:
lower_sents[:5]

[[u'emma', u'by', u'jane', u'austen', u'1816'],
 [u'volume', u'i'],
 [u'chapter', u'i'],
 [u'emma',
  u'woodhouse',
  u'handsome',
  u'clever',
  u'and',
  u'rich',
  u'with',
  u'a',
  u'comfortable',
  u'home',
  u'and',
  u'happy',
  u'disposition',
  u'seemed',
  u'to',
  u'unite',
  u'some',
  u'of',
  u'the',
  u'best',
  u'blessings',
  u'of',
  u'existence',
  u'and',
  u'had',
  u'lived',
  u'nearly',
  u'twenty',
  u'one',
  u'years',
  u'in',
  u'the',
  u'world',
  u'with',
  u'very',
  u'little',
  u'to',
  u'distress',
  u'or',
  u'vex',
  u'her'],
 [u'she',
  u'was',
  u'the',
  u'youngest',
  u'of',
  u'the',
  u'two',
  u'daughters',
  u'of',
  u'a',
  u'most',
  u'affectionate',
  u'indulgent',
  u'father',
  u'and',
  u'had',
  u'in',
  u'consequence',
  u'of',
  u'her',
  u'sister',
  u's',
  u'marriage',
  u'been',
  u'mistress',
  u'of',
  u'his',
  u'house',
  u'from',
  u'a',
  u'very',
  u'early',
  u'period']]

In [18]:
lower_bigram = Phraser(Phrases(lower_sents))

In [19]:
lower_bigram.phrasegrams # miss taylor, mr woodhouse, mr weston

{('barring', 'out'): (17, 69.89360596083867),
 ('surprised', 'at'): (28, 13.305893084575402),
 ('thou', 'liest'): (7, 22.378828229027963),
 ('any', 'rate'): (47, 81.39444780766237),
 ('next', 'week'): (13, 53.87957922858575),
 ('stretch', 'out'): (29, 26.767763985002045),
 ('three', 'minutes'): (10, 10.22196466946896),
 ('cried', 'bull'): (8, 11.628783947413943),
 ('villages', '15'): (11, 23.142807963738598),
 ('hundred', 'twenty'): (24, 31.79527702792406),
 ('could', 'possibly'): (21, 29.277491471292734),
 ('m', 'sorry'): (14, 47.82961492516337),
 ('moby', 'dick'): (84, 6252.967817896389),
 ('long', 'lean'): (9, 17.35673252770108),
 ('walked', 'along'): (8, 10.917841095692655),
 ('offer', 'burnt'): (12, 31.000812490392853),
 ('border', 'went'): (14, 14.133776659944091),
 ('very', 'pleasant'): (20, 11.433252454704187),
 ('asked', 'macian'): (10, 11.95511452553706),
 ('at', 'oxford'): (10, 13.739780902550686),
 ('second', 'time'): (44, 18.990808144462072),
 ('mrs', 'elton'): (142, 115.9

In [20]:
lower_bigram["jon lives in new york city".split()]

[u'jon', u'lives', u'in', u'new_york', u'city']

In [21]:
lower_bigram = Phraser(Phrases(lower_sents, min_count=32, threshold=64))
lower_bigram.phrasegrams

{('afar', 'off'): (52, 108.14220347465505),
 ('burnt', 'offering'): (184, 297.524653753951),
 ('burnt', 'offerings'): (86, 299.15702343127646),
 ('buster', 'bear'): (142, 479.87410772225826),
 ('captain', 'benwick'): (56, 241.49037086312987),
 ('captain', 'wentworth'): (196, 529.8756608388247),
 ('charles', 'hayter'): (33, 92.03437785214481),
 ('chief', 'priests'): (65, 116.31947753846512),
 ('colonel', 'brandon'): (132, 1313.0078125),
 ('couldn', 't'): (89, 171.76138536935215),
 ('cut', 'off'): (217, 129.60290535032792),
 ('dare', 'say'): (115, 89.94000515807346),
 ('de', 'grey'): (77, 603.2109624246722),
 ('didn', 't'): (180, 220.51081560283686),
 ('doesn', 't'): (53, 106.2634985949418),
 ('don', 't'): (830, 250.30957446808512),
 ('dr', 'bull'): (65, 680.7870294599019),
 ('dr', 'middleton'): (40, 162.73103819257668),
 ('drawing', 'room'): (49, 84.91494947493561),
 ('farmer', 'brown'): (100, 386.05179596892236),
 ('father', 'brown'): (207, 91.68277248710235),
 ('few', 'minutes'): (86,

In [22]:
lower_sents[5]

[u'her',
 u'mother',
 u'had',
 u'died',
 u'too',
 u'long',
 u'ago',
 u'for',
 u'her',
 u'to',
 u'have',
 u'more',
 u'than',
 u'an',
 u'indistinct',
 u'remembrance',
 u'of',
 u'her',
 u'caresses',
 u'and',
 u'her',
 u'place',
 u'had',
 u'been',
 u'supplied',
 u'by',
 u'an',
 u'excellent',
 u'woman',
 u'as',
 u'governess',
 u'who',
 u'had',
 u'fallen',
 u'little',
 u'short',
 u'of',
 u'a',
 u'mother',
 u'in',
 u'affection']

In [23]:
clean_sents = []
for s in lower_sents:
    clean_sents.append(lower_bigram[s])

#### Run word2vec

In [25]:
# max_vocab_size can be used instead of min_count (which has increased here)
model = Word2Vec(sentences=clean_sents, size=64, sg=1, window=10, min_count=10, seed=42, workers=4)
model.save('clean_gutenberg_model.w2v')

In [26]:
# skip re-training the model with the next line:  
model = gensim.models.Word2Vec.load('clean_gutenberg_model.w2v')

In [27]:
len(model.wv.vocab) # down from 17k in previous notebook

10329

In [28]:
model.wv['ma_am']

array([-1.83234870e-01,  4.38774861e-02,  4.28177625e-01,  1.14294969e-01,
       -2.02704594e-01,  7.72256553e-01,  1.47584036e-01, -5.01925826e-01,
       -7.51536191e-02,  4.08323228e-01, -5.30121401e-02,  4.30399358e-01,
        8.78836691e-01, -4.27728117e-01,  1.00709230e-01, -1.83450013e-01,
       -6.18932508e-02, -4.25275654e-01, -2.73635715e-01, -4.01608676e-01,
        3.54504138e-01, -4.46737051e-01, -9.89378244e-02, -2.38469064e-01,
       -1.52660474e-01, -4.49618809e-02,  5.39689839e-01, -1.53348725e-02,
       -8.70272424e-03,  1.18837789e-01, -1.68661803e-01, -7.16181099e-02,
        4.82205749e-01, -3.06096911e-01, -2.50804603e-01, -1.33387789e-01,
        2.09662855e-01, -3.16792250e-01, -5.74795961e-01, -6.67049825e-01,
       -1.76017225e-01,  1.18781395e-01,  5.16788602e-01, -3.80379856e-01,
        5.90392649e-02,  6.73091188e-02,  1.94426075e-01,  8.59851897e-01,
        1.49169445e-01, -3.26239109e-01,  3.53278704e-02, -1.87543765e-01,
       -1.71259902e-02, -

In [30]:
model.wv.most_similar('ma_am')

[(u'mamma', 0.831045925617218),
 (u'm_sure', 0.8275237083435059),
 (u'betty', 0.8272860646247864),
 (u'madam', 0.8125379085540771),
 (u'goose', 0.8082011938095093),
 (u"'--", 0.7999484539031982),
 (u'.--"', 0.7955338358879089),
 (u'shan', 0.7953079342842102),
 (u'ay', 0.7943003177642822),
 (u'sophy', 0.7932109832763672)]

#### Reduce word vector dimensionality with t-SNE¶

In [31]:
tsne = TSNE(n_components=2, n_iter=1000, verbose=True)

In [33]:
X_2d = tsne.fit_transform(model.wv[model.wv.vocab])

In [34]:
coords_df = pd.DataFrame(X_2d, columns=['x','y'])
coords_df['token'] = model.wv.vocab.keys()
coords_df.head()

Unnamed: 0,x,y,token
0,-31.321402,-45.234707,yellow
1,-66.711624,8.743932,four
2,-33.889462,-21.462734,woods
3,-52.61087,-14.852501,hanging
4,-25.588411,-31.586494,marching


In [None]:
# coords_df.to_csv('clean_gutenberg_tsne.csv', index=False)

In [None]:
coords_df = pd.read_csv('clean_gutenberg_tsne.csv')
_ = coords_df.plot.scatter('x', 'y', figsize=(12,12), marker='.', s=10, alpha=0.2)

In [35]:
output_notebook()

In [36]:
subset_df = coords_df.sample(n=5000)

In [37]:
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [38]:
show(p)