In [1]:
### This should go into __init__()
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline



In [2]:
nltk.download('punkt') # English-language sentence tokenizer (not all periods end sentences; not all sentences start with a capital letter)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/swapnilashtekar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Loading Data

In [3]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/swapnilashtekar/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [4]:
from nltk.corpus import gutenberg

In [5]:
len(gutenberg.fileids()), gutenberg.fileids() 

(18,
 [u'austen-emma.txt',
  u'austen-persuasion.txt',
  u'austen-sense.txt',
  u'bible-kjv.txt',
  u'blake-poems.txt',
  u'bryant-stories.txt',
  u'burgess-busterbrown.txt',
  u'carroll-alice.txt',
  u'chesterton-ball.txt',
  u'chesterton-brown.txt',
  u'chesterton-thursday.txt',
  u'edgeworth-parents.txt',
  u'melville-moby_dick.txt',
  u'milton-paradise.txt',
  u'shakespeare-caesar.txt',
  u'shakespeare-hamlet.txt',
  u'shakespeare-macbeth.txt',
  u'whitman-leaves.txt'])

Tokenizing text

In [6]:
gbergSentTokens = sent_tokenize(gutenberg.raw())

In [7]:
gbergSentTokens[0:5], gbergSentTokens[1]

([u'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
  u"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
  u'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
  u"Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
  u'Between _them_ it was more the intimacy\nof sisters.'],
 u"She was the youngest of the two daughters 

In [9]:
word_tokenize(gbergSentTokens[1])

[u'She',
 u'was',
 u'the',
 u'youngest',
 u'of',
 u'the',
 u'two',
 u'daughters',
 u'of',
 u'a',
 u'most',
 u'affectionate',
 u',',
 u'indulgent',
 u'father',
 u';',
 u'and',
 u'had',
 u',',
 u'in',
 u'consequence',
 u'of',
 u'her',
 u'sister',
 u"'s",
 u'marriage',
 u',',
 u'been',
 u'mistress',
 u'of',
 u'his',
 u'house',
 u'from',
 u'a',
 u'very',
 u'early',
 u'period',
 u'.']

In [10]:
word_tokenize(gbergSentTokens[1])[14]

u'father'

In [11]:
# a convenient method that handles newlines, as well as tokenizing sentences and words in one shot
gbergSents = gutenberg.sents()


In [12]:
gbergSents[0:5]

[[u'[', u'Emma', u'by', u'Jane', u'Austen', u'1816', u']'],
 [u'VOLUME', u'I'],
 [u'CHAPTER', u'I'],
 [u'Emma',
  u'Woodhouse',
  u',',
  u'handsome',
  u',',
  u'clever',
  u',',
  u'and',
  u'rich',
  u',',
  u'with',
  u'a',
  u'comfortable',
  u'home',
  u'and',
  u'happy',
  u'disposition',
  u',',
  u'seemed',
  u'to',
  u'unite',
  u'some',
  u'of',
  u'the',
  u'best',
  u'blessings',
  u'of',
  u'existence',
  u';',
  u'and',
  u'had',
  u'lived',
  u'nearly',
  u'twenty',
  u'-',
  u'one',
  u'years',
  u'in',
  u'the',
  u'world',
  u'with',
  u'very',
  u'little',
  u'to',
  u'distress',
  u'or',
  u'vex',
  u'her',
  u'.'],
 [u'She',
  u'was',
  u'the',
  u'youngest',
  u'of',
  u'the',
  u'two',
  u'daughters',
  u'of',
  u'a',
  u'most',
  u'affectionate',
  u',',
  u'indulgent',
  u'father',
  u';',
  u'and',
  u'had',
  u',',
  u'in',
  u'consequence',
  u'of',
  u'her',
  u'sister',
  u"'",
  u's',
  u'marriage',
  u',',
  u'been',
  u'mistress',
  u'of',
  u'his',
  

In [13]:
gbergSents[4]

[u'She',
 u'was',
 u'the',
 u'youngest',
 u'of',
 u'the',
 u'two',
 u'daughters',
 u'of',
 u'a',
 u'most',
 u'affectionate',
 u',',
 u'indulgent',
 u'father',
 u';',
 u'and',
 u'had',
 u',',
 u'in',
 u'consequence',
 u'of',
 u'her',
 u'sister',
 u"'",
 u's',
 u'marriage',
 u',',
 u'been',
 u'mistress',
 u'of',
 u'his',
 u'house',
 u'from',
 u'a',
 u'very',
 u'early',
 u'period',
 u'.']

In [14]:
gbergSents[4][14]

u'father'

In [16]:
# another convenient method that we don't immediately need: 
gutenberg.words(), len(gutenberg.words())

([u'[', u'Emma', u'by', u'Jane', u'Austen', u'1816', ...], 2621613)

Running Word2Vec

In [18]:
model = Word2Vec(sentences=gbergSents, size=64, sg=1, window=10, min_count=5, seed=42, workers=8)

Saving model for future use

In [19]:
model.save('raw_gutenberg_model.w2v')

In [21]:
#close, but not quite; distinctly in female direction: 
model.most_similar(positive=['father', 'woman'], negative=['man'])

  from ipykernel import kernelapp as app


[(u'sister', 0.8012499809265137),
 (u'mother', 0.7899263501167297),
 (u'wife', 0.766146183013916),
 (u'daughter', 0.7561405897140503),
 (u'husband', 0.7329741716384888),
 (u'Sarah', 0.7202413082122803),
 (u'conceived', 0.716133713722229),
 (u'Sarai', 0.7085055112838745),
 (u'daughters', 0.7066256403923035),
 (u'brother', 0.7046873569488525)]

In [22]:
model.most_similar(positive=['husband', 'woman'], negative=['man'])

  if __name__ == '__main__':


[(u'wife', 0.727226972579956),
 (u'sister', 0.7122801542282104),
 (u'conceived', 0.7090096473693848),
 (u'child', 0.6877676248550415),
 (u'mother', 0.6778206825256348),
 (u'widow', 0.6662559509277344),
 (u'daughter', 0.6532835364341736),
 (u'maid', 0.6359975934028625),
 (u'nurse', 0.631703794002533),
 (u'daughters', 0.6315330862998962)]

Reducing word vector dimensionality with t-SNE

In [24]:
model.wv.vocab

{u'realms': <gensim.models.keyedvectors.Vocab at 0x10c620d50>,
 u'Doeg': <gensim.models.keyedvectors.Vocab at 0x10ede3050>,
 u'both': <gensim.models.keyedvectors.Vocab at 0x10efcb550>,
 u'yellow': <gensim.models.keyedvectors.Vocab at 0x10c529a50>,
 u'four': <gensim.models.keyedvectors.Vocab at 0x10ede30d0>,
 u'woods': <gensim.models.keyedvectors.Vocab at 0x10297bfd0>,
 u'hanging': <gensim.models.keyedvectors.Vocab at 0x10ec36c10>,
 u'conjuring': <gensim.models.keyedvectors.Vocab at 0x10c529ad0>,
 u'woody': <gensim.models.keyedvectors.Vocab at 0x10ec36b10>,
 u'Harriet': <gensim.models.keyedvectors.Vocab at 0x10c3fa690>,
 u'marching': <gensim.models.keyedvectors.Vocab at 0x10c50de50>,
 u'Caes': <gensim.models.keyedvectors.Vocab at 0x10c0c3450>,
 u'Foundation': <gensim.models.keyedvectors.Vocab at 0x10c437c10>,
 u'euery': <gensim.models.keyedvectors.Vocab at 0x10ede3250>,
 u'eligible': <gensim.models.keyedvectors.Vocab at 0x10ef31c90>,
 u'Libnah': <gensim.models.keyedvectors.Vocab at 0x10

In [25]:
len(model.wv.vocab)

17011

In [26]:
X = model[model.wv.vocab]

  if __name__ == '__main__':


In [27]:
tsne = TSNE(n_components=2, n_iter=1000) # 200 is minimum iter; default is 1000

In [28]:
X_2d = tsne.fit_transform(X)

In [29]:
X_2d[0:5]

array([[ 1.9839099 , -0.4439315 ],
       [-3.45972208,  0.25792175],
       [-1.47373042, -1.77787965],
       [ 1.08239664,  4.91757874],
       [ 2.35435429,  5.18763845]])

In [30]:
# create DataFrame for storing results and plotting
coords_df = pd.DataFrame(X_2d, columns=['x','y'])
coords_df['token'] = model.wv.vocab.keys()

In [31]:
coords_df.head()

Unnamed: 0,x,y,token
0,1.98391,-0.443932,realms
1,-3.459722,0.257922,Doeg
2,-1.47373,-1.77788,both
3,1.082397,4.917579,yellow
4,2.354354,5.187638,four


Visualization

In [None]:
coords_df = pd.read_csv('raw_gutenberg_tsne.csv')

In [None]:
_ = coords_df.plot.scatter('x', 'y', figsize=(12,12), marker='.', s=10, alpha=0.2)

In [None]:
output_notebook() # output bokeh plots inline in notebook

Bokeh

In [None]:

subset_df = coords_df.sample(n=5000)

In [None]:
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [None]:
show(p)