In [1]:
import codecs
import glob
import multiprocessing
import os
import re

In [2]:
import nltk
import gensim.models.word2vec as w2v
import numpy as np

In [3]:
# Download the NLTK data packages used by this notebook.
# "punkt" is needed for the "tokenizers/punkt/english.pickle" resource loaded further down.
# NOTE(review): "stopwords" is downloaded here, but no cell visible in this notebook reads it — confirm it is still needed.

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/abhishek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Collect every plain-text book under data/ and put the paths in
# lexicographic order so the books are always processed in the same sequence.

book_filenames = glob.glob("data/*.txt")
book_filenames.sort()

In [5]:
# Show the sorted list of matched book files as the cell output.
book_filenames

['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

In [7]:
# Read every book as UTF-8 and merge the texts into one corpus string,
# reporting the cumulative character count after each book is added.

book_texts = []
corpus_length = 0

for book in book_filenames:
    with codecs.open(book, "r", "utf-8") as book_file:
        book_texts.append(book_file.read())
    # Running total equals the length the merged corpus has at this point.
    corpus_length += len(book_texts[-1])
    print("Corpus length {0}".format(corpus_length))

# Single join at the end instead of growing the string on every iteration.
corpus = u"".join(book_texts)

Corpus length 1770659
Corpus length 4071041
Corpus length 6391405
Corpus length 8107945
Corpus length 9719485


In [8]:
# split into sentences
# Loads the English Punkt tokenizer resource through NLTK's data loader.
# NOTE(review): this resource is a pickle; newer NLTK releases reportedly ship "punkt_tab" instead — confirm against the installed NLTK version.

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [9]:
# split using tokenizer
# Runs the loaded tokenizer over the whole corpus; the result is an indexable
# sequence of sentence strings (raw_sentences[10] is printed in the next cell).

raw_sentences = tokenizer.tokenize(corpus)

In [10]:
# Spot-check a single tokenized sentence before cleaning.
print(raw_sentences[10])

It was here the ravens came, after long flight.


In [11]:
# remove non letters and split into words

def sentence_to_wordlist(raw):
    """Break a raw sentence into a list of purely alphabetic words.

    Every character outside ``a-z``/``A-Z`` (digits, punctuation, whitespace,
    non-ASCII letters) acts as a separator, so ``"don't"`` yields
    ``["don", "t"]``. Case is preserved.

    Args:
        raw: The sentence text to clean.

    Returns:
        The words in their original order; an empty list when the sentence
        contains no ASCII letters.
    """
    # Splitting on each non-letter leaves empty strings between adjacent
    # separators; drop those so only real words remain.
    pieces = re.split("[^a-zA-Z]", raw)
    return [piece for piece in pieces if piece]

In [12]:
# Turn each non-empty raw sentence into its list of cleaned words.
# Order is preserved, so positions line up with the raw sentences kept.

sentences = [
    sentence_to_wordlist(raw)
    for raw in raw_sentences
    if len(raw) > 0
]

In [13]:
# Same index as the raw sentence printed earlier, now as a cleaned word list.
sentences[10]

['It', 'was', 'here', 'the', 'ravens', 'came', 'after', 'long', 'flight']

In [14]:
# Total number of word tokens across all cleaned sentences.
token_count = sum(map(len, sentences))
token_count

1818103

In [18]:
# Configure the (still untrained) Word2Vec model.
# Hyperparameters are gathered in one mapping so they can be read and tuned in
# one place; the meanings noted below follow the gensim Word2Vec API docs.

w2v_params = dict(
    sg=1,                                 # training algorithm: 1 selects skip-gram
    seed=1,                               # seed for the random number generator
    workers=multiprocessing.cpu_count(),  # one worker thread per reported CPU
    size=300,                             # dimensionality of the word vectors
    min_count=3,                          # ignore words seen fewer than 3 times
    window=7,                             # max distance between target and context word
    sample=1e-3,                          # downsampling threshold for frequent words
)

# NOTE(review): gensim documents that a fixed seed is only fully reproducible
# with a single worker thread — confirm whether run-to-run determinism matters here.
word2vec = w2v.Word2Vec(**w2v_params)

In [19]:
# build vocab
# Builds the model's vocabulary from the tokenized sentences; the next cell
# reports how many entries it ends up with.

word2vec.build_vocab(sentences)

In [20]:
# Number of entries in the model's vocabulary.
# NOTE(review): `wv.vocab` was reportedly removed in gensim 4.x in favour of `wv.key_to_index` — confirm the pinned gensim version.
len(word2vec.wv.vocab)

17277

In [26]:
# train model
# Trains the model on the tokenized sentences; the integer shown as this
# cell's output is the value returned by train().
# NOTE(review): newer gensim releases reportedly require explicit
# `total_examples` and `epochs` arguments; this bare call assumes an older
# release — confirm the pinned gensim version.

word2vec.train(sentences)

7021134

In [27]:
# Sanity check: list the words whose vectors are most similar to "Dany", with scores.
word2vec.wv.most_similar("Dany")

[('Jorah', 0.5570077896118164),
 ('Doreah', 0.5511056184768677),
 ('Khal', 0.5330901145935059),
 ('Kraznys', 0.5310728549957275),
 ('handmaids', 0.527921736240387),
 ('Drogo', 0.5277822017669678),
 ('Jhogo', 0.5204181671142578),
 ('Rakharo', 0.5188958644866943),
 ('Naharis', 0.5162404775619507),
 ('Duur', 0.5159556269645691)]

In [30]:
# Analogy-style query: words close to "king" and "woman" while far from "man".
word2vec.wv.most_similar_cosmul(positive=["king","woman"],negative=["man"])

[('princess', 0.8006671071052551),
 ('queen', 0.7967936396598816),
 ('Dalla', 0.7949143648147583),
 ('seamstress', 0.7635628581047058),
 ('Alayaya', 0.7592761516571045),
 ('kneeling', 0.7547597289085388),
 ('pregnant', 0.7514938116073608),
 ('goddess', 0.7475717067718506),
 ('shares', 0.7472423911094666),
 ('womanhood', 0.7460522055625916)]