In [1]:
from __future__ import absolute_import, division, print_function
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

Using TensorFlow backend.


In [3]:
# Fetch the NLTK data packages for this notebook. "punkt" provides the
# sentence tokenizer that is loaded from 'tokenizers/punkt/english.pickle'
# further down; "stopwords" is downloaded too but is not referenced in the
# visible cells.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/vaibhavgeek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vaibhavgeek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Gather every .txt file under data/ and order the paths lexicographically
# so the books are always processed in the same sequence.
book_filenames = glob.glob("data/*.txt")
book_filenames.sort()

In [5]:
# Echo the matched paths to confirm the glob picked up the expected books.
print(book_filenames)

['data/got1.txt', 'data/got2.txt', 'data/got3.txt', 'data/got4.txt', 'data/got5.txt']


In [6]:
# Read each book as UTF-8 and append its text to one unicode string,
# reporting the running corpus length after every file.
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, mode="r", encoding="utf-8") as book_stream:
        book_text = book_stream.read()
    corpus_raw = corpus_raw + book_text
    corpus_length = len(corpus_raw)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()

Reading 'data/got1.txt'...
Corpus is now 1611540 characters long

Reading 'data/got2.txt'...
Corpus is now 3911922 characters long

Reading 'data/got3.txt'...
Corpus is now 6232286 characters long

Reading 'data/got4.txt'...
Corpus is now 7948826 characters long

Reading 'data/got5.txt'...
Corpus is now 9719485 characters long



In [7]:
# Load the pickled English Punkt tokenizer from the NLTK data directory
# (the "punkt" package fetched by nltk.download above).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


In [8]:
# Split the whole corpus into sentences. Each element is still a raw,
# punctuated string; word-level cleaning happens in sentence_to_wordlist.
raw_sentences = tokenizer.tokenize(corpus_raw)


In [9]:
def sentence_to_wordlist(raw):
    """Break a raw sentence into a list of purely alphabetic tokens.

    Every maximal run of ASCII letters (a-z, A-Z) becomes one token; any
    other character (digits, punctuation, whitespace, non-ASCII symbols such
    as curly quotes) acts as a separator. Case is preserved.

    Args:
        raw: the sentence text to tokenize.

    Returns:
        A list of tokens in order of appearance; empty if the text contains
        no ASCII letters.
    """
    return re.findall("[a-zA-Z]+", raw)

In [10]:
# Tokenize every non-empty raw sentence into its list of words; the result
# is the list-of-token-lists that is later handed to Word2Vec.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [11]:
# Spot-check one raw sentence (index 116 is an arbitrary sample) to see what
# the sentence tokenizer produced before word-level cleaning.
print(raw_sentences[116])

“We’ll see how warm you can dress when the winter comes.” He pulled up his hood and hunched over his garron, silent and sullen.


In [12]:
# Total number of word tokens across all tokenized sentences.
token_count = sum(map(len, sentences))
print("{0:,} NUmber of Tokens ".format(token_count))

1,818,103 NUmber of Tokens 


In [21]:
# Word2Vec hyperparameters; each is forwarded to the w2v.Word2Vec constructor
# in the next cell. Meanings follow the gensim Word2Vec documentation.

# Passed as `size`: dimensionality of the learned word vectors.
num_features = 400
# Passed as `min_count`: words occurring fewer times than this are ignored.
min_word_count = 3
# Passed as `workers`: one training thread per available CPU core.
num_workers = multiprocessing.cpu_count()
# Passed as `window`: maximum distance between a word and its context words.
context_size = 7
# Passed as `sample`: threshold for down-sampling very frequent words.
downsampling = 1e-4
# Passed as `seed`: RNG seed for initialization.
# NOTE(review): with more than one worker, thread scheduling can still make
# runs differ -- confirm whether exact reproducibility is required.
seed = 1


In [22]:
# Create an untrained Word2Vec model configured from the hyperparameter cell.
# No corpus is passed here, so vocabulary building and training happen in
# separate, explicit steps below.
# NOTE(review): `size` is the gensim < 4 keyword; gensim 4 renamed it to
# `vector_size` -- confirm the installed version before upgrading.
thrones2vec = w2v.Word2Vec(
    sg=1,  # 1 selects the skip-gram architecture (per gensim docs)
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [23]:
# Build the model's vocabulary from the tokenized sentences; this must run
# before train() because the model was constructed without a corpus.
thrones2vec.build_vocab(sentences)


In [None]:
# Train on the same sentences used for build_vocab. total_examples is the
# number of sentences per pass and epochs is the number of passes; both are
# given explicitly because the model was created without a corpus.
thrones2vec.train(sentences, total_examples=len(sentences), epochs=7)


In [17]:
# Nearest neighbours of "Stark" by cosine similarity. Query through the
# model's KeyedVectors (`.wv`): the same-named helper on the Word2Vec model
# itself is deprecated and no longer exists in gensim 4.
thrones2vec.wv.most_similar("Stark")

[(u'Eddard', 0.8770244121551514),
 (u'beheaded', 0.7730748653411865),
 (u'Winterfell', 0.7683526277542114),
 (u'executed', 0.7445040941238403),
 (u'Robb', 0.7288854718208313),
 (u'North', 0.7155802249908447),
 (u'Starks', 0.7136485576629639),
 (u'Karstark', 0.7114234566688538),
 (u'Rickard', 0.7087119817733765),
 (u'Lyanna', 0.6973978281021118)]

In [18]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Solve the analogy "start1 : end1 :: ? : end2" and print the answer.

    Queries the module-level ``thrones2vec`` model with
    ``most_similar_cosmul``, using ``end2`` and ``start1`` as positive terms
    and ``end1`` as the negative term, and takes the top-ranked word.

    Args:
        start1: word whose relation to ``end1`` serves as the template.
        end1: word paired with ``start1``.
        end2: word whose counterpart is being looked up.

    Returns:
        The top-ranked word (``start2``) from the similarity query.

    Note:
        Errors raised by the underlying query (presumably a ``KeyError`` for
        a word missing from the vocabulary -- confirm against the installed
        gensim) are not caught here.
    """
    # Go through the model's KeyedVectors (`.wv`): the similarity helpers on
    # the Word2Vec model itself are deprecated and removed in gensim 4.
    similarities = thrones2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    # Results are (word, score) pairs; the first entry is the best match.
    start2 = similarities[0][0]
    # Name the fields explicitly instead of formatting from locals(), so the
    # message does not silently depend on every local variable name.
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2))
    return start2

In [19]:
       nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")


Stark is related to Winterfell, as Tully is related to Riverrun


u'Tully'

In [20]:
# Arya : Nymeria :: ? : dragons
nearest_similarity_cosmul(start1="Arya", end1="Nymeria", end2="dragons")

Arya is related to Nymeria, as Mollander is related to dragons


u'Mollander'