In [6]:
from __future__ import absolute_import, division, print_function
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [7]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [8]:
# NOTE(review): %pylab populates the interactive namespace with numpy and
# matplotlib names (see the message printed by this cell); numpy and pyplot are
# already imported explicitly as np / plt in the import cell.
%pylab inline



Populating the interactive namespace from numpy and matplotlib


In [9]:
# Emit timestamped INFO-level log records in the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


In [10]:
# Fetch the Punkt resource that the sentence tokenizer is loaded from below.
nltk.download("punkt")
# NOTE(review): no visible cell uses the stop-word lists — confirm this download is still needed.
nltk.download("stopwords")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/vaibhavgeek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vaibhavgeek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Collect the plain-text book files and put them in lexicographic order.
book_filenames = glob.glob("data/*.txt")
book_filenames.sort()
book_filenames

['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

In [12]:
# Read every book as UTF-8 text, reporting the running corpus size after each
# file, then join the pieces into one unicode string (no separator is inserted
# between books).
corpus_chunks = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    corpus_chunks.append(book_text)
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(corpus_chunks)

Reading 'data/got1.txt'...
Corpus is now 1611540 characters long

Reading 'data/got2.txt'...
Corpus is now 3911922 characters long

Reading 'data/got3.txt'...
Corpus is now 6232286 characters long

Reading 'data/got4.txt'...
Corpus is now 7948826 characters long

Reading 'data/got5.txt'...
Corpus is now 9719485 characters long



In [13]:
# Load the English Punkt sentence tokenizer from the nltk_data resources.
# NOTE(review): the resource is a pickle file — only load it from a trusted nltk_data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


In [14]:
# Split the whole corpus into sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)


In [15]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of purely alphabetic tokens.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space before
    splitting, so punctuation, digits and non-ASCII letters act as separators
    and never appear in the result. Case is preserved.

    Args:
        raw: The sentence text to tokenise.

    Returns:
        A list of word strings; empty when ``raw`` contains no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return letters_only.split()

In [16]:
# Tokenise every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [17]:
# Spot-check one raw sentence from the tokenizer output.
print(raw_sentences[5])


For information address: Bantam Books.


In [18]:
# Total number of word tokens across all tokenised sentences.
token_count = sum(len(sentence_tokens) for sentence_tokens in sentences)
print("{0:,} ".format(token_count))

1,818,103 


In [None]:
# Show the CPU count that is used as the worker count below.
print(multiprocessing.cpu_count())


In [19]:
# Hyperparameters for the word2vec model.
# NOTE(review): meanings are inferred from the names and the training log —
# confirm against the Word2Vec constructor call.
num_features = 300  # presumably the embedding dimensionality ("300 features" in the training log)
min_word_count = 3  # presumably the minimum frequency for a word to enter the vocabulary
num_workers = multiprocessing.cpu_count()  # CPU count reported by multiprocessing
context_size = 7  # presumably the context window ("window=7" in the training log)


In [20]:
# Presumably the frequent-word subsampling threshold ("sample=0.001" in the training log) — confirm.
downsampling = 1e-3


In [21]:
# Fixed seed value; presumably fed to the model's random number generator — confirm.
seed = 1


In [23]:
# Create the model before building its vocabulary: no other visible cell
# defines `thrones2vec`, so a fresh kernel would otherwise fail with NameError.
# The settings mirror the hyperparameter cells above and the recorded training
# log (sg=1, window=7, sample=0.001, 300 features).
# NOTE(review): `size=` is the pre-4.0 gensim keyword, consistent with the
# `wv.vocab` / `wv.syn0` attributes used elsewhere in this notebook.
thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)
thrones2vec.build_vocab(sentences)


2017-09-06 15:22:28,674 : INFO : collecting all words and their counts
2017-09-06 15:22:28,676 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-06 15:22:28,756 : INFO : PROGRESS: at sentence #10000, processed 148061 words, keeping 9326 word types
2017-09-06 15:22:28,811 : INFO : PROGRESS: at sentence #20000, processed 302454 words, keeping 13009 word types
2017-09-06 15:22:28,856 : INFO : PROGRESS: at sentence #30000, processed 440999 words, keeping 15692 word types
2017-09-06 15:22:28,899 : INFO : PROGRESS: at sentence #40000, processed 573069 words, keeping 17519 word types
2017-09-06 15:22:28,941 : INFO : PROGRESS: at sentence #50000, processed 700737 words, keeping 18835 word types
2017-09-06 15:22:28,994 : INFO : PROGRESS: at sentence #60000, processed 849333 words, keeping 21530 word types
2017-09-06 15:22:29,049 : INFO : PROGRESS: at sentence #70000, processed 994857 words, keeping 22882 word types
2017-09-06 15:22:29,100 : INFO : PROGRESS: at s

In [24]:
# Number of entries in the model vocabulary after build_vocab.
len(thrones2vec.wv.vocab)

17277

In [25]:
# Same vocabulary count as the previous cell, printed with a label.
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

Word2Vec vocabulary length: 17277


In [26]:
# Run one training pass over the corpus. total_examples is derived from the
# corpus itself instead of a hard-coded count, so it stays correct if the
# input books or the sentence filtering change.
thrones2vec.train(sentences, total_examples=len(sentences), epochs=1)


2017-09-06 15:22:52,251 : INFO : training model with 4 workers on 17277 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=7
2017-09-06 15:22:53,405 : INFO : PROGRESS: at 10.99% examples, 146710 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:22:54,437 : INFO : PROGRESS: at 22.14% examples, 151770 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:22:55,550 : INFO : PROGRESS: at 35.01% examples, 152140 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:22:56,583 : INFO : PROGRESS: at 46.55% examples, 153305 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:22:57,637 : INFO : PROGRESS: at 57.85% examples, 153571 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:22:58,655 : INFO : PROGRESS: at 70.02% examples, 155649 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:22:59,656 : INFO : PROGRESS: at 82.34% examples, 157480 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:23:00,759 : INFO : PROGRESS: at 94.47% examples, 157081 words/s, in_qsize 8, out_qsize 0
2017-09-06 15:23:01,037 : 

1405068

In [27]:
# Number of tokenised sentences in the training corpus.
len(sentences)

128867

In [None]:
# Query neighbours through the model's word vectors (`wv`), matching the other
# `thrones2vec.wv` accesses in this notebook; the model-level method is deprecated.
thrones2vec.wv.most_similar("direwolf")



In [None]:
# Query neighbours through the model's word vectors (`wv`), matching the other
# `thrones2vec.wv` accesses in this notebook; the model-level method is deprecated.
# NOTE(review): tokens are case-sensitive (no lower-casing during tokenisation),
# so "stark" and "Stark" are distinct vocabulary entries.
thrones2vec.wv.most_similar("stark")


In [None]:
# Query neighbours through the model's word vectors (`wv`), matching the other
# `thrones2vec.wv` accesses in this notebook; the model-level method is deprecated.
thrones2vec.wv.most_similar("Winterfell")

In [None]:
# Query neighbours through the model's word vectors (`wv`), matching the other
# `thrones2vec.wv` accesses in this notebook; the model-level method is deprecated.
thrones2vec.wv.most_similar("Stark")

In [None]:
# Query neighbours through the model's word vectors (`wv`), matching the other
# `thrones2vec.wv` accesses in this notebook; the model-level method is deprecated.
thrones2vec.wv.most_similar("King")

In [None]:
# Query neighbours through the model's word vectors (`wv`), matching the other
# `thrones2vec.wv` accesses in this notebook; the model-level method is deprecated.
thrones2vec.wv.most_similar("Dragons")

In [None]:
# Make sure the output directory for the saved model exists.
trained_dir_missing = not os.path.exists("trained")
if trained_dir_missing:
    os.makedirs("trained")

In [None]:
# Persist the trained model to trained/thrones2vec.w2v (the directory is created by the previous cell).
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))


In [None]:
# t-SNE reducer configured for 2 output components with a fixed random_state.
# NOTE(review): an identical assignment appears again in a later cell.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)


In [None]:
# Embedding matrix held by the trained model's word vectors.
# NOTE(review): presumably one row per vocabulary word; `syn0` is a legacy
# gensim attribute name — confirm it exists in the installed version.
all_word_vectors_matrix = thrones2vec.wv.syn0


In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the trained word vectors with ``most_similar_cosmul`` using
    ``end2`` and ``start1`` as positive terms and ``end1`` as the negative
    term, prints the resulting analogy and returns the top-ranked word.

    Args:
        start1: Word paired with ``end1`` in the known relation.
        end1: Word that ``start1`` is related to.
        end2: Word whose counterpart is being looked up.

    Returns:
        The first (highest-ranked) word returned by the similarity query.
    """
    # Go through `wv`, like the other vector accesses in this notebook, rather
    # than the deprecated model-level method.
    similarities = thrones2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1],
    )
    start2 = similarities[0][0]
    # Pass the fields explicitly instead of **locals(), so that renaming or
    # adding locals cannot silently change the message.
    print(
        "{start1} is related to {end1}, as {start2} is related to {end2}".format(
            start1=start1, end1=end1, start2=start2, end2=end2
        )
    )
    return start2

In [None]:
       nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")
                                                                                                                                                                                                                                                                                                                                                                                 

In [None]:
# Which word relates to Jaime the way Tyrion relates to wine?
nearest_similarity_cosmul("Tyrion", "wine", "Jaime")


In [None]:
# Which word relates to dragons the way Arya relates to Nymeria?
nearest_similarity_cosmul("Arya", "Nymeria", "dragons")

In [28]:
# NOTE(review): duplicate of the earlier `tsne` assignment; this cell carries an
# execution count (28) while the cells before it were never run — confirm which
# copy should stay.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
