## Word2vec on game of thrones dataset

In [1]:
import os
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [2]:
story = []

for file_name in os.listdir('dataset/gameOfThrones'):
    file_path = os.path.join('dataset/gameOfThrones', file_name)
    
    with open(file_path,  encoding='latin-1') as file:
        contents = file.read()
        sentences = sent_tokenize(contents)
        for sentence in sentences:
            words = simple_preprocess(sentence)
            words = [word for word in words if word not in stopwords.words('english')]
            story.append(words)

In [3]:
len(story)

145020

In [4]:
story[0]

['george',
 'martin',
 'dance',
 'dragons',
 'book',
 'five',
 'song',
 'ice',
 'fire',
 'dedication',
 'one',
 'fans',
 'lodey',
 'trebla',
 'stego',
 'pod',
 'caress',
 'yags',
 'ray',
 'mr',
 'kate',
 'chataya',
 'mormont',
 'mich',
 'jamie',
 'vanessa',
 'ro',
 'stubby',
 'louise',
 'agravaine',
 'wert',
 'malt',
 'jo',
 'mouse',
 'telisiane',
 'blackfyre',
 'bronn',
 'stone',
 'coyote',
 'daughter',
 'rest',
 'madmen',
 'wild',
 'women',
 'brotherhood',
 'without',
 'banners',
 'website',
 'wizards',
 'elio',
 'linda',
 'lords',
 'westeros',
 'winter',
 'fabio',
 'wic',
 'gibbs',
 'dragonstone',
 'started',
 'men',
 'women',
 'asshai',
 'spain',
 'sang',
 'us',
 'bear',
 'maiden',
 'fair',
 'fabulous',
 'fans',
 'italy',
 'gave',
 'much',
 'wine',
 'readers',
 'finland',
 'germany',
 'brazil',
 'portugal',
 'france',
 'netherlands',
 'distant',
 'lands',
 'waiting',
 'dance',
 'friends',
 'fans',
 'yet',
 'meet',
 'thanks',
 'patience',
 'contents',
 'dedication',
 'cavil',
 'chro

In [5]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    vector_size=300
)

In [6]:
model.build_vocab(story)

In [7]:
print(model.corpus_count)
print(model.epochs)

145020
5


In [8]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(4396871, 4579390)

In [9]:
model.wv.most_similar('daenerys')

[('stormborn', 0.9317414164543152),
 ('unburnt', 0.9066680669784546),
 ('targaryen', 0.8551686406135559),
 ('court', 0.8439016342163086),
 ('rhaella', 0.8395536541938782),
 ('viserys', 0.8382883071899414),
 ('elia', 0.837653398513794),
 ('dorne', 0.8321992754936218),
 ('myrcella', 0.8296521902084351),
 ('beloved', 0.8265693783760071)]

In [10]:
model.wv.doesnt_match(['jon', 'rikon', 'robb', 'arya', 'sansa', 'bran'])

'jon'

In [11]:
model.wv['jon'].shape

(300,)

In [12]:
model.wv.get_normed_vectors().shape

(17310, 300)

In [13]:
y = model.wv.index_to_key
len(y)

17310

In [14]:
y

['said',
 'lord',
 'would',
 'one',
 'ser',
 'could',
 'man',
 'king',
 'men',
 'back',
 'well',
 'like',
 'jon',
 'father',
 'old',
 'hand',
 'even',
 'tyrion',
 'never',
 'know',
 'see',
 'made',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'long',
 'might',
 'us',
 'come',
 'face',
 'still',
 'head',
 'red',
 'way',
 'boy',
 'page',
 'must',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 'little',
 'took',
 'came',
 'though',
 'say',
 'three',
 'away',
 'dead',
 'son',
 'blood',
 'take',
 'go',
 'half',
 'make',
 'arya',
 'saw',
 'day',
 'white',
 'jaime',
 'first',
 'look',
 'want',
 'much',
 'enough',
 'sword',
 'tell',
 'girl',
 'bran',
 'great',
 'looked',
 'left',
 'knew',
 'asked',
 'gave',
 'maester',
 'called',
 'wall',
 'every',
 'heard',
 'sansa',
 'let',
 'yet',
 'went',
 'turned',
 'dany',
 'behind',
 'need',
 'around',
 'woman',
 'another',
 'snow',
 'beneath',
 'across',
 'knight',
 'keep',
 'grace',
 'found',
 'gold',
 'last',
 'cersei',
 'castle',
 '

In [15]:
# Reduce the dimension from 300 to 3 using PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X = pca.fit_transform(model.wv.get_normed_vectors())

In [16]:
X.shape

(17310, 3)

In [18]:
# Draw the relationship of the first 50 words in 3D dimensions
import plotly.express as px
fig = px.scatter_3d(X[:100], x=0, y=1, z=2, color=y[:100])
fig.show()