#Word2Vec

In [None]:
# https://drive.google.com/file/d/1lbtAwzE7l0otXYFDtGUKKWzI83bD5D5H/view?usp=drive_link

In [1]:
# Download Game of Thrones books from Kaggle
import kagglehub

# Download latest version
path = kagglehub.dataset_download("khulasasndh/game-of-thrones-books")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/khulasasndh/game-of-thrones-books?dataset_version_number=1...


100%|██████████| 3.71M/3.71M [00:00<00:00, 61.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/khulasasndh/game-of-thrones-books/versions/1


In [2]:
import numpy as np
import pandas as pd
import gensim # Is a versatile Python package commonly used for natural language processing (NLP) tasks, such as topic modeling, text similarity analysis, and document indexing.
import os

In [3]:
# Let's update gensim
!pip install --upgrade gensim --user



In [37]:
from nltk import sent_tokenize
from time import time  # To time our operations
from gensim.utils import simple_preprocess # It converts text to lowercase, eliminates punctuation, and splits text into individual words.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
for filename in os.listdir(f"{path}"):
    print(filename)

005ssb.txt
001ssb.txt
003ssb.txt
004ssb.txt
002ssb.txt


There are 5 books available. Sadly, files 005 and 004 have some weird encoding, so we will leave them out.

In [6]:
story = []
for filename in ["001ssb.txt", "002ssb.txt", "003ssb.txt"]:
    if filename == '.ipynb_checkpoints':
      pass
    print(filename)
    with open(f"{path}/{filename}", "r") as f:
        corpus = f.read().replace("\n", "")
    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

001ssb.txt
002ssb.txt
003ssb.txt


In [26]:
story[:5]

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait']]

In [7]:
len(story)

94734

In [8]:
story[0]

['game',
 'of',
 'thrones',
 'book',
 'one',
 'of',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'by',
 'george',
 'martin',
 'prologue',
 'we',
 'should',
 'start',
 'back',
 'gared',
 'urged',
 'as',
 'the',
 'woods',
 'began',
 'to',
 'grow',
 'dark',
 'around',
 'them']

 `Word2Vec` uses the **Continuous Bag-of-Words (CBOW)** or **Skip-gram** model, which are neural network architectures. Word embeddings, generated by `Word2Vec` Gensim, are dense vector representations of words that encode semantic and syntactic information.

 `Word2Vec` is a widely used algorithm based on neural networks, commonly referred to as “**deep learning**” (though word2vec itself is rather shallow). Using large amounts of unannotated plain text, word2vec learns relationships between words automatically. The output are vectors, one vector per word, with remarkable linear relationships that allow us to do things like:
```
vec(“king”) - vec(“man”) + vec(“woman”) =~ vec(“queen”)

vec(“Montreal Canadiens”) – vec(“Montreal”) + vec(“Toronto”) =~ vec(“Toronto Maple Leafs”).
```
Word2vec is very useful in automatic text tagging, recommender systems and machine translation.

In [39]:
# Initialize a Word2Vec model with specified hyperparameters
model = gensim.models.Word2Vec(
    window=10,  # The maximum distance between the current and predicted word within a sentence. Words within this window are considered as context.
    min_count=2  # Ignores all words that appear less than `min_count` times in the dataset, reducing noise.
)

In [46]:
# let's try the default values
model = gensim.models.Word2Vec(
    window=5,  # The maximum distance between the current and predicted word within a sentence. Words within this window are considered as context.
    min_count=5  # Ignores all words that appear less than `min_count` times in the dataset, reducing noise.
)

In [47]:
# Build vocabulary from the provided corpus (list of tokenized sentences)
t = time()
model.build_vocab(story, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.01 mins


In [48]:
model.epochs

5

In [49]:
# Train the Word2Vec model using the given corpus
t = time()
model.train(story, total_examples=model.corpus_count, epochs=20, report_delay=1) # you could use `epochs = model.epochs`
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.5 mins


In [50]:
# Retrieve the most similar words to 'daenerys' based on cosine similarity
model.wv.most_similar('daenerys')

[('stormborn', 0.7103463411331177),
 ('targaryen', 0.6712801456451416),
 ('rhaegar', 0.6466792225837708),
 ('unburnt', 0.6303451657295227),
 ('viserys', 0.6259397268295288),
 ('myrcella', 0.5618194341659546),
 ('elia', 0.5505236387252808),
 ('drogo', 0.544474184513092),
 ('aegon', 0.5281243920326233),
 ('illyrio', 0.5219383835792542)]

In [51]:
# Compute the cosine similarity between 'arya' and 'sansa'
model.wv.similarity('arya','sansa')

0.7331195

In [45]:
# Get the shape of the word vector for the word 'deep'. It appears as all vectors have the same shape
model.wv['sansa'].shape

(100,)

In [19]:
# Retrieve normalized word vectors (i.e., unit vectors for each word)
vec = model.wv.get_normed_vectors()

In [20]:
vec

array([[-0.06604158,  0.04604711,  0.01600297, ..., -0.11948179,
        -0.06598475,  0.02449968],
       [-0.12688668,  0.27850735,  0.13095976, ...,  0.01626877,
        -0.01831332, -0.1656104 ],
       [-0.0084589 , -0.00468547, -0.02021996, ...,  0.00380767,
         0.14752606, -0.06917705],
       ...,
       [-0.14824516,  0.08083956, -0.1247558 , ...,  0.00837042,
        -0.00738801,  0.03141685],
       [-0.19505522,  0.10683527, -0.01477464, ..., -0.0251078 ,
         0.00836608, -0.09943503],
       [ 0.01064054,  0.115258  , -0.09687652, ..., -0.09502752,
        -0.06967658, -0.16586386]], dtype=float32)

In [21]:
model.wv.get_normed_vectors().shape

(13774, 100)

In [22]:
# Get the list of words in the vocabulary, indexed by frequency
y = model.wv.index_to_key

In [23]:
len(y)

13774

In [27]:
y[200:250]

['last',
 'seemed',
 'while',
 'woman',
 'feet',
 'hard',
 'hands',
 'mother',
 'done',
 'winterfell',
 'nothing',
 'ever',
 'hair',
 'hear',
 'voice',
 'another',
 'beneath',
 'stone',
 'many',
 'those',
 'water',
 'high',
 'between',
 'under',
 'until',
 'snow',
 'gone',
 'stood',
 'wine',
 'find',
 'small',
 'put',
 'sister',
 'give',
 'dark',
 'grey',
 'cersei',
 'end',
 'right',
 'place',
 'watch',
 'name',
 'grace',
 'fingers',
 'sam',
 'seen',
 'get',
 'beside',
 'cloak',
 'door']

In [28]:
from sklearn.decomposition import PCA

In [29]:
pca = PCA(n_components=3)

In [30]:

X = pca.fit_transform(model.wv.get_normed_vectors())

In [31]:
X

array([[-0.15094817, -0.44532454, -0.40168142],
       [-0.24812233, -0.22499356, -0.25647002],
       [ 0.38997477, -0.1372887 , -0.5963321 ],
       ...,
       [ 0.34013367,  0.18639071, -0.3547238 ],
       [ 0.3854748 , -0.1241499 ,  0.19776344],
       [ 0.02786247, -0.43707842, -0.00766999]], dtype=float32)

In [32]:
X.shape

(13774, 3)

In [35]:
import plotly.express as px
fig = px.scatter_3d(X[400:500],x=0,y=1,z=2, color=y[400:500])
fig.show()