In [None]:
# Lab 13: Word2Vec with Game of Thrones
This lab explores Word2Vec embeddings using Game of Thrones text data.

Objectives:
- Understand distributional semantics
- Train CBOW and Skip-Gram models
- Explore word similarity, analogy, and visualization

In [2]:
!pip install gensim nltk plotly --quiet

import numpy as np
import pandas as pd
import gensim
import os
import nltk
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

# Download required NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to C:\Users\AL
[nltk_data]     MAKKAH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\AL
[nltk_data]     MAKKAH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [3]:
DATA_PATH = r"C:/Users/AL MAKKAH/Desktop/Artificial intelligence lab/Lab13_Assignment"

story = []
for filename in os.listdir(DATA_PATH):
    if filename.endswith(".txt"):
        file_path = os.path.join(DATA_PATH, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                corpus = f.read()
        except UnicodeDecodeError:
            with open(file_path, "r", encoding="cp1252") as f:
                corpus = f.read()
        for sent in sent_tokenize(corpus):
            story.append(simple_preprocess(sent))

print("Number of sentences:", len(story))
print("Sample:", story[:2])

Number of sentences: 145020
Sample: [['game', 'of', 'thrones', 'book', 'one', 'of', 'song', 'of', 'ice', 'and', 'fire', 'by', 'george', 'martin', 'prologue', 'we', 'should', 'start', 'back', 'gared', 'urged', 'as', 'the', 'woods', 'began', 'to', 'grow', 'dark', 'around', 'them'], ['the', 'wildlings', 'are', 'dead']]


In [4]:
# sg=1 → Skip-Gram
model_skipgram = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    sg=1
)

model_skipgram.build_vocab(story)
model_skipgram.train(story, total_examples=model_skipgram.corpus_count, epochs=model_skipgram.epochs)

(6569451, 8628190)

In [5]:
# sg=0 → CBOW
model_cbow = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    sg=0
)

model_cbow.build_vocab(story)
model_cbow.train(story, total_examples=model_cbow.corpus_count, epochs=model_cbow.epochs)

(6568703, 8628190)

In [6]:
print(model_skipgram.wv.most_similar('daenerys'))
print(model_skipgram.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran']))
print(model_skipgram.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion']))
print("Vector for 'king':", model_skipgram.wv['king'])
print("Similarity Arya vs Sansa:", model_skipgram.wv.similarity('arya','sansa'))
print("Similarity Tywin vs Sansa:", model_skipgram.wv.similarity('tywin','sansa'))

[('stormborn', 0.8164581060409546), ('khaleesi', 0.7637990713119507), ('unburnt', 0.7509299516677856), ('targaryen', 0.7417117357254028), ('kneel', 0.7155182957649231), ('dragons', 0.7055129408836365), ('khaleen', 0.6951469779014587), ('dosh', 0.691207230091095), ('dany', 0.6748249530792236), ('undying', 0.6713566184043884)]
jon
bronn
Vector for 'king': [ 0.22312668 -0.19526942  0.2889272   0.27672654  0.0893752   0.16197677
  0.35197693  0.47172466 -0.3977285   0.3188624  -0.0992123  -0.39246553
  0.30144337  0.13863242 -0.06383487 -0.49485096 -0.24974239  0.06844958
 -0.307186   -0.09262989 -0.2333644   0.1248661   0.5173552  -0.655453
 -0.45033014  0.08659264 -0.14802504 -0.22410189 -0.25168437 -0.09285128
 -0.17524858 -0.01489216  0.2728658  -0.10870969 -0.01117838  0.03562184
 -0.36498144 -0.1277479  -0.15487489 -0.2818414   0.08132784 -0.35039067
  0.23407145  0.04196651  0.09226379 -0.00407417  0.20803142 -0.35804012
  0.56385267 -0.22551897 -0.38104966 -0.24528556 -0.25945836  

In [7]:
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.io as pio

pca = PCA(n_components=3)
X = pca.fit_transform(model_skipgram.wv.get_normed_vectors())
y = model_skipgram.wv.index_to_key

pio.renderers.default = "browser"
df = pd.DataFrame(X[200:300], columns=["x", "y", "z"])
df["label"] = y[200:300]
fig = px.scatter_3d(df, x="x", y="y", z="z", color="label")
fig.show()