In [1]:
import os
from gensim.utils import simple_preprocess

# List to store sentences
all_sentences = []

# Loop through files named '001ssb.txt' to '005ssb.txt'
for i in range(1, 6):
    # Create the filename
    filename = f'dataset/{i:03d}ssb.txt'
    
    # Open the file and read the contents with error handling
    try:
        with open(filename, 'r', encoding='utf-8', errors='replace') as file:
            # Read the file content
            content = file.read()
            
            # Split the content into sentences (can be further customized as needed)
            sentences = content.split('.')  # Assuming sentences are separated by periods
            
            # Tokenize each sentence using simple_preprocess
            tokenized_sentences = [simple_preprocess(sentence) for sentence in sentences if sentence]
            
            # Add the tokenized sentences to the list
            all_sentences.extend(tokenized_sentences)
    
    except UnicodeDecodeError as e:
        print(f"Error reading file {filename}: {e}")

# Output the sentences for verification
print(len(all_sentences))


146453


In [2]:
for sentence in all_sentences[:3]:
    print(sentence)

['game', 'of', 'thrones', 'book', 'one', 'of', 'song', 'of', 'ice', 'and', 'fire', 'by', 'george']
[]
['martin', 'prologue', 'we', 'should', 'start', 'back', 'gared', 'urged', 'as', 'the', 'woods', 'began', 'to', 'grow', 'dark', 'around', 'them']


In [3]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Remove stopwords from tokenized sentences
cleaned_sentences = [[word for word in sentence if word not in STOPWORDS] for sentence in all_sentences]
len(cleaned_sentences)

146453

In [7]:
for sentence in cleaned_sentences[:3]:
    print(sentence)

['game', 'thrones', 'book', 'song', 'ice', 'george']
[]
['martin', 'prologue', 'start', 'gared', 'urged', 'woods', 'began', 'grow', 'dark']


In [8]:
cleaned_sentences = [
    ['iron_throne' if word == 'iron' and idx + 1 < len(sentence) and sentence[idx + 1] == 'throne' 
     else word for idx, word in enumerate(sentence)] 
    for sentence in cleaned_sentences
]
len(cleaned_sentences)

146453


In [13]:
from gensim.models import Word2Vec

# Training the Word2Vec model using CBOW
model_cbow = Word2Vec(sentences=cleaned_sentences, vector_size=100, window=2, min_count=1, sg=0)

# Training the Word2Vec model using Skip-Gram
model_skipgram = Word2Vec(sentences=cleaned_sentences, vector_size=100, window=2, min_count=1, sg=1)

In [12]:
model_cbow.save("word2vec_cbow.model")
model_skipgram.save("word2vec_skipgram.model")


NameError: name 'model_cbow' is not defined

In [19]:
from gensim.models import Word2Vec

# Loading the CBOW model
model_cbow = Word2Vec.load("word2vec_cbow.model")

# Loading the Skip-Gram model
model_skipgram = Word2Vec.load("word2vec_skipgram.model")

In [27]:
# Get the vector for a word from the CBOW model
vector_iron_throne_cbow = model_cbow.wv['iron_throne']

# Get the vector for a word from the Skip-Gram model
vector_iron_throne_skipgram = model_skipgram.wv['iron_throne']


In [21]:
model_cbow.wv['jon']

array([ 7.5620943e-01, -2.4885155e-01,  1.1386960e+00,  3.7612301e-01,
       -2.6545791e-02, -1.4147528e+00,  8.2844299e-01, -4.3518519e-01,
       -5.9370720e-01, -9.5779550e-01,  2.4015382e-01, -6.1506528e-01,
        9.2488825e-03,  1.1070847e-01, -4.3834284e-01, -4.4569808e-01,
       -3.6657352e-02, -2.8070518e-01, -9.3288469e-01, -1.0664799e+00,
        1.7675240e-01,  7.1801430e-01,  5.5871081e-01, -1.3113753e+00,
       -4.6027428e-01,  9.0649945e-01, -8.9804769e-01,  7.7424628e-01,
       -8.0078834e-01,  6.5633863e-01, -2.1794173e-01,  2.5627020e-01,
        5.9267098e-01, -1.7264159e+00, -8.7621361e-02,  1.1811959e-01,
       -4.0051216e-01,  3.7502930e-01, -4.6692187e-01,  1.3635179e-01,
        8.6818302e-01, -2.6004326e-01, -5.4952836e-01,  2.5428715e-01,
       -7.0807226e-02, -1.3207475e+00, -7.0047897e-01, -2.3358734e-01,
        8.0037642e-01,  8.6103618e-01, -3.6515000e-01, -6.9662023e-01,
        1.0320263e+00, -8.9662910e-01, -4.1205952e-01,  1.3783135e+00,
      

In [22]:
vector_king_cbow

array([-3.3909363e-01, -2.2002006e-01,  4.7506067e-01,  7.7871668e-01,
        5.3651970e-02, -5.2936649e-01,  1.5886328e+00,  2.0001726e-01,
       -1.2297843e+00,  9.4697887e-01, -1.5030047e-01, -9.2730635e-01,
        3.9230552e-02,  4.4592194e-02,  3.9217886e-02, -1.3617882e+00,
        8.2519382e-02, -4.6770964e-02, -6.2201005e-01, -1.3790336e+00,
        8.3843723e-02, -9.9067819e-01,  1.0594131e+00,  2.1284914e-01,
       -7.4863583e-02,  6.1514056e-01, -6.4188844e-01, -6.2276477e-01,
       -1.7749134e-01,  2.2741379e-01, -6.9773555e-02, -5.3160846e-01,
        6.9203397e-04,  7.8685653e-01,  1.2047927e+00,  1.0464323e+00,
       -3.2411852e-01,  6.0192398e-03, -1.3213059e+00, -1.3241209e+00,
        5.8795843e-02, -1.0450821e+00, -1.1407553e+00,  2.7123210e-01,
        4.2258427e-01, -6.0977143e-01, -3.1508815e-01,  4.4198966e-01,
        8.4960675e-01,  6.7589730e-01,  2.7892965e-01, -5.4389244e-01,
       -9.2235573e-02,  6.5729505e-01, -1.1411744e+00, -3.4988809e-01,
      

In [26]:
# Find most similar words to 'king' in the CBOW model
similar_words_cbow = model_cbow.wv.most_similar(positive=[vector_king_cbow], topn=10)

# Find most similar words to 'king' in the Skip-Gram model
similar_words_skipgram = model_skipgram.wv.most_similar(positive=[vector_king_skipgram], topn=10)

# Print the similar words along with their similarity scores
#print("Similar words to 'iron throne' in CBOW model:", similar_words_cbow)
#print("Similar words to 'iron throne' in Skip-Gram model:", similar_words_skipgram)

In [96]:
similarity_score = model_cbow.wv.similarity('iron_throne', 'stark')
similarity_score

0.381342

In [53]:
import pandas as pd

characters = pd.read_csv('dataset/character-deaths.csv')[["Name"]]
characters.head()

Unnamed: 0,Name
0,Addam Marbrand
1,Aegon Frey (Jinglebell)
2,Aegon Targaryen
3,Adrack Humble
4,Aemon Costayne


In [89]:
characters['Name'].iloc[0].split(" ")[1].lower()

'marbrand'

In [91]:
# Function to calculate similarity considering both first and last name
def compute_similarity(name):
    name_parts = name.split(" ")
    first_name = name_parts[0].lower()
    last_name = name_parts[1].lower() if len(name_parts) > 1 else None
    
    similarities = []
    
    # Calculate similarity for the first name
    if first_name in model_cbow.wv:
        similarities.append(model_cbow.wv.similarity('iron_throne', first_name))
    
    # Calculate similarity for the last name (if it exists)
    if last_name and last_name in model_cbow.wv:
        similarities.append(model_cbow.wv.similarity('iron_throne', last_name))
    
    # Return the average similarity if both names exist, otherwise just one similarity
    if similarities:
        return sum(similarities) / len(similarities)
    else:
        return None  # Return None if no name part is in the vocabulary

# Function to calculate similarity considering both first and last name
def get_last_name(name):
    name_parts = name.split(" ")
    last_name = name_parts[1].lower() if len(name_parts) > 1 else None
    return last_name
        
# Apply the function to calculate similarity for each character
characters['similarity'] = characters['Name'].apply(compute_similarity)

# Apply the function to calculate similarity for each character
characters['house'] = characters['Name'].apply(get_last_name)

# Print the resulting DataFrame
print(characters.head())


                      Name  similarity      house
0           Addam Marbrand    0.464045   marbrand
1  Aegon Frey (Jinglebell)    0.613120       frey
2          Aegon Targaryen    0.744675  targaryen
3            Adrack Humble    0.561879     humble
4           Aemon Costayne    0.492332   costayne


In [82]:
characters[:4]

Unnamed: 0,Name,similarity
0,Addam Marbrand,0.464045
1,Aegon Frey (Jinglebell),0.61312
2,Aegon Targaryen,0.744675
3,Adrack Humble,0.561879


In [92]:
# Sort the DataFrame by 'similarity' column in descending order
characters_sorted = characters.sort_values(by='similarity', ascending=False)

# Print the sorted DataFrame

characters_sorted


Unnamed: 0,Name,similarity,house
2,Aegon Targaryen,0.744675,targaryen
689,Renly Baratheon,0.717126,baratheon
702,Robert Baratheon,0.716023,baratheon
411,Joffrey Baratheon,0.712842,baratheon
911,Zei,0.707267,
...,...,...,...
379,Jack-Be-Lucky,,
405,Jommy,,
805,Three-Tooth,,
881,Will,,


In [93]:
characters_sorted.to_csv('sorted.csv')