This Jupyter notebook tests the performance of word2vec on identifying words similar in usage to "servant" from EEBO texts. Spring 2025.

By Kirin Mohile and Jerry Zou

In [11]:
from gensim.models import Word2Vec
import pandas as pd
import nltk
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the stopwords 
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove punctuation, numbers, and stop words using regex
    # \b[a-zA-Z]+\b matches only alphabetic words (removes punctuation and numbers)
    words = [word.lower() for word in re.findall(r'\b[a-zA-Z]+\b', text)]
    return words

In [242]:
df = pd.read_csv(r'C:\Users\kmm212\Documents\EEBOdataALL.csv')

In [243]:
df.head(10)

Unnamed: 0,Labelled Sentences
0,"“His father also was very wealthy, hauing many..."
1,“From which imputation the Lord would free his...
2,“​​The Lord then doth promise to make all his ...
3,“Hee that raised vp those godly men friends ab...
4,And thus the Prophet Zacharie bringing foorth ...
5,“So much the more ignorant we are of knowing w...
6,"Many worthy sonnes and seruants of God, aswell..."
7,“let them taste of thy fauours and loue contin...
8,AS there is areciprocal dutie between the husb...
9,Make the case of your corporall seruants disob...


In [244]:
sentences = df["Labelled Sentences"].values
sentenceVec = [clean_text(sentence) for sentence in sentences]

In [248]:
model = Word2Vec(sentenceVec,min_count=1,vector_size=32)

In [249]:
model.wv.most_similar('seruant')

[('but', 0.599021315574646),
 ('goe', 0.547112762928009),
 ('imply', 0.5244221091270447),
 ('what', 0.5040825009346008),
 ('implied', 0.49454736709594727),
 ('longer', 0.4848790466785431),
 ('held', 0.46419757604599),
 ('willingly', 0.4548797309398651),
 ('a', 0.45085760951042175),
 ('way', 0.44341328740119934)]

In [250]:
def get_most_similar_excluding_stopwords(model, word, topn=10):
    if word not in model.wv.key_to_index:
        return f"Word '{word}' not in vocabulary."
    
    similar_words = model.wv.most_similar(word, topn=topn)
    # Filter out stopwords from the most similar words
    filtered_similar_words = [(w, sim) for w, sim in similar_words if w not in stop_words]
    
    return filtered_similar_words

# Query the most similar words to 'servant', excluding stop words, and get top 20 similar words
similar_words = get_most_similar_excluding_stopwords(model, 'seruant', topn=20)
print(similar_words)

[('goe', 0.547112762928009), ('imply', 0.5244221091270447), ('implied', 0.49454736709594727), ('longer', 0.4848790466785431), ('held', 0.46419757604599), ('willingly', 0.4548797309398651), ('way', 0.44341328740119934), ('liue', 0.43995875120162964), ('meerely', 0.4342024624347687), ('mistresses', 0.41352418065071106), ('doth', 0.40915006399154663), ('sciences', 0.40911513566970825), ('great', 0.3868977129459381), ('example', 0.38597995042800903)]


In [251]:
model.wv['seruant']

array([-0.00960584,  0.00890332,  0.02065947, -0.01186525,  0.00826552,
       -0.03135111,  0.03304799,  0.03098423, -0.01740754,  0.00103027,
        0.00607685,  0.01041439, -0.02622513, -0.00124748,  0.0228003 ,
        0.00550258,  0.00252749, -0.02692831,  0.02543562, -0.00816675,
       -0.0001763 ,  0.0222545 ,  0.01088543,  0.0032712 ,  0.00706901,
       -0.02254255, -0.03993632,  0.03189945,  0.0200764 , -0.02774495,
        0.00968992,  0.0035363 ], dtype=float32)

In [257]:
similarity = model.wv.similarity('seruant', 'god')
print(similarity)

0.061343446


Analyzing sentences with different word2vec vector lengths and window sizes.

In [12]:
allSentencesDF = pd.read_csv("/Users/Jerry/Desktop/BassConnections2024-5/ECBC2024-5/EEBOdataALL.csv")
allSentences = allSentencesDF["Labelled Sentences"].tolist()
print(type(allSentences))
    

<class 'list'>


In [21]:
def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return tokens

tokenizedSentences = [preprocess(sentence) for sentence in allSentences]

In [22]:
embedWithStopwords = Word2Vec(sentences=tokenizedSentences, vector_size=300, window=5, min_count=1, workers=4)


In [None]:
similarWords = embedWithStopwords.wv.most_similar("seruant", topn=10)
print(type(similarWords))
print("Words most similar to 'seruant':", similar_words)

<class 'list'>
Words most similar to 'seruant': [('shew', 0.17914585769176483), ('length', 0.16879016160964966), ('filld', 0.16274015605449677), ('church', 0.1569209098815918), ('schoole', 0.15550550818443298), ('looke', 0.15275436639785767), ('thou', 0.14728382229804993), ('salary', 0.1465924084186554), ('corrupting', 0.14316125214099884), ('commanded', 0.14269207417964935)]


In [34]:
# creating two sets of models. One has 4 versions with different vector lengths. The other has the same vector length (300 dimensions) but with different window sizes.
modelsDifferentVectorSize = {}
for vectorLength in [100, 200, 300, 400]:
    modelsDifferentVectorSize[f'vector_{vectorLength}'] = Word2Vec(
        sentences=tokenizedSentences,
        vector_size=vectorLength,
        window=5,
        min_count=1,
        workers=4
    )


modelsDifferentWindowSize = {}
for windowSize in [4, 6, 8, 10]:
    modelsDifferentWindowSize[f'window_{windowSize}'] = Word2Vec(
        sentences=tokenizedSentences,
        vector_size=300,
        window=windowSize,
        min_count=1,
        workers=4
    )


In [37]:
target_word = "seruant"
simialrWordsStorage = {}
counter = set()
for vectorLength, model in modelsDifferentVectorSize.items():
    print(f"Top words similar to '{target_word}' with vector length={vectorLength}:")
    simialrWordsStorage[vectorLength] = []
    try:
        resultsList = model.wv.most_similar(target_word, topn=10)
        for wordResult in resultsList:
            print(wordResult)
            counter.add(wordResult[0])
            simialrWordsStorage[vectorLength].append(wordResult[0])
    except KeyError:
        print(f"'{target_word}' not found in the vocabulary for vector length={vectorLength}.")

for vectorVersion, wordList in simialrWordsStorage.items():
    print(vectorVersion)
    print(wordList)

print(len(counter))
# target_word = "seruant"
# for windowSize, model in modelsDifferentWindowSize.items():
#     print(f"Top words similar to '{target_word}' with window size={windowSize}:")
#     try:
#         resultsList = model.wv.most_similar(target_word, topn=10)
#         for wordResult in resultsList:
#             print(wordResult)
#     except KeyError:
#         print(f"'{target_word}' not found in the vocabulary for window size={windowSize}.")


Top words similar to 'seruant' with vector length=vector_100:
('iudicially', 0.3639525771141052)
('stipend', 0.22085310518741608)
('sister', 0.21995605528354645)
('presumption', 0.2175326943397522)
('exceedingly', 0.21661525964736938)
('censure', 0.21420413255691528)
('doctor', 0.21195511519908905)
('mispend', 0.2071177065372467)
('secrets', 0.20537249743938446)
('sit', 0.2009040117263794)
Top words similar to 'seruant' with vector length=vector_200:
('liberall', 0.20895951986312866)
('messengers', 0.18673625588417053)
('lot', 0.1794871836900711)
('115', 0.17131057381629944)
('till', 0.16365258395671844)
('haue', 0.16234120726585388)
('respect', 0.15825983881950378)
('grapes', 0.15748943388462067)
('raised', 0.15035907924175262)
('deuout', 0.14684529602527618)
Top words similar to 'seruant' with vector length=vector_300:
('shew', 0.17914585769176483)
('length', 0.16879016160964966)
('filld', 0.16274015605449677)
('church', 0.1569209098815918)
('schoole', 0.15550550818443298)
('looke', 