# Word Embeddings with Word2Vec

## Newcastle University Special Collections

***

**Table of Contents**
* [Training](#train-embeddings)
* [Analysis](#analysis)

***

In [1]:
import emb_utils
import pandas as pd
from pathlib import Path
import os, re, glob, string
import gensim
from gensim.models import Word2Vec
from gensim import utils

In [2]:
# Create a directory to save the models
emb_model_dir = "embedding_models/"
Path(emb_model_dir).mkdir(exist_ok=True, parents=True)

Other files (from `analysis_metadata_nusc.ipynb`) to experiment with creating embeddings with include:

In [3]:
nusc_dir = "data/"
nusc_lower = "nusc_descriptions_lower.txt"  # includes numbers and punctuation attached to (not separating, like periods or commas) a token (i.e., the backslashes in 5/1/1)
nusc_lower_alpha = "nusc_descriptions_lower_alpha.txt"
nusc_lower_alpha_no_stopwords = "nusc_descriptions_lower_alpha_no_stopwords.txt"

## Training

Use [Word2Vec](https://perma.cc/R282-M8UM)* to create custom word embeddings from the Newcastle and Edinburgh datasets.

**Check out [this resource](https://perma.cc/49GV-E236) for an illustrated explanation of Word2Vec.*

First we'll evaluate how different parameter combinations represent the corpus to determine whether to use skip-gram or continuous bag-of-words, and what to set `context_window` and `min_count`, to. We'll stick with 100 for the dimensions of the vectors.

In [None]:
nusc_files = [nusc_lower, nusc_lower_alpha, nusc_lower_alpha_no_stopwords]
file_paths = [nusc_dir+f for f in nusc_files]
params = {
    "file_paths": file_paths, 
    "arch": [0, 1], 
    "min_count": [3, 4, 5], 
    "window": [6, 8, 10],     # Generally ~10 is suitable for skip-gram and ~5 is suitable for CBOW
    "vector_size": [100]      # Default is 100
    }
similar_words1 = ["photograph", "photographs"]
similar_words2 = ["influential", "greatest"]

In [5]:
class MyCorpus:
    def __iter__(self):
            # corpus_path = file_path
            for line in open(corpus_path):
                yield utils.simple_preprocess(line)  # assumes one doc per line, tokens separated by whitespace in each line

In [6]:
architectures = params["arch"]
windows = params["window"]
min_counts = params["min_count"]
vector_sizes = params["vector_size"]
sim_col1 = f"cosine_similarity_{similar_words1[0]}_{similar_words1[1]}"
sim_col2 = f"cosine_similarity_{similar_words2[0]}_{similar_words2[1]}"
df = pd.DataFrame(columns=[
    "file", "architecture", "context_window", "min_freq_count", "vector_size", sim_col1, sim_col2
    ])
for file_path in file_paths:
    corpus_path = file_path
    sentences = MyCorpus()
    for a in architectures:
        for min_count in min_counts:
            for w in windows:
                for vector_size in vector_sizes:
                    # print(file_path, a, min_count, w, vector_size)
                    model = Word2Vec(
                        sentences=sentences, 
                        window=w,
                        min_count=min_count, 
                        workers=3,        # Default is 3
                        epochs=5, 
                        sg=a,
                        vector_size=vector_size
                    )
                    sim1 = model.wv.similarity(similar_words1[0], similar_words1[1])
                    sim2 = model.wv.similarity(similar_words2[0], similar_words2[1])
                    new_row = pd.DataFrame.from_dict({
                        "file":[file_path], "architecture":[a], "context_window":[w], 
                        "min_freq_count":[min_count], "vector_size":[vector_size], 
                        sim_col1 : [sim1], sim_col2: [sim2]
                        })
                    df = pd.concat([df, new_row])

  df = pd.concat([df, new_row])


In [7]:
df.head(2)

Unnamed: 0,file,architecture,context_window,min_freq_count,vector_size,cosine_similarity_photograph_photographs,cosine_similarity_influential_greatest
0,data/nusc_descriptions_lower.txt,0,6,3,100,0.651196,0.861426
0,data/nusc_descriptions_lower.txt,0,8,3,100,0.65297,0.840397


In [8]:
df.to_csv(emb_model_dir+"nusc_word2vec_model_eval1.csv")  #nusc_word2vec_model_eval2.csv

Based on the evaluation results, we'll use the lowercased alphabetic corpus that excludes stop words (`nusc_descriptions_lower_alpha_no_stopwords.txt`) and continuous bag of words (CBOW) for the architecture, as those resulted in the highest cosine similarity scores for the chosen word pairs in both evaluation runs.  A context window of 8 paired with a minimum token frequency count of 3 as well as a window of 6 paired with a min. count of 5 both yield results that are among the highest (top seven) cosine similarity scores.

(These parameters returned a cosine similarity of about 0.70-0.71 for "photograph" and "photographs," and 0.93-0.97 for "influential" and "greatest.")

Since Gensim's Word2Vec documentation recommends a context window of about 5 for CBOW, let's use the latter set of parameters for our word embedding model.

In [11]:
corpus_path = file_paths[2]
print(corpus_path)

data/nusc_descriptions_lower_alpha_no_stopwords.txt


In [12]:
sentences = MyCorpus()
window = 6           
sg = 0
if sg == 1:
    arch = "sg"
else:
    arch = "cbow"
min_count = 5
vector_size = 100

In [13]:
nusc_model = Word2Vec(
    sentences=sentences, 
    window=window,
    min_count=min_count, 
    workers=3,             # Default is 3
    epochs=5, 
    sg=sg,
    vector_size=vector_size
    )
nusc_model.save(emb_model_dir+f"nusc_word2vec_{arch}_{vector_size}d_context{window}.model")

In [14]:
# Look at a sample of the words in the model to make sure it was trained as expected
for index, word in enumerate(nusc_model.wv.index_to_key):
    if index == 5:
        break
    print(f"word #{index}/{len(nusc_model.wv.index_to_key)} is {word}")

word #0/8705 is letter
word #1/8705 is file
word #2/8705 is consists
word #3/8705 is includes
word #4/8705 is concerning


## Analysis

Let's investigate relationships between grammatically and lexically gendered words and the top most common adjectives from the `nusc_uoe_comarison` notebook.

In [None]:
mas = ["man", "men", "boy", "boys"]
fem = ["woman", "women", "girl", "girls"]

In [51]:
similar = []
for w in fem:
    similar += [nusc_model.wv.most_similar(word, topn=10)]
cat = ["feminine"]*len(fem)
df_fem = pd.DataFrame.from_dict({"word":fem, "category":cat, "top10_cosine_similarity":similar})
df_fem = df_fem.explode("top10_cosine_similarity")
df_fem.head(2)

Unnamed: 0,word,category,top10_cosine_similarity
0,woman,feminine,"(permissions, 0.7298351526260376)"
0,woman,feminine,"(memos, 0.7233912944793701)"


In [52]:
similar = []
for w in mas:
    similar += [nusc_model.wv.most_similar(word, topn=10)]
cat = ["masculine"]*len(mas)
df_mas = pd.DataFrame.from_dict({"word":mas, "category":cat, "top10_cosine_similarity":similar})
df_mas = df_mas.explode("top10_cosine_similarity")
df_mas.head(2)
# f = "embedding_models/gender_analysis.csv"

Unnamed: 0,word,category,top10_cosine_similarity
0,man,masculine,"(permissions, 0.7298351526260376)"
0,man,masculine,"(memos, 0.7233912944793701)"


In [53]:
df_gender = pd.concat([df_fem, df_mas])
df_gender = df_gender.sort_values(by="top10_cosine_similarity")
df_gender.tail()

Unnamed: 0,word,category,top10_cosine_similarity
3,boys,masculine,"(publications, 0.6969491243362427)"
0,woman,feminine,"(publications, 0.6969491243362427)"
2,girl,feminine,"(publications, 0.6969491243362427)"
3,girls,feminine,"(publications, 0.6969491243362427)"
2,boy,masculine,"(publications, 0.6969491243362427)"


In [54]:
df_gender.head()

Unnamed: 0,word,category,top10_cosine_similarity
1,women,feminine,"(bids, 0.6911782622337341)"
0,man,masculine,"(bids, 0.6911782622337341)"
2,girl,feminine,"(bids, 0.6911782622337341)"
1,men,masculine,"(bids, 0.6911782622337341)"
2,boy,masculine,"(bids, 0.6911782622337341)"


In [55]:
df_gender.to_csv("embedding_models/gendered_word_cosine_similarity_top10.csv")

In [64]:
# These adjectives were also analyzed using concordances
old_adjs = ["key", "historic", "influential", "notable", "significant", "successful", "major", "distinctive", "remarkable"]
adjs = []
for a in old_adjs:
    if a in nusc_model.wv.key_to_index:
        adjs += [a]
print(adjs)

['key', 'historic', 'influential', 'notable', 'significant', 'successful', 'major', 'distinctive']


In [73]:
adj_col, f_col, f_scores, = [], [], []
for adj in adjs:
    f_scores += [nusc_model.wv.similarity(f, adj) for f in fem]
    adj_col += [adj for f in fem]
    f_col += [f for f in fem]
cat = ["feminine"]*len(f_col)
df_adj_fem = pd.DataFrame.from_dict({"adjective": adj_col, "word": f_col, "cosine_similarity": f_scores, "category": cat})
df_adj_fem.head()

Unnamed: 0,adjective,word,cosine_similarity,category
0,key,woman,0.101171,feminine
1,key,women,-0.06203,feminine
2,key,girl,0.217084,feminine
3,key,girls,0.26465,feminine
4,historic,woman,0.02057,feminine


In [75]:
adj_col, m_col, m_scores, = [], [], []
for adj in adjs:
    m_scores += [nusc_model.wv.similarity(m, adj) for m in mas]
    adj_col += [adj for m in mas]
    m_col += [m for m in mas]
cat = ["masculine"]*len(f_col)
df_adj_mas = pd.DataFrame.from_dict({"adjective": adj_col, "word": m_col, "cosine_similarity": m_scores, "category": cat})
df_adj_mas.head()

Unnamed: 0,adjective,word,cosine_similarity,category
0,key,man,0.055019,masculine
1,key,men,0.054343,masculine
2,key,boy,0.20273,masculine
3,key,boys,0.392058,masculine
4,historic,man,-0.124435,masculine


In [76]:
df_adj = pd.concat([df_adj_fem, df_adj_mas])
df_adj.shape

(64, 4)

In [77]:
df_adj.to_csv("embedding_models/gendered_word_adj_cosine_similarity.csv")