In [6]:
import pickle
import pandas as pd
import numpy as np
from collections import defaultdict
import os

https://nlp.stanford.edu/projects/histwords/ 

http://snap.stanford.edu/historical_embeddings/eng-fiction-all.zip

In [36]:
# Folder containing Stanford HistWords embedding files
embedding_folder = "../embeddings-eng-fiction-19th-century"
output_file = "../19th-century-averaged-embeddings.txt"

# Step 1: Detect all available decades based on matching vocab+embedding pairs
decades = []
for file in os.listdir(embedding_folder):
    if file.endswith("-vocab.pkl"):
        decade = file.split("-")[0]
        npy_file = os.path.join(embedding_folder, f"{decade}-w.npy")
        if os.path.exists(npy_file):
            decades.append(decade)

decades = sorted(decades)
print(f"Found {len(decades)} decades: {decades}")

# Step 2: Load vocab and vectors for each decade
decade_embeddings = {}
vocab_sets = []

for decade in decades:
    vocab_path = os.path.join(embedding_folder, f"{decade}-vocab.pkl")
    emb_path = os.path.join(embedding_folder, f"{decade}-w.npy")
    
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    vectors = np.load(emb_path)

    word_to_vec = {word: vectors[i] for i, word in enumerate(vocab)}
    decade_embeddings[decade] = word_to_vec
    vocab_sets.append(set(vocab))

# Step 3: Get words shared across all decades
common_vocab = set.intersection(*vocab_sets)
print(f"Vocabulary size shared across all decades: {len(common_vocab)}")

# Step 4: Average embeddings
avg_embeddings = {}
for word in common_vocab:
    vecs = [decade_embeddings[decade][word] for decade in decades]
    avg_vec = np.mean(vecs, axis=0)
    avg_embeddings[word] = avg_vec

# Step 5: Save in GloVe-style format
with open(output_file, 'w', encoding='utf-8') as f:
    for word, vec in avg_embeddings.items():
        vec_str = ' '.join(f"{v:.6f}" for v in vec)
        f.write(f"{word} {vec_str}\n")

print(f"Saved averaged embeddings to: {output_file}")

Found 12 decades: ['1800', '1810', '1820', '1830', '1840', '1850', '1860', '1870', '1880', '1890', '1900', '1910']
Vocabulary size shared across all decades: 100000
Saved averaged embeddings to: ../19th-century-averaged-embeddings.txt


### Filter ChiLit words

In [7]:
df_par = pd.read_csv('./data/ChiLit_Paragraphs.csv', encoding = 'utf-8')
df_par = df_par.fillna("")

In [37]:
vocab = set()
for text in df_par['tokens']:
    vocab.update(text.split())

vocab = [word for word in vocab if len(word)>=2 and word.isalpha()]
print(len(vocab))

24441


In [31]:
missing_words = []
avg_embeddings_chilit = {}
for word in vocab:
    if word not in avg_embeddings:
        missing_words.append(word)
    else:
        avg_embeddings_chilit[word] = avg_embeddings[word]

In [32]:
len(missing_words)

4427

In [33]:
len(avg_embeddings_chilit)

20014

In [34]:
missing_words

['painstake',
 'poulticed',
 'vegetarianism',
 'delver',
 'geologize',
 'antipode',
 'congealment',
 'burlesquing',
 'patho',
 'byhood',
 'methink',
 'devill',
 'raither',
 'callowness',
 'unfriend',
 'passon',
 'choppe',
 'waskal',
 'murdher',
 'follie',
 'saucebottle',
 'civilise',
 'turbin',
 'parlaying',
 'impassion',
 'karoo',
 'uncease',
 'broider',
 'chicky',
 'zap',
 'papyri',
 'chirrupy',
 'biddit',
 'birching',
 'sublimate',
 'gyrate',
 'untrammell',
 'moneywort',
 'intrenchment',
 'assuredst',
 'delicatest',
 'benumb',
 'shiningest',
 'anim',
 'hairier',
 'zigzagg',
 'overgrew',
 'seemin',
 'squeakiest',
 'stableyard',
 'unman',
 'watchest',
 'jobation',
 'equalise',
 'immanâla',
 'quadratic',
 'shramm',
 'geniuse',
 'gypsum',
 'schlooping',
 'hinfancy',
 'onnatural',
 'bachelder',
 'inexpediency',
 'battleaxe',
 'plie',
 'teye',
 'upness',
 'breakfasttime',
 'poon',
 'shillelah',
 'wagonload',
 'bely',
 'reconnoitr',
 'accurse',
 'discoloure',
 'tutorial',
 'tipshy',
 'dhk'

In [39]:
output_file = "./data/chilit-19th-century-averaged-embeddings.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    for word, vec in avg_embeddings_chilit.items():
        vec_str = ' '.join(f"{v:.6f}" for v in vec)
        f.write(f"{word} {vec_str}\n")