Navigating to the folder with the embedding data

In [7]:
%cd drive/MyDrive/IndividualProject/

[Errno 2] No such file or directory: 'drive/MyDrive/IndividualProject/'
/content/drive/MyDrive/IndividualProject


In [8]:
import pickle
import numpy as np
import torch
import math
import matplotlib.pyplot as plt
from io import open
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Loading the embedding files

To load the embeddings for another language, replace "en-embs" in the file path with "fr-embs" or "de-embs".

In [9]:
# Load one vocabulary and one embedding matrix per decade; entry i of `words`
# and of `weights` both belong to the year 1800 + 10 * i.
words = []
weights = []

# Single place to switch the embedding folder.
embeddings_dir = "ProjectSoftwareArchive/en-embs/"

for year in range(1800, 2000, 10):
  vocab_filename = str(year) + "-vocab.pkl"
  weights_filename = str(year) + "-w.npy"
  # NOTE(review): pickle.load can execute arbitrary code, so only point this at trusted files.
  # The `with` block closes the file handle, which the bare open() call left open.
  with open(embeddings_dir + vocab_filename, 'rb') as vocab_file:
    words.append(pickle.load(vocab_file))
  weights.append(np.load(embeddings_dir + weights_filename))

# Executing dimensionality reduction on each of the embedding spaces

In [None]:
# Project every embedding space onto its own first two principal components.
# Each space gets an independent PCA fit, so the axes of different spaces are
# not aligned with one another.
resized_embeddings = []

for embeddings in weights:
  # A fixed random_state keeps the projection reproducible across runs when
  # scikit-learn selects its randomized SVD solver.
  pca_2 = PCA(n_components=2, random_state=0)
  resized_embeddings.append(pca_2.fit_transform(embeddings))

# Sketching the representation of a set of words in 2 dimensions

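To change the language, replace "sample_words_en" in the plotting loop at the end of the cell below with either "sample_words_fr" or "sample_words_de" (and load the matching embeddings in the loading cell above).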

In [None]:
def index2year(idx):
  """Return the year for position `idx`: index 0 is 1800 and each step adds 10 years."""
  first_year = 1800
  years_per_step = 10
  return first_year + years_per_step * idx

def word2embedding(word, idx):
  """Return the reduced embedding of `word` in the embedding space at position `idx`.

  The row is located by searching `words[idx]` for `word`, then read from
  `resized_embeddings[idx]`. The `.index` lookup fails if the word is absent
  (ValueError for a list vocabulary), so check membership first.
  """
  space = resized_embeddings[idx]
  row = words[idx].index(word)
  return space[row]

# Words to plot, one list per language; the FR/DE lists translate the EN one (FR keeps both œ/oe spellings).
sample_words_en = ["husband", "king", "queen", "mother", "father", "man", "woman", "egg", "eye",
                   "eat", "fish", "knee", "neck", "nose", "broadcast", "seed", "radio"]
sample_words_fr = ["mari", "roi", "reine", "mère", "père", "homme", "femme", "œuf",
                   "oeuf", "œil", "oeil", "manger", "poisson", "genou", "cou", "nez",
                   "diffuser", "graine", "radio"]
sample_words_de = ["ehemann", "könig", "königin", "mutter", "vater", "mann", "frau",  # mutter/vater = mother/father
                   "ei", "auge", "essen", "fisch", "knie", "hals", "nase",
                   "sendung", "samen", "radio"]

# Draw one labelled 2-D scatter plot per embedding space, showing every sample
# word that the space's vocabulary contains.
for idx in range(len(resized_embeddings)):
  vocab = words[idx]
  # Only words present in this vocabulary can be looked up by word2embedding.
  word_labels = [word for word in sample_words_en if word in vocab]

  # NOTE(review): the loading loop shown in this notebook yields 20 spaces
  # (1800-1990), so idx stops at 19 and the first branch only fires if a 21st
  # space is appended elsewhere -- confirm whether it is still needed.
  if idx == 20:
    space_title = "Usual Word Embeddings"
  else:
    space_title = str(index2year(idx)) + " embeddings"

  if not word_labels:
    # With no rows, np.array([]) is 1-D and unpacking its transpose into
    # xs, ys raises ValueError, so skip this space instead of crashing.
    print("Skipping " + space_title + ": none of the sample words are in its vocabulary")
    continue

  coordinates = np.array([word2embedding(word, idx) for word in word_labels])
  xs, ys = coordinates.T

  fig, ax = plt.subplots()
  ax.scatter(xs, ys)
  ax.set_title(space_title)
  ax.set_xlabel("PCA component 1")
  ax.set_ylabel("PCA component 2")
  # Place each label 10 points above its marker, horizontally centred.
  for lbl, x, y in zip(word_labels, xs, ys):
    ax.annotate(lbl, (x, y), textcoords="offset points",
                xytext=(0, 10), ha='center')
  plt.show()
  # Release the figure so that 20 plots in a row do not accumulate in memory.
  plt.close(fig)
