In [1]:
from __future__ import print_function

import numpy as np
import os
import argparse
from collections import OrderedDict
import cPickle as pickle

def loadYear(year, vector_dir='trained_vectors_400'):
  """Load the word vectors stored for a single year.

  Reads ``<vector_dir>/vectors-<year>-ngram.txt``. Every non-empty line is
  split on whitespace: the first field is the token, the remaining fields
  are parsed as floats and form its vector.

  Args:
    year: value substituted into the file name (e.g. 1908).
    vector_dir: directory holding the vector files. Defaults to
      'trained_vectors_400'; 'trained_vectors' is the alternative location
      this notebook previously kept as a commented-out path.

  Returns:
    OrderedDict mapping token -> list of floats, in file order. A token that
    appears more than once keeps its first position and its last vector.

  Raises:
    IOError: if the file cannot be opened.
    ValueError: if a vector component is not a valid float.
  """
  filename = '{}/vectors-{}-ngram.txt'.format(vector_dir, year)
  mapping = OrderedDict()
  with open(filename, 'r') as f:
    for line in f:
      row = line.split()
      if not row:
        # Blank / whitespace-only line (e.g. a trailing newline): nothing to
        # parse. Indexing row[0] here used to raise IndexError.
        continue
      mapping[row[0]] = [float(x) for x in row[1:]]
  return mapping


Load the pre-trained vectors

In [2]:
# Candidate years 1900..2000. Only the years whose vector file can be opened
# end up in `models`; the rest are skipped.
years = [1900 + x for x in range(101)]

models = OrderedDict()
for year in years:
    try:
        models[year] = loadYear(year)
    except IOError:
        # No readable vector file for this year: skip it. Any other failure
        # (malformed file, interrupt, ...) is deliberately NOT swallowed, so
        # a broken file cannot silently remove a year from the analysis.
        continue
    print("year {}: {} tokens".format(year, len(models[year])))
# Keep only the years that were actually loaded. Materialised as a list so
# that positional access (years[0]) also works where keys() returns a view.
years = list(models.keys())


year 1908: 31808 tokens
year 1917: 26737 tokens
year 1926: 24187 tokens
year 1935: 21510 tokens
year 1944: 17432 tokens
year 1953: 25589 tokens
year 1962: 29019 tokens
year 1971: 25715 tokens
year 1980: 31480 tokens
year 1989: 37904 tokens
year 1998: 46254 tokens


Extract a common set of tokens. We will compare the shift of language using the set of common
tokens. Pairwise inner product (PIP) matrices will be constructed that capture the meanings
of the tokens.

In [3]:
# Intersect the vocabularies of ALL loaded years (not just two), keeping a
# running set instead of rebuilding a list and a set on every iteration.
common_vocab = None
for year in years:
    year_vocab = set(models[year])
    common_vocab = year_vocab if common_vocab is None else common_vocab & year_vocab

# Sort first so the starting order does not depend on set iteration order;
# an empty `years` yields an empty token list instead of an IndexError.
tokens = sorted(common_vocab) if common_vocab else []

# Number of tokens to keep. Set to the full vocabulary size, so the draw
# below is just a shuffle of every common token.
subsample = len(tokens)
if subsample:
    # Fixed seed: the token order is reproducible across kernel restarts.
    token_rng = np.random.RandomState(0)
    tokens = token_rng.choice(tokens, size=subsample, replace=False)
print(len(tokens))


15803


In [4]:
# For every shared token, collect its vector from each year, in the same
# order as `years` (so position i in the list belongs to years[i]).
paired_embeds = {
    token: [models[year][token] for year in years]
    for token in tokens
}

# `models` is intentionally kept alive here (no `del models`).
print("{} tokens to be compared".format(len(paired_embeds)))

15803 tokens to be compared


In [5]:
normalize = True
Es = OrderedDict()
for position, year in enumerate(years):
    # Stack this year's vectors: one row per token, rows ordered like `tokens`.
    matrix = np.array([paired_embeds[token][position] for token in tokens])
    if normalize:
        # Rescale every row (token vector) to unit Euclidean norm.
        row_norms = np.linalg.norm(matrix, axis=1)
        matrix = matrix / row_norms[:, None]
    # Divide by the mean Euclidean norm of the columns.
    column_energy = np.mean(np.linalg.norm(matrix, axis=0))
    Es[year] = matrix / column_energy
# The per-token lists are no longer needed once the matrices exist.
del paired_embeds

Now consider the PIP loss between the embeddings of different years. We use one of two
solutions, depending on whether the PIP matrices are small enough to fit in memory.

In [6]:
# For every ordered pair of distinct years compute two distances:
#   align - Frobenius norm of E1 - E2.Q, where Q is the orthogonal matrix
#           that best maps E2 onto E1 (orthogonal Procrustes solution);
#   anc   - Frobenius norm of the difference between the two pairwise inner
#           product (PIP) matrices E1.E1^T and E2.E2^T.
# alignment[i] / anchor[i] hold the values of years[i] against every other
# year, in the order of `years` with year i itself left out.
alignment, anchor = [], []
for idx1, year1 in enumerate(years):
    E1 = Es[year1]
    # The PIP matrix of year1 does not depend on year2, so build it once per
    # outer iteration instead of once per pair.
    pip1 = E1.dot(E1.T)
    new_alignment, new_anchor = [], []
    for idx2, year2 in enumerate(years):
        if idx1 == idx2:
            continue
        E2 = Es[year2]
        M = E1.T.dot(E2)
        # np.linalg.svd returns (U, s, Vh) with M = U.diag(s).Vh; the third
        # value is already V transposed, hence the name Vt.
        U, singular_values, Vt = np.linalg.svd(M)
        Q = Vt.T.dot(U.T)
        align = np.linalg.norm(E1 - E2.dot(Q), 'fro')
        anc = np.linalg.norm(pip1 - E2.dot(E2.T), 'fro')
        new_alignment.append(align)
        new_anchor.append(anc)
        print("comparing {} and {}: align / anc={}".format(
            year1, year2, align / anc))
    alignment.append(new_alignment)
    anchor.append(new_anchor)
    # Drop the reference so the large PIP matrix is not kept alive after the
    # loop finishes.
    pip1 = None
    


comparing 1908 and 1917: align / anc=0.848509814188
comparing 1908 and 1926: align / anc=0.846371193357
comparing 1908 and 1935: align / anc=0.815797362807
comparing 1908 and 1944: align / anc=0.777492372841
comparing 1908 and 1953: align / anc=0.835726171278
comparing 1908 and 1962: align / anc=0.834771752272
comparing 1908 and 1971: align / anc=0.806339554095
comparing 1908 and 1980: align / anc=0.802042076302
comparing 1908 and 1989: align / anc=0.808598296846
comparing 1908 and 1998: align / anc=0.797327026407
comparing 1917 and 1908: align / anc=0.848509814188
comparing 1917 and 1926: align / anc=0.849236465242
comparing 1917 and 1935: align / anc=0.8302638596
comparing 1917 and 1944: align / anc=0.798769545933
comparing 1917 and 1953: align / anc=0.845858640154
comparing 1917 and 1962: align / anc=0.841883964631
comparing 1917 and 1971: align / anc=0.821805230354
comparing 1917 and 1980: align / anc=0.81624926484
comparing 1917 and 1989: align / anc=0.818612220946
comparing 1917 

Now we visualize the result. 

In [7]:
# Convert the nested per-year lists into 2-D arrays and report their shapes.
alignment = np.array(alignment)
anchor = np.array(anchor)
for distances in (alignment, anchor):
    print(distances.shape)
# Element-wise ratio of the two distances. Same-year pairs were skipped when
# the lists were built, so there is no diagonal to mask out here.
ratio = alignment / anchor
print(ratio.shape)


(11, 10)
(11, 10)
(11, 10)


In [8]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd


# Persist the raw distance matrices so the plot can be redone without
# recomputing them.
np.save('alignment.npy', alignment)
np.save('anchor.npy', anchor)
print("max={}, min={}, mean={}, std={}".format(
    np.max(ratio), np.min(ratio), np.mean(ratio), np.std(ratio)))
# Create exactly one figure (a preceding bare plt.figure() call would leave
# an extra, empty figure behind).
fig, ax = plt.subplots(figsize=(18, 9))  # in inches
# Flatten before plotting: given a 2-D array, hist treats every column as a
# separate dataset and draws one bar series per column, whereas the summary
# statistics above describe the distribution over all entries.
ax.hist(np.ravel(ratio), bins=100)
ax.set_xlabel("alignment / anchor")
ax.set_ylabel("number of year pairs")
fig.savefig("ratio_hist.pdf")


max=0.849236465242, min=0.753596795002, mean=0.814637767738, std=0.019665169675
