In [None]:
import pandas as pd
import os

## Load diachronic embedding spaces

In [None]:
# Root directory handed to load_diachronic_embeddings as `source_dir`.
SOURCE_DIR = "ppmi_embeddings"

In [None]:
def load_diachronic_embeddings(source_dir, name, n_epochs):
    """Read the embedding tables of one series and return them as arrays.

    For every i in 1..n_epochs the file ``<source_dir>/<name>/<name>_<i>.emb``
    is parsed as comma-separated text with a header row, using the "CHAR"
    column as the row index.

    Args:
        source_dir: Directory that contains the ``name`` sub-directory.
        name: Sub-directory name, also used as the file-name prefix.
        n_epochs: Number of consecutively numbered files to read.

    Returns:
        A 3-tuple ``(arrays, index_labels, column_labels)``: one NumPy array
        per file (in file order) plus the row and column labels of the first
        table as plain lists.

    Raises:
        AssertionError: If any table's row or column labels differ from those
            of the first table.
    """
    series_dir = os.path.join(source_dir, name)
    tables = [
        pd.read_csv(
            os.path.join(series_dir, name + "_{}.emb".format(epoch)),
            sep=',',
            header=0,
            index_col="CHAR",
        )
        for epoch in range(1, n_epochs + 1)
    ]

    # Rows and columns are addressed by position later on, so every table has
    # to use exactly the same labels in exactly the same order.
    for table in tables:
        same_rows = list(table.index) == list(tables[0].index)
        same_columns = list(table.columns) == list(tables[0].columns)
        assert same_rows and same_columns

    reference = tables[0]
    arrays = [table.to_numpy() for table in tables]
    return arrays, list(reference.index), list(reference.columns)
        

In [None]:
# Five embedding tables each for "parupa" and for "parupa_control".
parupa, parupa_vocab, parupa_dimensions = load_diachronic_embeddings(
    source_dir=SOURCE_DIR, name="parupa", n_epochs=5
)
(
    parupa_control,
    parupa_control_vocab,
    parupa_control_dimensions,
) = load_diachronic_embeddings(source_dir=SOURCE_DIR, name="parupa_control", n_epochs=5)

In [None]:
# Five embedding tables each for "UDDanish" and for "UDDanish_control".
UDDanish, UDDanish_vocab, UDDanish_dimensions = load_diachronic_embeddings(
    source_dir=SOURCE_DIR, name="UDDanish", n_epochs=5
)
(
    UDDanish_control,
    UDDanish_control_vocab,
    UDDanish_control_dimensions,
) = load_diachronic_embeddings(source_dir=SOURCE_DIR, name="UDDanish_control", n_epochs=5)

In [None]:
# Eleven embedding tables each for "danmarksstednavne" and its control.
(
    danmarksstednavne,
    danmarksstednavne_vocab,
    danmarksstednavne_dimensions,
) = load_diachronic_embeddings(source_dir=SOURCE_DIR, name="danmarksstednavne", n_epochs=11)
(
    danmarksstednavne_control,
    danmarksstednavne_control_vocab,
    danmarksstednavne_control_dimensions,
) = load_diachronic_embeddings(
    source_dir=SOURCE_DIR, name="danmarksstednavne_control", n_epochs=11
)

## Analysis

In [None]:
from scipy.spatial.distance import euclidean

In [None]:
def distance(a, b, element_wise):
    """Return the difference between two vectors.

    Args:
        a: First vector.
        b: Second vector.
        element_wise: When truthy, return ``a - b``; otherwise return the
            scalar Euclidean distance between ``a`` and ``b``.

    Returns:
        ``a - b`` or ``euclidean(a, b)``, depending on ``element_wise``.
    """
    return a - b if element_wise else euclidean(a, b)


In [None]:
def pairwise_distance(embedding_spaces, a, b, vocab, element_wise=False):
    """Compare row ``a`` of the first space with row ``b`` of every space.

    Args:
        embedding_spaces: Sequence of embedding matrices indexed by row.
        a: Vocabulary item whose row is taken from ``embedding_spaces[0]``.
        b: Vocabulary item whose row is taken from each space in turn.
        vocab: List mapping vocabulary items to row positions.
        element_wise: Forwarded to ``distance``; selects the per-dimension
            difference instead of the scalar Euclidean distance.

    Returns:
        A list with one ``distance`` result per space, in the order of
        ``embedding_spaces`` (the first space is compared with itself too).

    Raises:
        ValueError: If ``a`` or ``b`` is not in ``vocab``.
    """
    reference_space = embedding_spaces[0]
    row_a = vocab.index(a)
    row_b = vocab.index(b)

    distances = []
    for space in embedding_spaces:
        distances.append(distance(reference_space[row_a], space[row_b], element_wise))
    return distances

In [None]:
def results_csv(change, control, name):
    """Write the change and control distance series to ``results/<name>.csv``.

    The file starts with the header ``Distance,Bin,Corpus``. It is followed by
    one line per value in ``change`` (labelled "Change") and then one line per
    value in ``control`` (labelled "Control"). Bin numbers start at 1 within
    each series. Every row is also printed as it is written.

    Args:
        change: Sequence of distances for the change series.
        control: Sequence of distances for the control series.
        name: File name (without extension) inside the ``results`` directory.
    """
    path = "results/{}.csv".format(name)
    # open() does not create missing parent directories, so make sure the
    # output directory exists instead of failing on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    rows = [(value, bin_number, "Change") for bin_number, value in enumerate(change, start=1)]
    rows += [(value, bin_number, "Control") for bin_number, value in enumerate(control, start=1)]

    with open(path, "w") as f:
        f.write(",".join(["Distance", "Bin", "Corpus"]) + "\n")
        for row in rows:
            # Echo each row so the notebook cell shows what was written.
            print(row)
            f.write(",".join(str(v) for v in row) + "\n")
    
    

### Parupa (p > b)

In [None]:
# Distance from "p" in the first space to "b" in every space.
change = pairwise_distance(
    embedding_spaces=parupa, a="p", b="b", vocab=parupa_vocab
)
change

In [None]:
# Same comparison on the control embeddings.
control = pairwise_distance(
    embedding_spaces=parupa_control, a="p", b="b", vocab=parupa_control_vocab
)
control

In [None]:
# Saved as results/parupa.csv
results_csv(change=change, control=control, name="parupa")

### UDDanish (g > k)

In [None]:
# Distance from "g" in the first space to "k" in every space.
change = pairwise_distance(
    embedding_spaces=UDDanish, a="g", b="k", vocab=UDDanish_vocab
)
change

In [None]:
# Same comparison on the control embeddings.
control = pairwise_distance(
    embedding_spaces=UDDanish_control, a="g", b="k", vocab=UDDanish_control_vocab
)
control

In [None]:
# Saved as results/UD-Da.csv
results_csv(change=change, control=control, name="UD-Da")

### Danmarks Stednavne (k > g)

In [None]:
# Distance from "k" in the first space to "g" in every space.
change = pairwise_distance(
    embedding_spaces=danmarksstednavne, a="k", b="g", vocab=danmarksstednavne_vocab
)
change

In [None]:
# Same comparison on the control embeddings.
control = pairwise_distance(
    embedding_spaces=danmarksstednavne_control,
    a="k",
    b="g",
    vocab=danmarksstednavne_control_vocab,
)
control

In [None]:
# Saved as results/geo_k.csv
results_csv(change=change, control=control, name="geo_k")

### Danmarks Stednavne (t > d)

In [None]:
# Distance from "t" in the first space to "d" in every space.
change = pairwise_distance(
    embedding_spaces=danmarksstednavne, a="t", b="d", vocab=danmarksstednavne_vocab
)
change

In [None]:
# Same comparison on the control embeddings.
control = pairwise_distance(
    embedding_spaces=danmarksstednavne_control,
    a="t",
    b="d",
    vocab=danmarksstednavne_control_vocab,
)
control

In [None]:
# Saved as results/geo_t.csv
results_csv(change=change, control=control, name="geo_t")

### Danmarks Stednavne (p > b)

In [None]:
# Distance from "p" in the first space to "b" in every space.
change = pairwise_distance(
    embedding_spaces=danmarksstednavne, a="p", b="b", vocab=danmarksstednavne_vocab
)
change

In [None]:
# Same comparison on the control embeddings.
control = pairwise_distance(
    embedding_spaces=danmarksstednavne_control,
    a="p",
    b="b",
    vocab=danmarksstednavne_control_vocab,
)
control

In [None]:
# Saved as results/geo_p.csv
results_csv(change=change, control=control, name="geo_p")

## Evaluating salient dimensions

In [None]:
from scipy.stats import linregress
import numpy as np

In [None]:
def evaluate_dimensions(embedding_tables, a, b, vocab, dimensions):
    """Rank dimensions whose ``a``/``b`` difference shrinks across the spaces.

    For every dimension, the element-wise difference between row ``a`` of the
    first space and row ``b`` of each space is regressed against the space
    position (0, 1, 2, ...). A dimension is kept when

    * the regression has ``p < 0.05`` and ``r < -0.2``, and
    * row ``a`` of the first space has a value greater than 0 in it.

    Args:
        embedding_tables: Sequence of embedding matrices indexed by row.
        a: Vocabulary item read from the first space.
        b: Vocabulary item read from every space.
        vocab: List mapping vocabulary items to row positions.
        dimensions: List of dimension labels, one per matrix column.

    Returns:
        A list of ``(dimension, {"slope": ..., "r": ..., "p": ...})`` tuples
        sorted by ascending slope (most negative first).
    """
    # Build the (spaces x dimensions) matrix once; converting the list inside
    # the loop repeated the same work for every dimension.
    differences = np.array(
        pairwise_distance(embedding_tables, a, b, vocab, element_wise=True)
    )
    positions = list(range(0, len(differences)))

    out_dict = {}
    for i, dimension in enumerate(dimensions):
        slope, _, r, p, _ = linregress(positions, differences[:, i])
        out_dict[dimension] = {"slope": slope, "r": r, "p": p}

    # Look up the reference row and the label -> column mapping once instead
    # of calling list.index for every candidate. setdefault keeps the first
    # occurrence of a label, which is what list.index returns.
    reference_row = embedding_tables[0][vocab.index(a)]
    column_of = {}
    for i, dimension in enumerate(dimensions):
        column_of.setdefault(dimension, i)

    selected = [
        (dimension, stats)
        for dimension, stats in out_dict.items()
        if stats["p"] < 0.05
        and stats["r"] < -0.2
        and reference_row[column_of[dimension]] > 0
    ]

    return sorted(selected, key=lambda item: item[1]["slope"])
        

In [None]:
# First five entries, i.e. the lowest slopes.
evaluate_dimensions(
    embedding_tables=parupa,
    a="p",
    b="b",
    vocab=parupa_vocab,
    dimensions=parupa_dimensions,
)[:5]

In [None]:
# First five entries, i.e. the lowest slopes.
evaluate_dimensions(
    embedding_tables=UDDanish,
    a="g",
    b="k",
    vocab=UDDanish_vocab,
    dimensions=UDDanish_dimensions,
)[:5]

In [None]:
# First ten entries, i.e. the lowest slopes.
evaluate_dimensions(
    embedding_tables=danmarksstednavne,
    a="k",
    b="g",
    vocab=danmarksstednavne_vocab,
    dimensions=danmarksstednavne_dimensions,
)[:10]