In [None]:
import os
import sys
from types import SimpleNamespace

import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers

from dep_model import *
from dep_data_load import *

In [None]:
# Notebook-wide configuration, collected in one namespace. It is handed to
# dep_data_loaders and read when the parser models are constructed.
args = SimpleNamespace(
    # Paths, language and data-loading options.
    base_path='./',
    data_path='../datasets/UD_English-EWT',
    lang='en',
    shuffle=False,
    # Model hyperparameters; each of these is forwarded to
    # BiaffineDependencyModel when the classifiers are built.
    word_embed_size=100,
    pos_embed_size=100,
    attention_hidden_size=200,
    lm_model_name='bert-base-uncased',
    encoder='lm',
    lstm_hidden_size=400,
    lstm_layers=3,
    dropout=0.33,
    lm_layer=8,
    scale=0,
    typological=False,
    typ_embed_size=32,
    num_typ_features=289,
    typ_encode=None,
    fine_tune=False,
)

# Split filenames passed to dep_data_loaders alongside args.
train_filename = 'en_ewt-ud-train.conllu'
valid_filename = 'en_ewt-ud-dev.conllu'
test_filename = 'en_ewt-ud-test.conllu'

In [None]:
# dep_data_loaders returns three loaders followed by three dictionaries;
# unpack them into notebook-level names used by the cells below.
(
    train_loader,
    val_loader,
    test_loader,
    vocab_dict,
    pos_dict,
    label_dict,
) = dep_data_loaders(args, train_filename, valid_filename, test_filename)

# The padding index is set to the size of the word vocabulary.
pad_index = len(vocab_dict)

In [None]:
# Baseline parser: built from the current args (typological = False at this
# point in the notebook).
lm_classifier = BiaffineDependencyModel(
    # Vocabulary sizes taken from the loaded dictionaries.
    n_words=len(vocab_dict),
    n_pos=len(pos_dict),
    n_rels=len(label_dict),
    # Embedding / encoder settings from args.
    word_embed_size=args.word_embed_size,
    pos_embed_size=args.pos_embed_size,
    lstm_hidden_size=args.lstm_hidden_size,
    encoder=args.encoder,
    lstm_layers=args.lstm_layers,
    lm_model_name=args.lm_model_name,
    dropout=args.dropout,
    n_lm_layer=args.lm_layer,
    # MLP sizes are fixed here rather than read from args.
    n_arc_mlp=500,
    n_rel_mlp=100,
    scale=args.scale,
    pad_index=pad_index,
    unk_index=0,
    # Typological options from args.
    typological=args.typological,
    typ_embed_size=args.typ_embed_size,
    num_typ_features=args.num_typ_features,
    typ_encode=args.typ_encode,
    attention_hidden_size=args.attention_hidden_size,
    fine_tune=args.fine_tune,
)

In [None]:
# Restore the saved baseline weights from <base_path>/saved_models.
model_path = os.path.join(args.base_path, 'saved_models', 'lm_dep_model.pt')

# map_location='cpu' lets a checkpoint that was written from a GPU run load on
# a CPU-only machine; the later cells in this notebook request device='cpu'.
state_dict = torch.load(model_path, map_location='cpu')

# Switch to evaluation mode before extracting embeddings so that dropout
# (args.dropout = 0.33) does not randomise the outputs.
lm_classifier.eval()

# Left as the last expression so the key-matching report is displayed.
lm_classifier.load_state_dict(state_dict)

In [None]:
# Tokenizer for the same language model the parser was configured with.
tokenizer = transformers.BertTokenizer.from_pretrained(args.lm_model_name)

# common_words.txt is read one entry per line. Entries are stripped so that
# dictionary keys are the bare words (e.g. 'happy', not 'happy\n'), and blank
# lines are skipped because they would produce an empty token span whose mean
# is NaN.
with open('common_words.txt', 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

# Map each word to the mean of the language-model outputs over its tokens.
# The slice 1:-1 drops the first and last positions of the encoded sequence
# (the special tokens added by tokenizer.encode).
non_typ_embeddings = {}
with torch.no_grad():  # inference only; no autograd graph is needed
    for word in lines:
        tokens = tokenizer.encode(word, return_tensors='pt')
        hidden = lm_classifier.encode.lm(tokens)
        non_typ_embeddings[word] = torch.mean(hidden[:, 1:-1, :], dim=1)

In [None]:
# Switch the shared configuration over to the typological variant before the
# second model is built. This mutates args in place, so cells that read args
# after this point see the typological settings.
vars(args).update(
    typological=True,
    typ_encode='add_att',
    typ_feature='syntax_knn+phonology_knn+inventory_knn',
)

In [None]:
# Typological parser: same constructor call as the baseline, but args now has
# typological = True and typ_encode = 'add_att'.
# NOTE(review): no checkpoint is loaded into this model in the cells shown
# here, so any layers not covered by the pretrained LM keep their initial
# weights - confirm whether a saved typological model should be restored.
typ_lm_classifier = BiaffineDependencyModel(
    # Vocabulary sizes taken from the loaded dictionaries.
    n_words=len(vocab_dict),
    n_pos=len(pos_dict),
    n_rels=len(label_dict),
    # Embedding / encoder settings from args.
    word_embed_size=args.word_embed_size,
    pos_embed_size=args.pos_embed_size,
    lstm_hidden_size=args.lstm_hidden_size,
    encoder=args.encoder,
    lstm_layers=args.lstm_layers,
    lm_model_name=args.lm_model_name,
    dropout=args.dropout,
    n_lm_layer=args.lm_layer,
    # MLP sizes are fixed here rather than read from args.
    n_arc_mlp=500,
    n_rel_mlp=100,
    scale=args.scale,
    pad_index=pad_index,
    unk_index=0,
    # Typological options from args.
    typological=args.typological,
    typ_embed_size=args.typ_embed_size,
    num_typ_features=args.num_typ_features,
    typ_encode=args.typ_encode,
    attention_hidden_size=args.attention_hidden_size,
    fine_tune=args.fine_tune,
)

In [None]:
# Evaluation mode so dropout does not randomise the extracted embeddings.
typ_lm_classifier.eval()

# Map each word to the output of the typological attention layer applied to
# the typology embedding and the word's mean language-model output.
typ_embeddings = {}
with torch.no_grad():  # inference only; no autograd graph is needed
    # The typology embedding depends only on the language and feature set,
    # so it is computed once outside the loop.
    typ_embed = typ_lm_classifier.encode.typ(
        lang=args.lang, typ_feature=args.typ_feature, device='cpu'
    )
    for raw_word in lines:
        # Key by the bare word (no trailing newline) and skip blank entries,
        # whose empty token span would average to NaN.
        word = raw_word.strip()
        if not word:
            continue
        tokens = tokenizer.encode(word, return_tensors='pt')
        embed = typ_lm_classifier.encode.lm(tokens)
        # Drop the first and last positions (special tokens) before averaging.
        embed = torch.mean(embed[:, 1:-1, :], dim=1)
        # Call the module itself rather than .forward so registered hooks run.
        typ_embeddings[word] = typ_lm_classifier.encode.attention(
            typ_embed, embed.squeeze(0)
        )

In [None]:
# Stack the per-word embeddings into (n_words, dim) matrices. Each value is
# reshaped to a single row first so that both (1, dim) and (dim,) tensors
# stack correctly. The typological matrix gets its own name so the
# typ_embeddings dictionary stays available to later cells.
norm_embeddings = torch.cat(
    [e.reshape(1, -1) for e in non_typ_embeddings.values()]
).detach().cpu().numpy()
typ_matrix = torch.cat(
    [e.reshape(1, -1) for e in typ_embeddings.values()]
).detach().cpu().numpy()

# Row labels come from the dictionary keys, which are in the same order as
# the stacked rows; strip() guards against keys that kept a trailing newline.
words = np.array([w.strip() for w in non_typ_embeddings])
assert typ_matrix.shape[0] == norm_embeddings.shape[0] == len(words), \
    "typological and non-typological embeddings must cover the same words"

# L2-normalise the rows so the matrix product is a cosine similarity, as the
# figure title states (a raw dot product also reflects vector length).
norm_embeddings = norm_embeddings / np.linalg.norm(norm_embeddings, axis=1, keepdims=True)
typ_matrix = typ_matrix / np.linalg.norm(typ_matrix, axis=1, keepdims=True)

# Rows: typological embeddings; columns: non-typological embeddings.
similarities = np.matmul(typ_matrix, norm_embeddings.transpose())

# Reorder both axes alphabetically by word.
sorted_indices = words.argsort()
result = similarities[sorted_indices][:, sorted_indices]

plt.title('Cosine similarity between non-typological and typological BERT embeddings')
plt.xlabel('non-typological embedding')
plt.ylabel('typological embedding')
# The image shows 1 - similarity, so lower values mean more similar pairs.
plt.imshow(1 - result, cmap='hot', interpolation='nearest')
plt.colorbar(label='1 - cosine similarity')
plt.show()

In [None]:
# Look words up by their bare form so 'happy' is found even if the dictionary
# keys were stored with a trailing newline.
non_typ_by_word = {w.strip(): e for w, e in non_typ_embeddings.items()}

happy_embed = non_typ_by_word['happy'].detach().cpu().numpy()

# Dot product between 'happy' and every other word. Both vectors are
# flattened and the result converted to a plain float, so the scores sort
# cleanly and can be passed straight to plt.bar (a (1, 1) array per word
# cannot).
happy_sim_dict = {}
for w, embedding in non_typ_by_word.items():
    if w == 'happy':
        continue
    word_embed = embedding.detach().cpu().numpy()
    happy_sim_dict[w] = float(np.dot(happy_embed.ravel(), word_embed.ravel()))

# Ascending sort, so the slice below shows the five lowest-scoring words.
happy_sim_pairs = sorted(happy_sim_dict.items(), key=lambda item: item[1])
print(happy_sim_pairs[:5])

In [None]:
# Look words up by their bare form so 'happy' is found even if the dictionary
# keys were stored with a trailing newline.
typ_by_word = {w.strip(): e for w, e in typ_embeddings.items()}

typ_happy_embed = typ_by_word['happy'].detach().cpu().numpy()

# Dot product between the typological 'happy' embedding and every other
# typological word embedding. The reference vector is typ_happy_embed: the
# previous code computed it but then used the non-typological happy_embed.
# Scores are plain floats so they sort cleanly and can be passed to plt.bar.
typ_happy_sim_dict = {}
for w, embedding in typ_by_word.items():
    if w == 'happy':
        continue
    word_embed = embedding.detach().cpu().numpy()
    typ_happy_sim_dict[w] = float(np.dot(typ_happy_embed.ravel(), word_embed.ravel()))

# Ascending sort, so the slice below shows the five lowest-scoring words.
typ_happy_sim_pairs = sorted(typ_happy_sim_dict.items(), key=lambda item: item[1])
print(typ_happy_sim_pairs[:5])

In [None]:
# Bar chart of every word's score against 'happy' (frozen embeddings), in the
# order given by happy_sim_pairs.
words, scores = [], []
for word, score in happy_sim_pairs:
    words.append(word)
    scores.append(score)
x_pos = list(range(len(words)))

plt.bar(x_pos, scores, color='green')
plt.title('Similarity to happy and most common english words using frozen embeddings')
plt.xlabel('Words')
plt.ylabel('Similarity')
# One tick per bar, labelled with the corresponding word.
plt.xticks(x_pos, words)
plt.show()

In [None]:
# Bar chart of every word's score against 'happy' (typological embeddings),
# in the order given by typ_happy_sim_pairs.
words, scores = [], []
for word, score in typ_happy_sim_pairs:
    words.append(word)
    scores.append(score)
x_pos = list(range(len(words)))

plt.bar(x_pos, scores, color='green')
plt.title('Similarity to happy and most common english words using typological embeddings')
plt.xlabel('Words')
plt.ylabel('Similarity')
# One tick per bar, labelled with the corresponding word.
plt.xticks(x_pos, words)
plt.show()