In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import re

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

import seaborn as sns
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from datasets import Dataset
from tqdm.auto import tqdm

from sklearn.cluster import KMeans
import itertools

In [11]:
EMBED_DIMENSION = 300
EMBED_MAX_NORM = 1

In [3]:
with open("/users/ujan/sports-language-in-politics/data/processed/the_donald_bert_embed.json") as f:
    bert_embed = json.load(f)

In [43]:
def get_embeddings_and_vocab(embed):
    
    vocab = list(embed.keys())
    print('vocab size: {}'.format(len(vocab)))

    def normalization(embeddings):
        norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
        norms = np.reshape(norms, (len(norms), 1))
        embeddings_norm = embeddings / norms
        return embeddings_norm
    
    embed_list = []
    for word in vocab:
        embed_list.append(embed[word])
    embeddings = np.array(embed_list)
    
    # normalization
    embeddings_norm = normalization(embeddings)

    return embeddings_norm, embed


def get_top_similar(word: str, embeddings_norm, vocab, topN: int=5):
    if word not in vocab:
        print("Out of vocabulary word")
        return
    vocab_list = list(vocab.keys())
    q_id = vocab_list.index(word)
    word_vec = embeddings_norm[q_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]
    print(topN_ids)

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab_list[sim_word_id]
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict
    

def get_analogy(a,b,c, embeddings_norm, vocab):
    vocab_list = list(vocab.keys())
    if a not in vocab_list or b not in vocab_list or c not in vocab_list:
        print('not in vocab')
        return
    a_id = vocab_list.index(a)
    b_id = vocab_list.index(b)
    c_id = vocab_list.index(c)

    a_vec = embeddings_norm[a_id]
    b_vec = embeddings_norm[b_id]
    c_vec = embeddings_norm[c_id]
    
    sim = -2
    target = None
    sim_dict = {}
    for token in vocab_list:
        #if token in [a,b,c]:
            #continue
        token_vec = vocab[token]
        s = np.dot(token_vec, b_vec-a_vec+c_vec)
        if s > sim:
            sim = s
            target = token
        if len(sim_dict) < 5:
            sim_dict[token] = s
        else:
            min_key = min(sim_dict, key=sim_dict.get)
            min_val = sim_dict[min_key]
            if s > min_val:
                del sim_dict[min_key]
                sim_dict[token] = s
                    
    print(target, sim)
    print(list(sim_dict.keys()))


In [19]:
embeddings_norm, vocab = get_embeddings_and_vocab(bert_embed)

vocab size: 141315


##### similarities

In [26]:
get_top_similar('red', embeddings_norm, vocab)

[ 1489  1341 12215  6426  1340]


{'yellow': 0.7891525729468087,
 'blue': 0.7852215814655803,
 'redzone': 0.7581747161029492,
 'purple': 0.7456772740160242,
 'green': 0.7396133718002599}

In [31]:
get_top_similar('captain', embeddings_norm, vocab)

[ 8243 24551 60487   537   770]


{'captains': 0.8689232470403931,
 'captaincy': 0.8382648958171743,
 'captained': 0.752838344329176,
 'player': 0.7472572215821804,
 'leader': 0.7404562584459732}

In [32]:
get_top_similar('bus', embeddings_norm, vocab)

[ 40103  10228 122133   1291  66016]


{'busses': 0.8235935669807237,
 'buses': 0.8126035558194924,
 'busload': 0.7932670976685302,
 'train': 0.7556962560867636,
 'tram': 0.6987717388377841}

In [33]:
get_top_similar('president', embeddings_norm, vocab)

[ 7314  5833  2476 20818  4125]


{'presidents': 0.8734757463719285,
 'presidency': 0.869463995409985,
 'presidential': 0.8367583351716343,
 'presidental': 0.7933712318362054,
 'administration': 0.7792574784366104}

In [34]:
get_top_similar('election', embeddings_norm, vocab)

[ 3071 60089 25266   666 55184]


{'elections': 0.9072837601504775,
 'electioneering': 0.8756772888097173,
 'reelection': 0.8547783510794441,
 'elected': 0.8392037156559514,
 'electionday': 0.8272017592504846}

In [35]:
get_top_similar('race', embeddings_norm, vocab)

[ 4854  9223 26476 28426  1486]


{'races': 0.872780962016001,
 'racial': 0.8235731372009734,
 'ethnicity': 0.7938107639573562,
 'whiteness': 0.7535558855366566,
 'color': 0.7503275426978833}

In [36]:
get_top_similar('voters', embeddings_norm, vocab)

[ 2201  2512  3061 32216 90886]


{'voter': 0.8552195927212787,
 'votes': 0.8437491481399981,
 'voting': 0.8352483727434798,
 'electorate': 0.8336844772137472,
 'electorates': 0.8186677502996491}

##### analogies

In [44]:
get_analogy('man', 'king', 'woman', embeddings_norm, vocab)

queen 9.460865361000687
['woman', 'queen', 'princess', 'womanhood', 'queenie']


In [45]:
get_analogy('paris', 'france', 'berlin', embeddings_norm, vocab)

germany 10.45201836807469
['german', 'germany', 'berlin', 'deutschland', 'prussia']


In [37]:
## fine tune on data?