In [36]:
import numpy as np
import pandas as pd
import torch
import sys
import re

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

import seaborn as sns
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from datasets import Dataset
from tqdm.auto import tqdm

from sklearn.cluster import KMeans
import itertools

from collections import deque
from itertools import permutations
import statistics

In [2]:
# Embedding width and max-norm passed to nn.Embedding by CBOW_Model.
# NOTE(review): presumably these must match the values used when the
# checkpoints were trained -- confirm against the training script.
EMBED_DIMENSION = 300
EMBED_MAX_NORM = 1

# Directory holding the "<run>_vocab.pt" / "<run>_model.pt" files.
# Set the CBOW_DATA_DIR environment variable to point elsewhere; the default is
# the original author's location. os.path.join(..., "") guarantees a trailing
# separator because the loader builds file names by plain string concatenation.
data_dir = os.path.join(
    os.environ.get("CBOW_DATA_DIR", "/users/ujan/sports-language-in-politics/models/cbow/"),
    "",
)

In [3]:
# Choose the device used as map_location when loading checkpoints:
# Apple MPS if available, otherwise CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
class CBOW_Model(nn.Module):
    """
    Continuous bag-of-words (CBOW) network as described in
    https://arxiv.org/abs/1301.3781

    An embedding table of width EMBED_DIMENSION (created with
    max_norm=EMBED_MAX_NORM) feeds a single linear layer that outputs one
    score per vocabulary entry.
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        # The attribute names and their creation order are deliberately
        # unchanged: code in this notebook reads the first parameter as the
        # embedding matrix, and pickled checkpoints refer to these names.
        self.embeddings = nn.Embedding(vocab_size, EMBED_DIMENSION, max_norm=EMBED_MAX_NORM)
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs):
        # assumes inputs is (batch, context_size) of token ids -- TODO confirm
        context_vectors = self.embeddings(inputs)
        # average the embeddings along dimension 1 before scoring
        pooled = context_vectors.mean(dim=1)
        return self.linear(pooled)

In [35]:
def get_embeddings_and_vocab(sub_name):
    """
    Load the saved vocabulary and model for one run and return the model's
    row-normalised embedding matrix together with the vocabulary.

    Parameters
    ----------
    sub_name : str
        File-name prefix of the run; "<sub_name>_vocab.pt" and
        "<sub_name>_model.pt" are read from `data_dir`.

    Returns
    -------
    (embeddings_norm, vocab)
        embeddings_norm is a 2-D numpy array in which every row of the model's
        first parameter has been divided by its L2 norm (all-zero rows are
        left as zeros instead of becoming NaN). vocab is the object stored in
        the vocab file.

    Side effects
    ------------
    Prints the vocabulary size.
    """
    # NOTE(security): torch.load unpickles arbitrary Python objects; only load
    # checkpoints from a trusted source.
    vocab = torch.load(data_dir+sub_name+"_vocab.pt")
    print('vocab size: {}'.format(len(vocab.get_itos())))

    # The model file holds the whole pickled module (it is used as a module
    # right below), so it is loaded directly. The previous code first built a
    # fresh CBOW_Model and then discarded it, which only cost time and memory.
    # CBOW_Model must still be defined in the notebook for unpickling to work.
    model = torch.load(data_dir+sub_name+"_model.pt", map_location=device)

    # embedding from first model layer
    embeddings = list(model.parameters())[0].cpu().detach().numpy()

    # L2 norm of each row, kept as a column so it broadcasts over the row.
    norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** (1 / 2)
    # Avoid 0/0 -> NaN for all-zero rows; NaNs would break downstream
    # similarity ranking and KMeans.
    norms[norms == 0] = 1
    embeddings_norm = embeddings / norms

    return embeddings_norm, vocab


def get_top_similar(word: str, embeddings_norm, vocab, topN: int=5):
    """
    Return the `topN` vocabulary tokens whose embedding has the largest dot
    product with the embedding of `word`.

    Parameters
    ----------
    word : str
        Query token.
    embeddings_norm : 2-D array
        One embedding per row, indexed by vocabulary id. The rows are expected
        to be L2-normalised so that the dot product is a cosine similarity.
    vocab :
        Object supporting ``vocab[token] -> id`` and ``vocab.lookup_token(id)``.
    topN : int
        Number of neighbours to return.

    Returns
    -------
    dict or None
        ``{token: similarity}`` in descending similarity order, or None (after
        printing a message) when `word` maps to id 0, which this notebook
        treats as out-of-vocabulary.
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    # Rank all tokens, then drop the query by id. The previous code dropped
    # whatever was ranked first, which returns the query itself (and loses a
    # real neighbour) when another row ties with it, e.g. a duplicate vector.
    ranked_ids = np.argsort(-dists, kind="stable")
    topN_ids = ranked_ids[ranked_ids != word_id][:topN]

    topN_dict = {}
    for sim_word_id in topN_ids:
        # int() hands lookup_token a plain Python integer instead of a numpy one.
        sim_word = vocab.lookup_token(int(sim_word_id))
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict


def get_sim(word1: str, word2: str, embeddings_norm, vocab):
    """
    Return the dot product of the embeddings of `word1` and `word2`.

    Parameters
    ----------
    word1, word2 : str
        Tokens to compare.
    embeddings_norm : 2-D array
        One embedding per row, indexed by vocabulary id. The rows are expected
        to be L2-normalised so that the result is a cosine similarity.
    vocab :
        Object supporting ``token in vocab`` and ``vocab[token] -> id``.

    Returns
    -------
    scalar or None
        The similarity, or None (after printing a message) when either word is
        missing from the vocabulary or maps to id 0.
    """
    # Membership is tested on the vocab object itself. The previous code built
    # the full token list with get_itos() twice per call and scanned it
    # linearly, which dominates the run time when called for thousands of pairs.
    if word1 not in vocab or word2 not in vocab:
        print('not in vocab')
        return
    word1_id = vocab[word1]
    word2_id = vocab[word2]
    # id 0 is treated as the out-of-vocabulary slot throughout this notebook
    if word1_id == 0 or word2_id == 0:
        print("Out of vocabulary word")
        return

    return np.dot(embeddings_norm[word1_id], embeddings_norm[word2_id])


def get_analogy(a,b,c, embeddings_norm, vocab):
    """
    Print the tokens that best complete the analogy "a is to b as c is to ?".

    Every vocabulary token except a, b and c is scored by the dot product of
    its embedding with ``emb[b] - emb[a] + emb[c]``. The combined vector is not
    re-normalised, so the score is not a cosine and may exceed 1.

    Parameters
    ----------
    a, b, c : str
        Analogy tokens.
    embeddings_norm : 2-D array
        One embedding per row; row i is taken to belong to
        ``vocab.get_itos()[i]``.
    vocab :
        Object supporting ``get_itos()`` and ``vocab[token] -> id``.

    Returns
    -------
    None. Two lines are printed: the best token with its score, then a dict of
    the five best tokens with their scores in descending order. A message is
    printed instead when a word is missing from the vocabulary or maps to id 0.
    """
    vocab_list = vocab.get_itos()
    known_tokens = set(vocab_list)  # O(1) membership instead of list scans
    if a not in known_tokens or b not in known_tokens or c not in known_tokens:
        print('not in vocab')
        return
    a_id = vocab[a]
    b_id = vocab[b]
    c_id = vocab[c]
    if a_id == 0 or b_id == 0 or c_id == 0:
        print("Out of vocabulary word")
        return

    target_vec = embeddings_norm[b_id] - embeddings_norm[a_id] + embeddings_norm[c_id]

    # Score the whole vocabulary with one matrix-vector product instead of a
    # Python loop with a per-token vocab lookup.
    scores = np.matmul(embeddings_norm[:len(vocab_list)], target_vec)

    # Best first; the stable sort keeps vocabulary order among exact ties.
    ranked_ids = np.argsort(-scores, kind="stable")
    # The three query words are never candidates.
    ranked_ids = ranked_ids[~np.isin(ranked_ids, [a_id, b_id, c_id])]
    top_ids = ranked_ids[:5]

    sim_dict = {vocab_list[i]: scores[i] for i in top_ids}
    if len(top_ids) > 0:
        # Taking the top-ranked candidate directly removes the old "-2"
        # starting value, which could leave the target unset because the
        # scores are not bounded below by -1.
        target = vocab_list[top_ids[0]]
        sim = scores[top_ids[0]]
    else:
        # no candidate left (the vocabulary only contains the query words)
        target, sim = None, -2

    print(target, sim)
    print(sim_dict)


def get_outlier(word_list, embeddings_norm, vocab):
    """
    Print the word of `word_list` that is least similar to the others.

    For every word the dot products of its embedding with the embeddings of
    all other words in the list are averaged; the word with the smallest
    average is printed together with that average.

    Parameters
    ----------
    word_list : sequence of str
        Candidate words; at least two are required.
    embeddings_norm : 2-D array
        One embedding per row, indexed by vocabulary id.
    vocab :
        Object supporting ``get_itos()`` and ``vocab[token] -> id``.

    Returns
    -------
    None. A message is printed instead of a result when a word is missing from
    the vocabulary or when fewer than two words are given.
    """
    known_tokens = set(vocab.get_itos())  # O(1) membership instead of list scans
    for word in word_list:
        if word not in known_tokens:
            print('not in vocab')
            return
    # With a single word there is no pair to compare, and the averaging below
    # would divide by zero.
    if len(word_list) < 2:
        print('need at least two words')
        return

    # Sum of similarities of each word with every other word.
    sim_dict = {w: 0 for w in word_list}
    for first, second in itertools.combinations(word_list, 2):
        sim = np.dot(embeddings_norm[vocab[first]], embeddings_norm[vocab[second]])
        sim_dict[first] += sim
        sim_dict[second] += sim

    n_others = len(word_list) - 1
    avg_dict = {w: total / n_others for w, total in sim_dict.items()}

    # The first word with the smallest average similarity is the outlier.
    word = min(avg_dict, key=avg_dict.get)
    print(word, avg_dict[word])

#### Similarities

##### the_donald

In [81]:
# Nearest neighbours of 'politics' in the the_donald run (get_top_similar's default topN).
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('politics', embeddings_norm, vocab)

vocab size: 10378


{'hearings': 0.22003981,
 'leaning': 0.2085969,
 'sweden': 0.20754245,
 'reddits': 0.20666334,
 'pen': 0.20566946}

In [20]:
# Nearest neighbours of 'voters' in the the_donald run.
# NOTE(review): the same checkpoint is reloaded in several cells of this section;
# loading it once per section would be enough.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('voters', embeddings_norm, vocab)

vocab size: 10378


{'dems': 0.27929482,
 'vote': 0.2669506,
 'demographics': 0.26626676,
 'voting': 0.26432395,
 'republicans': 0.26054394}

In [8]:
# Nearest neighbours of 'woman' in the the_donald run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('woman', embeddings_norm, vocab)

vocab size: 10378


{'girl': 0.3978128,
 'person': 0.33237395,
 'women': 0.3177452,
 'guy': 0.25549906,
 'husband': 0.2481705}

In [10]:
# Nearest neighbours of 'trump' in the the_donald run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('trump', embeddings_norm, vocab)

vocab size: 10378


{'obama': 0.33401677,
 'hillary': 0.33117822,
 'trumps': 0.3227671,
 'geotus': 0.30152357,
 'potus': 0.2929576}

##### conservative

In [12]:
# Nearest neighbours of 'bad' in the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 9061


{'good': 0.34852636,
 'terrible': 0.3032945,
 'shitty': 0.29318285,
 'horrible': 0.27719852,
 'insignificant': 0.24304685}

In [13]:
# Nearest neighbours of 'biden' in the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('biden', embeddings_norm, vocab)

vocab size: 9061


{'trump': 0.32836217,
 'bidens': 0.31359124,
 'obama': 0.28926462,
 'sleepy': 0.28519666,
 'pence': 0.2774164}

In [9]:
# Nearest neighbours of 'captain' (a sports term) in the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('captain', embeddings_norm, vocab)

vocab size: 9061


{'prosecuted': 0.24044552,
 'directors': 0.22615686,
 'debatable': 0.22474325,
 'classified': 0.22291517,
 'comey': 0.22155952}

##### politics

In [11]:
# Nearest neighbours of 'bad' in the politics run.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 10715


{'good': 0.4490177,
 'terrible': 0.36272916,
 'awful': 0.2903085,
 'bug': 0.25489658,
 'smart': 0.25289032}

In [12]:
# Nearest neighbours of 'biden' in the politics run.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('biden', embeddings_norm, vocab)

vocab size: 10715


{'bidens': 0.37397128,
 'bernie': 0.3396699,
 'hillary': 0.3112797,
 'sanders': 0.26909018,
 'trump': 0.26213026}

In [13]:
# Nearest neighbours of 'captain' (a sports term) in the politics run.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('captain', embeddings_norm, vocab)

vocab size: 10715


{'respectable': 0.2088904,
 'utility': 0.20452517,
 'finals': 0.20363128,
 'dogshit': 0.20302075,
 'bruins': 0.20159477}

#### Analogies

In [None]:
# analogy examples



In [16]:
# Analogy woman : queen :: man : ? on the the_donald run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('woman', 'queen', 'man', embeddings_norm, vocab)

vocab size: 10378
father 0.34129798
{'men': 0.3154149, 'father': 0.34129798, 'nickname': 0.2864819, 'noble': 0.28867188, 'levy': 0.31405658}


In [15]:
# Analogy politics : trump :: sports : ? on the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('politics', 'trump', 'sports', embeddings_norm, vocab)

vocab size: 9061
trumps 0.43255517
{'trumps': 0.43255517, 'aaron': 0.39563692, 'drivers': 0.3178765, 'fdr': 0.31065878, 'analysts': 0.31284007}


In [14]:
# Analogy politics : trump :: sports : ? on the the_donald run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('politics', 'trump', 'sports', embeddings_norm, vocab)

vocab size: 10378
bowls 0.34564084
{'delegates': 0.3075849, 'bowls': 0.34564084, 'advantages': 0.30759877, 'asinine': 0.32302096, 'morata': 0.3045337}


In [17]:
# Analogy politics : trump :: sports : ? on the politics run.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
get_analogy('politics', 'trump', 'sports', embeddings_norm, vocab)

vocab size: 10715
jong 0.3763522
{'trumps': 0.33410287, 'stars': 0.32657743, 'reject': 0.31470904, 'zach': 0.3358821, 'jong': 0.3763522}


In [18]:
# Analogy biden : democrat :: trump : ? on the politics run.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
get_analogy('biden', 'democrat', 'trump', embeddings_norm, vocab)

vocab size: 10715
republican 0.40329498
{'republican': 0.40329498, 'politician': 0.39520502, 'reaction': 0.32067028, 'tool': 0.31063023, 'buffoon': 0.34742776}


In [53]:
# Analogy politics : trump :: sports : ? on the conservative run.
# NOTE(review): this repeats an earlier cell with the same arguments, and the
# recorded output lists the query words themselves, which get_analogy skips --
# the output looks stale; re-run or delete this cell.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('politics', 'trump', 'sports', embeddings_norm, vocab)

vocab size: 9061
trump 0.9494727
{'trump': 0.9494727, 'trumps': 0.43255517, 'sports': 0.7861865, 'aaron': 0.39563692, 'drivers': 0.3178765}


In [44]:
# Analogy man : king :: woman : ? on the the_donald run.
# NOTE(review): the recorded output contains the query words, which get_analogy
# skips -- the output looks stale; re-run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('man', 'king', 'woman', embeddings_norm, vocab)

vocab size: 10378
king 0.9912237
{'woman': 0.8409491, 'girl': 0.357291, 'king': 0.9912237, 'combine': 0.36295393, 'flowers': 0.41219172}


In [45]:
# Analogy man : king :: woman : ? on the conservative run.
# NOTE(review): the recorded output contains the query words, which get_analogy
# skips -- the output looks stale; re-run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('man', 'king', 'woman', embeddings_norm, vocab)

vocab size: 9061
woman 1.0140263
{'health': 0.3537078, 'woman': 1.0140263, 'king': 0.96158165, 'ron': 0.32921895, 'feminist': 0.33566618}


In [46]:
# Analogy boy : man :: girl : ? on the the_donald run.
# NOTE(review): the recorded output contains the query words, which get_analogy
# skips -- the output looks stale; re-run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('boy', 'man', 'girl', embeddings_norm, vocab)

vocab size: 10378
man 0.96783704
{'guy': 0.39905325, 'man': 0.96783704, 'person': 0.42131552, 'woman': 0.4835636, 'girl': 0.90821165}


In [47]:
# Analogy boy : man :: girl : ? on the conservative run.
# NOTE(review): the recorded output contains the query words, which get_analogy
# skips -- the output looks stale; re-run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('boy', 'man', 'girl', embeddings_norm, vocab)

vocab size: 9061
man 0.93574435
{'man': 0.93574435, 'girl': 0.9012397, 'challenge': 0.3479978, 'pieces': 0.36661068, 'hong': 0.3513927}


In [50]:
# Analogy paris : france :: london : ? on the conservative run.
# NOTE(review): the recorded output contains the query words, which get_analogy
# skips -- the output looks stale; re-run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('paris', 'france', 'london', embeddings_norm, vocab)

vocab size: 9061
london 1.0947541
{'england': 0.44113284, 'france': 1.0877644, 'london': 1.0947541, 'desperation': 0.39885274, 'segregated': 0.3995838}


In [52]:
# Analogy paris : france :: london : ? on the the_donald run.
# NOTE(review): the recorded output contains the query words, which get_analogy
# skips -- the output looks stale; re-run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('paris', 'france', 'london', embeddings_norm, vocab)

vocab size: 10378
london 0.9265596
{'england': 0.34228924, 'france': 0.88700163, 'london': 0.9265596, 'italy': 0.36605185, 'stl': 0.3462011}


In [64]:
# Analogy like : love :: dislike : ? on the the_donald run.
# NOTE(review): the recorded output contains the query words, which get_analogy
# skips -- the output looks stale; re-run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('like', 'love', 'dislike', embeddings_norm, vocab)

vocab size: 10378
dislike 1.1648291
{'love': 1.1029745, 'feel': 0.50194085, 'hate': 0.65649307, 'hated': 0.54593873, 'dislike': 1.1648291}


In [68]:
# Analogy biden : democrat :: trump : ? on the the_donald run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('biden', 'democrat', 'trump', embeddings_norm, vocab)

vocab size: 10378
landslide 0.43094638
{'liberals': 0.34583837, 'illegals': 0.34671915, 'dem': 0.40632367, 'landslide': 0.43094638, 'electors': 0.36139715}


In [69]:
# Analogy obama : democrat :: trump : ? on the the_donald run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('obama', 'democrat', 'trump', embeddings_norm, vocab)

vocab size: 10378
republican 0.38552612
{'conservative': 0.358797, 'republican': 0.38552612, 'democratic': 0.30435225, 'dem': 0.31952465, 'hunt': 0.3087293}


In [70]:
# Analogy liberal : democrat :: conservative : ? on the the_donald run.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('liberal', 'democrat', 'conservative', embeddings_norm, vocab)

vocab size: 10378
dem 0.39227346
{'dem': 0.39227346, 'endorse': 0.31491223, 'classical': 0.30324933, 'discourse': 0.3406249, 'extremist': 0.3775786}


In [71]:
# Analogy liberal : democrat :: conservative : ? on the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('liberal', 'democrat', 'conservative', embeddings_norm, vocab)

vocab size: 9061
republican 0.4410797
{'republican': 0.4410797, 'immigrants': 0.31548265, 'dem': 0.4169313, 'beto': 0.33393866, 'radicals': 0.3171276}


In [72]:
# Analogy biden : democrat :: trump : ? on the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('biden', 'democrat', 'trump', embeddings_norm, vocab)

vocab size: 9061
republican 0.41996822
{'conservative': 0.38603973, 'republican': 0.41996822, 'nyc': 0.32983807, 'quiet': 0.31968936, 'counsel': 0.31419033}


In [74]:
# Cross-domain analogy politics : election :: football : ? on the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('politics', 'election', 'football', embeddings_norm, vocab)

vocab size: 9061
ucl 0.41692564
{'elections': 0.39627045, 'ucl': 0.41692564, 'juventus': 0.3544297, 'dust': 0.40815508, 'shade': 0.3729683}


In [75]:
# Cross-domain analogy politics : election :: basketball : ? on the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('politics', 'election', 'basketball', embeddings_norm, vocab)

vocab size: 9061
elections 0.41813532
{'game': 0.3093809, 'elections': 0.41813532, 'heat': 0.32754976, 'fifth': 0.38782054, 'cents': 0.30739266}


In [76]:
# Cross-domain analogy voters : election :: fans : ? on the conservative run.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('voters', 'election', 'fans', embeddings_norm, vocab)

vocab size: 9061
fan 0.4524206
{'fan': 0.4524206, 'series': 0.36706093, 'cup': 0.362842, 'announce': 0.35837916, 'desperation': 0.3587853}


In [None]:
# non gendered examples
# dem subs
# list of analogy words -> compare agg values dem vs rep
# similarity values -> compare dem vs rep
# only [sport, sports] in wsd

#### Clustering

In [49]:
# Hand-picked word groups, one per intended concept (colours, politics, sports,
# sentiment, vehicles).
# NOTE(review): none of these lists is referenced by the cells visible below --
# the clustering cells look words up one by one instead.
concept1 = ['red', 'blue', 'green', 'black']
concept2 = ['trump', 'biden', 'president', 'election']
concept3 = ['sports', 'game', 'play', 'score']
concept4 = ['good', 'bad', 'terrible', 'great']
concept5 = ['car', 'bus', 'train']

In [58]:
# Embeddings of the the_donald run, used by the clustering cells below.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [59]:
# Cluster every embedding row into 4 groups (fixed seed for repeatability).
# NOTE(review): five concept lists are defined above but n_clusters is 4 -- confirm
# which number is intended.
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(embeddings_norm)

In [62]:
# Cluster assigned to 'red'; reshape(1, -1) turns the vector into a one-row matrix for predict().
# NOTE(review): this cell is repeated below once per word; a loop over the words
# would avoid the copy-paste.
vec = embeddings_norm[vocab['red']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [63]:
# Cluster assigned to 'blue'.
vec = embeddings_norm[vocab['blue']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [64]:
# Cluster assigned to 'trump'.
vec = embeddings_norm[vocab['trump']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [67]:
# Cluster assigned to 'biden'.
vec = embeddings_norm[vocab['biden']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [60]:
# Cluster assigned to 'sports'.
vec = embeddings_norm[vocab['sports']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [65]:
# Cluster assigned to 'bad'.
vec = embeddings_norm[vocab['bad']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [66]:
# Cluster assigned to 'good'.
vec = embeddings_norm[vocab['good']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [69]:
# Cluster assigned to 'car'.
vec = embeddings_norm[vocab['car']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [70]:
# Cluster assigned to 'bus'.
vec = embeddings_norm[vocab['bus']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [71]:
# Cluster assigned to 'train'.
vec = embeddings_norm[vocab['train']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

#### Outlier detection

In [106]:
# Embeddings of the the_donald run, used by the outlier-detection cells below.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [99]:
# Two fruits and a vehicle.
get_outlier(['apple', 'orange', 'bus'], embeddings_norm, vocab)

bus 0.021671796683222055


In [100]:
# Three sports words and one politics word.
get_outlier(['sports', 'game', 'play', 'president'], embeddings_norm, vocab)

president -0.03133737722722193


In [56]:
# Three colours and a country.
get_outlier(['red', 'france', 'green', 'blue'], embeddings_norm, vocab)

france -0.006834672763943672


In [57]:
# Four politics words and a fruit.
get_outlier(['trump', 'biden', 'president', 'election', 'apple'], embeddings_norm, vocab)

apple 0.008264641437563114


#### Agg eval

In [65]:
# sensitive to choice of words
# how to choose words?
# NOTE(review): the evaluation cells below reassign politics_words / sports_words
# from get_top_similar neighbours, so these hand-picked lists are only used if
# those reassignments are skipped.

politics_words = ['voters', 'election', 'president', 'democrats', 'republicans', 'politicians', 'media']
sports_words = ['fans', 'race', 'captain', 'team', 'players', 'cup', 'trophy']

##### average pairwise similarity between groups

In [66]:
def avg_sim(sub, l1, l2):
    """
    Print the mean similarity over all (word from l1, word from l2) pairs.

    Parameters
    ----------
    sub : str
        Run name passed to get_embeddings_and_vocab (the checkpoint is loaded
        on every call).
    l1, l2 : iterable of str
        The two word groups to compare.

    Returns
    -------
    None. The mean is printed. Pairs for which get_sim returns None (a word is
    missing; get_sim prints a message for each) are left out of the mean; the
    previous code passed the None on to statistics.mean, which raised a
    TypeError. If no pair is usable a message is printed instead.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    sim_list = []
    # itertools.product yields the pairs in the same order as the nested loops did
    for first, second in itertools.product(l1, l2):
        sim = get_sim(first, second, embeddings_norm, vocab)
        if sim is not None:
            sim_list.append(sim)
    if not sim_list:
        print('no valid word pairs')
        return
    print(statistics.mean(sim_list))

In [87]:
# the_donald: mean similarity between the 100 nearest neighbours of 'politics'
# and the 100 nearest neighbours of 'sports'.
# NOTE(review): avg_sim loads the same checkpoint again internally, which is why
# the vocab size is printed twice.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('the_donald_1000_epochs', politics_words, sports_words)

vocab size: 10378
vocab size: 10378
0.013313849


In [88]:
# conservative: mean similarity between the 100 nearest neighbours of 'politics'
# and the 100 nearest neighbours of 'sports'.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('conservative_1000_epochs', politics_words, sports_words)

vocab size: 9061
vocab size: 9061
0.016456766


In [89]:
# politics: mean similarity between the 100 nearest neighbours of 'politics'
# and the 100 nearest neighbours of 'sports'.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('politics_1000_epochs', politics_words, sports_words)

vocab size: 10715
vocab size: 10715
0.023214294


##### centroid distance

In [70]:
def get_cent(word_list, embeddings_norm, vocab):
    """
    Return the element-wise mean (centroid) of the embeddings of `word_list`.

    Each word is mapped to an id with ``vocab[word]`` and its row is taken
    from `embeddings_norm`. If any word maps to id 0 a message is printed and
    None is returned.
    """
    rows = []
    for token in word_list:
        token_id = vocab[token]
        # id 0 is treated as out-of-vocabulary: give up on the whole list
        if token_id == 0:
            print("Out of vocabulary word")
            return None
        rows.append(embeddings_norm[token_id])
    # stack the collected rows and average over the word axis
    return np.mean(np.array(rows), axis=0)
    

def cent_dist(sub, l1, l2):
    """
    Print the Euclidean distance between the centroids of two word groups.

    Parameters
    ----------
    sub : str
        Run name passed to get_embeddings_and_vocab (the checkpoint is loaded
        on every call).
    l1, l2 : iterable of str
        The two word groups; each is reduced to a centroid with get_cent.

    Returns
    -------
    None. The distance is printed. If get_cent returns None for either group
    (it prints its own message) nothing more is printed; the previous code
    went on to subtract the centroids, which raised a TypeError.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    l1_cent = get_cent(l1, embeddings_norm, vocab)
    l2_cent = get_cent(l2, embeddings_norm, vocab)
    if l1_cent is None or l2_cent is None:
        return
    print(np.linalg.norm(l1_cent - l2_cent))

In [90]:
# politics: distance between the centroid of the 100 nearest neighbours of
# 'politics' and the centroid of the 100 nearest neighbours of 'sports'.
# NOTE(review): cent_dist loads the same checkpoint again internally, which is
# why the vocab size is printed twice.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('politics_1000_epochs', politics_words, sports_words)

vocab size: 10715
vocab size: 10715
0.23763177


In [91]:
# the_donald: centroid distance between the 'politics' and 'sports' neighbourhoods.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('the_donald_1000_epochs', politics_words, sports_words)

vocab size: 10378
vocab size: 10378
0.262931


In [92]:
# conservative: centroid distance between the 'politics' and 'sports' neighbourhoods.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('conservative_1000_epochs', politics_words, sports_words)

vocab size: 9061
vocab size: 9061
0.24582419
