In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import re

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

import seaborn as sns
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from datasets import Dataset
from tqdm.auto import tqdm

from sklearn.cluster import KMeans
import itertools

from collections import deque
from itertools import permutations
import statistics

In [2]:
# Embedding hyper-parameters consumed by CBOW_Model.
EMBED_DIMENSION = 300  # used as embedding_dim and as the linear layer's in_features
EMBED_MAX_NORM = 1  # used as nn.Embedding's max_norm

# Prefix that get_embeddings_and_vocab concatenates with "<sub_name>_vocab.pt" and
# "<sub_name>_model.pt"; the trailing slash matters because the path is built by
# plain string concatenation.
# NOTE(review): absolute, user-specific path — the notebook only runs where this layout exists.
data_dir = "/users/ujan/sports-language-in-politics/models/cbow/"

In [3]:
# Choose the torch device string: MPS when available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
class CBOW_Model(nn.Module):
    """
    Continuous bag-of-words network: an embedding table followed by a linear
    layer whose output width equals the vocabulary size.
    Paper: https://arxiv.org/abs/1301.3781
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        # Layer sizes come from the module-level EMBED_* constants.
        self.embeddings = nn.Embedding(vocab_size, EMBED_DIMENSION, max_norm=EMBED_MAX_NORM)
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs):
        """Embed `inputs`, average over axis 1, and return the linear layer's output."""
        averaged = self.embeddings(inputs).mean(axis=1)
        return self.linear(averaged)

In [5]:
def get_embeddings_and_vocab(sub_name):
    """
    Load a saved vocabulary and model and return row-normalised embeddings.

    Reads `data_dir + sub_name + "_vocab.pt"` and `data_dir + sub_name + "_model.pt"`
    with `torch.load`, and prints the vocabulary size.

    Args:
        sub_name: checkpoint prefix, e.g. 'politics_1000_epochs'.

    Returns:
        (embeddings_norm, vocab): `embeddings_norm` is the model's first parameter
        as a numpy array with every row divided by its L2 norm; `vocab` is the
        loaded vocabulary object.
    """
    # NOTE(review): torch.load unpickles arbitrary objects — only use on trusted files.
    vocab = torch.load(data_dir + sub_name + "_vocab.pt")
    print('vocab size: {}'.format(len(vocab.get_itos())))

    # The object returned by torch.load is used directly as the model, so no
    # CBOW_Model needs to be constructed first; doing so only allocated two
    # vocab-sized weight matrices that were discarded on the next line.
    model = torch.load(data_dir + sub_name + "_model.pt", map_location=device)

    # embedding from first model layer
    embeddings = next(iter(model.parameters())).cpu().detach().numpy()

    # normalization: divide each row by its Euclidean length
    norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** (1 / 2)
    embeddings_norm = embeddings / norms

    return embeddings_norm, vocab


def get_top_similar(word: str, embeddings_norm, vocab, topN: int=5):
    """
    Return the `topN` tokens whose embeddings score highest against `word`.

    Scores are dot products between rows of `embeddings_norm`.

    Args:
        word: query token.
        embeddings_norm: 2-D array indexed by vocabulary id.
        vocab: object supporting `vocab[token] -> id` and `lookup_token(id) -> token`.
        topN: number of neighbours to return.

    Returns:
        dict mapping neighbour token -> score, most similar first, or None
        (after printing a message) when `word` maps to id 0.
    """
    word_id = vocab[word]
    # id 0 is treated as the out-of-vocabulary slot.
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    # Remove the query by id instead of assuming it sorts first: a token with an
    # identical embedding ties with it, and slicing off position 0 could then
    # discard the twin and return the query itself. The stable sort makes the
    # order of tied scores deterministic.
    ranked_ids = np.argsort(-dists, kind="stable")
    topN_ids = ranked_ids[ranked_ids != word_id][:topN]

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab.lookup_token(int(sim_word_id))
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict


def get_sim(word1: str, word2: str, embeddings_norm, vocab):
    """
    Return the dot product of the embeddings of two words.

    Args:
        word1, word2: tokens to compare.
        embeddings_norm: 2-D array indexed by vocabulary id.
        vocab: vocabulary supporting `token in vocab` and `vocab[token] -> id`.

    Returns:
        The dot product of the two rows, or None (after printing a message)
        when either word is absent from the vocabulary or maps to id 0.
    """
    # Membership goes through the vocabulary's own lookup; the previous version
    # rebuilt the full token list with get_itos() twice per call and scanned it
    # linearly, which dominated the cost of the pairwise loops that call this.
    if word1 not in vocab or word2 not in vocab:
        print('not in vocab')
        return

    word1_id = vocab[word1]
    word2_id = vocab[word2]
    # id 0 is treated as the out-of-vocabulary slot.
    if word1_id == 0 or word2_id == 0:
        print("Out of vocabulary word")
        return

    return np.dot(embeddings_norm[word1_id], embeddings_norm[word2_id])


def get_analogy(a,b,c, embeddings_norm, vocab, topk=5):
    """
    Answer the analogy "a is to b as c is to ?".

    Every vocabulary token other than a, b and c is scored by the dot product
    of its embedding with `b - a + c`.

    Args:
        a, b, c: analogy tokens.
        embeddings_norm: 2-D array indexed by vocabulary id.
        vocab: vocabulary supporting `get_itos()` and `vocab[token] -> id`.
        topk: number of candidates to return.

    Returns:
        dict token -> score holding the `topk` best candidates, best first, or
        None (after printing a message) when an input is missing or maps to id 0.
    """
    vocab_list = vocab.get_itos()
    known = set(vocab_list)  # constant-time membership instead of three list scans
    if a not in known or b not in known or c not in known:
        print('not in vocab')
        return
    a_id, b_id, c_id = vocab[a], vocab[b], vocab[c]
    if a_id == 0 or b_id == 0 or c_id == 0:
        print("Out of vocabulary word")
        return

    query = embeddings_norm[b_id] - embeddings_norm[a_id] + embeddings_norm[c_id]

    # One matrix-vector product replaces the per-token Python loop.
    # Assumes vocab[vocab_list[i]] == i, i.e. a token's id is its position in
    # get_itos() — TODO confirm for non-torchtext vocab objects.
    n_tokens = len(vocab_list)
    scores = np.matmul(embeddings_norm[:n_tokens], query)

    # Candidates are all ids except the three inputs.
    # NOTE(review): id 0 (treated as OOV above) remains a candidate, as before — confirm intended.
    candidate_ids = np.setdiff1d(np.arange(n_tokens), [a_id, b_id, c_id])

    # Stable sort: among equal scores the earlier vocabulary entry wins, matching
    # the strict ">" replacement rule of the previous loop.
    order = np.argsort(-scores[candidate_ids], kind="stable")[:topk]
    return {vocab_list[i]: scores[i] for i in candidate_ids[order]}


def get_outlier(word_list, embeddings_norm, vocab):
    """
    Print the word in `word_list` that is least similar, on average, to the rest.

    For each word, the dot products with every other listed word are summed and
    divided by `len(word_list) - 1`. The word with the smallest average is
    printed together with that average. If any word is absent from
    `vocab.get_itos()`, 'not in vocab' is printed instead. Nothing is returned.
    """
    known_tokens = vocab.get_itos()
    if any(w not in known_tokens for w in word_list):
        print('not in vocab')
        return

    # Accumulate each word's total similarity over all unordered pairs.
    totals = dict.fromkeys(word_list, 0)
    for left, right in itertools.combinations(word_list, 2):
        pair_sim = np.dot(embeddings_norm[vocab[left]], embeddings_norm[vocab[right]])
        totals[left] += pair_sim
        totals[right] += pair_sim

    others = len(word_list) - 1
    averages = {w: total / others for w, total in totals.items()}

    # Linear scan for the minimum; 2 is the starting bound, so a word is only
    # reported when its average is strictly below 2.
    outlier, lowest = None, 2
    for candidate, avg in averages.items():
        if avg < lowest:
            outlier, lowest = candidate, avg
    print(outlier, lowest)

#### Similarities

##### the_donald

In [7]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('politics', embeddings_norm, vocab)

vocab size: 10378


{'hearings': 0.22003981,
 'leaning': 0.2085969,
 'sweden': 0.20754245,
 'reddits': 0.20666334,
 'pen': 0.20566946}

In [20]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('voters', embeddings_norm, vocab)

vocab size: 10378


{'dems': 0.27929482,
 'vote': 0.2669506,
 'demographics': 0.26626676,
 'voting': 0.26432395,
 'republicans': 0.26054394}

In [8]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('woman', embeddings_norm, vocab)

vocab size: 10378


{'girl': 0.3978128,
 'person': 0.33237395,
 'women': 0.3177452,
 'guy': 0.25549906,
 'husband': 0.2481705}

In [10]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('trump', embeddings_norm, vocab)

vocab size: 10378


{'obama': 0.33401677,
 'hillary': 0.33117822,
 'trumps': 0.3227671,
 'geotus': 0.30152357,
 'potus': 0.2929576}

##### conservative

In [12]:
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 9061


{'good': 0.34852636,
 'terrible': 0.3032945,
 'shitty': 0.29318285,
 'horrible': 0.27719852,
 'insignificant': 0.24304685}

In [13]:
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('biden', embeddings_norm, vocab)

vocab size: 9061


{'trump': 0.32836217,
 'bidens': 0.31359124,
 'obama': 0.28926462,
 'sleepy': 0.28519666,
 'pence': 0.2774164}

In [9]:
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('captain', embeddings_norm, vocab)

vocab size: 9061


{'prosecuted': 0.24044552,
 'directors': 0.22615686,
 'debatable': 0.22474325,
 'classified': 0.22291517,
 'comey': 0.22155952}

In [56]:
# NOTE(review): the trailing comment says "sample 100000" but the checkpoint name
# says 35k samples — the comment looks copy-pasted from the cells above; confirm.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2019_10_35k_samples') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 5812


{'dumb': 0.24256499,
 'horrible': 0.23618743,
 'great': 0.22729284,
 'bust': 0.21777347,
 'terrible': 0.21155575}

##### politics

In [11]:
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 10715


{'good': 0.4490177,
 'terrible': 0.36272916,
 'awful': 0.2903085,
 'bug': 0.25489658,
 'smart': 0.25289032}

In [12]:
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('biden', embeddings_norm, vocab)

vocab size: 10715


{'bidens': 0.37397128,
 'bernie': 0.3396699,
 'hillary': 0.3112797,
 'sanders': 0.26909018,
 'trump': 0.26213026}

In [13]:
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('captain', embeddings_norm, vocab)

vocab size: 10715


{'respectable': 0.2088904,
 'utility': 0.20452517,
 'finals': 0.20363128,
 'dogshit': 0.20302075,
 'bruins': 0.20159477}

In [140]:
# NOTE(review): the trailing comment says "sample 100000" but the checkpoint name
# says 35k samples — the comment looks copy-pasted from the cells above; confirm.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_2015_10_35k_samples') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 5843


{'tough': 0.24751696,
 'good': 0.23711401,
 'terrible': 0.21078521,
 'naive': 0.2104066,
 'rough': 0.20117292}

In [32]:
embeddings_norm, vocab = get_embeddings_and_vocab('politics_2015_9_12_100k_samples') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 9269


{'good': 0.3681072,
 'terrible': 0.3150045,
 'shitty': 0.26269546,
 'crooked': 0.2503178,
 'useful': 0.23909773}

#### Analogies

##### the_donald

In [16]:
# Eight analogy probes against the the_donald checkpoint. Only the value of the
# final expression is displayed by the notebook.
# NOTE(review): the recorded output below also shows one "token score" line per
# call; the print that would produce those is commented out in get_analogy, so
# re-running this cell will not reproduce them.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('politics', 'president', 'sports', embeddings_norm, vocab)
get_analogy('biden', 'democrat', 'trump', embeddings_norm, vocab)
get_analogy('trump', 'republican', 'biden', embeddings_norm, vocab)
get_analogy('election', 'politician', 'cup', embeddings_norm, vocab)
get_analogy('fans', 'tournament', 'voters', embeddings_norm, vocab)
get_analogy('voters', 'election', 'fans', embeddings_norm, vocab)
get_analogy('democrats', 'election', 'team', embeddings_norm, vocab)
get_analogy('republicans', 'election', 'team', embeddings_norm, vocab)

vocab size: 10378
scout 0.4258247
landslide 0.43094638
lottery 0.3903635
pitcher 0.42554766
polls 0.38245848
fan 0.4239778
game 0.4688605
roster 0.48409295


{'teams': 0.43678302,
 'roster': 0.48409295,
 'franchise': 0.42832777,
 'championship': 0.42237678,
 'squad': 0.4126062}

##### conservative

In [176]:
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
get_analogy('politics', 'president', 'sports', embeddings_norm, vocab)
get_analogy('biden', 'democrat', 'trump', embeddings_norm, vocab)
get_analogy('trump', 'republican', 'biden', embeddings_norm, vocab)
get_analogy('election', 'politician', 'cup', embeddings_norm, vocab)
get_analogy('fans', 'tournament', 'voters', embeddings_norm, vocab)
get_analogy('voters', 'election', 'fans', embeddings_norm, vocab)
get_analogy('democrats', 'election', 'team', embeddings_norm, vocab)
get_analogy('republicans', 'election', 'team', embeddings_norm, vocab)

vocab size: 9061
athlete 0.36311758
{'player': 0.34608072, 'potus': 0.3311136, 'athlete': 0.36311758, 'ucl': 0.3347901, 'umbrella': 0.32594782}

republican 0.41996822
{'conservative': 0.38603973, 'republican': 0.41996822, 'nyc': 0.32983807, 'quiet': 0.31968936, 'counsel': 0.31419033}

gop 0.34579286
{'gone': 0.30106226, 'gop': 0.34579286, 'democratic': 0.3361963, 'pence': 0.33597142, 'master': 0.29881886}

serie 0.41513517
{'teacher': 0.35261863, 'quarterback': 0.3486863, 'striker': 0.39161766, 'edmonton': 0.35449815, 'serie': 0.41513517}

votes 0.39529595
{'vote': 0.3623533, 'votes': 0.39529595, 'voter': 0.3764603, 'nomination': 0.35263157, 'rigging': 0.37245345}

fan 0.4524206
{'fan': 0.4524206, 'series': 0.36706093, 'cup': 0.362842, 'announce': 0.35837916, 'desperation': 0.3587853}

roster 0.51683754
{'game': 0.43009406, 'player': 0.42423913, 'teams': 0.45016965, 'roster': 0.51683754, 'superbowl': 0.45683464}

roster 0.54680926
{'game': 0.46875653, 'player': 0.43579563, 'roster': 0.

##### politics

In [177]:
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
get_analogy('politics', 'president', 'sports', embeddings_norm, vocab)
get_analogy('biden', 'democrat', 'trump', embeddings_norm, vocab)
get_analogy('trump', 'republican', 'biden', embeddings_norm, vocab)
get_analogy('election', 'politician', 'cup', embeddings_norm, vocab)
get_analogy('fans', 'tournament', 'voters', embeddings_norm, vocab)
get_analogy('voters', 'election', 'fans', embeddings_norm, vocab)
get_analogy('democrats', 'election', 'team', embeddings_norm, vocab)
get_analogy('republicans', 'election', 'team', embeddings_norm, vocab)

vocab size: 10715
decade 0.35326236
{'candidate': 0.34323546, 'decade': 0.35326236, 'threshold': 0.3335488, 'coached': 0.31434968, 'zach': 0.32353944}

republican 0.40329498
{'republican': 0.40329498, 'politician': 0.39520502, 'reaction': 0.32067028, 'tool': 0.31063023, 'buffoon': 0.34742776}

gop 0.43358126
{'gop': 0.43358126, 'dem': 0.40755415, 'impeach': 0.36443967, 'bidens': 0.35762668, 'burrow': 0.35972166}

striker 0.41592416
{'player': 0.37873068, 'billionaire': 0.37589896, 'striker': 0.41592416, 'winger': 0.39610106, 'bless': 0.3746383}

presidency 0.3356225
{'vote': 0.33554313, 'election': 0.32857814, 'gop': 0.33106822, 'presidency': 0.3356225, 'camps': 0.31006607}

fan 0.4739089
{'game': 0.3628464, 'fan': 0.4739089, 'sub': 0.4320428, 'elections': 0.4158188, 'excuse': 0.35385194}

season 0.47238484
{'game': 0.40681168, 'season': 0.47238484, 'teams': 0.4163164, 'roster': 0.40146235, 'tournament': 0.40953067}

season 0.44921964
{'game': 0.4001895, 'season': 0.44921964, 'election

##### comparison with 'true' words

In [88]:
# Analogy probes for compare_4th. Each inputsN is the (a, b, c) triple handed to
# get_analogy ("a is to b as c is to ?"); the matching outputsN lists the
# hand-picked reference answers that the returned tokens are compared against.
inputs1 = ['politics', 'president', 'sports']
outputs1 = ['player', 'captain', 'star', 'coach', 'striker']

inputs2 = ['election', 'politician', 'cup']
outputs2 = ['player', 'captain', 'star', 'coach', 'striker']

inputs3 = ['fans', 'tournament', 'voters']
outputs3 = ['election']

inputs4 = ['voters', 'election', 'fans']
outputs4 = ['tournament', 'cup', 'race']

inputs5 = ['democrats', 'election', 'team']
outputs5 = ['tournament', 'cup', 'race']

inputs6 = ['republicans', 'election', 'team']
outputs6 = ['tournament', 'cup', 'race']

def compare_4th(inputs, outputs, embeddings_norm, vocab):
    """
    Print how similar get_analogy's top-5 answers are to a set of reference answers.

    `inputs` holds the (a, b, c) triple passed to get_analogy. Every word in
    `outputs` is paired with every returned token, get_sim is evaluated for
    each pair, and the mean of those values is printed.
    """
    a, b, c = inputs[0], inputs[1], inputs[2]
    predicted = list(get_analogy(a, b, c, embeddings_norm, vocab, topk=5).keys())
    pair_sims = [
        get_sim(expected, guess, embeddings_norm, vocab)
        for expected in outputs
        for guess in predicted
    ]
    print(statistics.mean(pair_sims))

In [26]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
compare_4th(inputs1, outputs1, embeddings_norm, vocab)
compare_4th(inputs2, outputs2, embeddings_norm, vocab)
compare_4th(inputs3, outputs3, embeddings_norm, vocab)
compare_4th(inputs4, outputs4, embeddings_norm, vocab)
compare_4th(inputs5, outputs5, embeddings_norm, vocab)
compare_4th(inputs6, outputs6, embeddings_norm, vocab)

vocab size: 10378
scout 0.4258247
0.05329864
pitcher 0.42554766
0.12283832
polls 0.38245848
0.17210127
fan 0.4239778
0.050832666
game 0.4688605
0.16184813
roster 0.48409295
0.0924477


In [25]:
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
compare_4th(inputs1, outputs1, embeddings_norm, vocab)
compare_4th(inputs2, outputs2, embeddings_norm, vocab)
compare_4th(inputs3, outputs3, embeddings_norm, vocab)
compare_4th(inputs4, outputs4, embeddings_norm, vocab)
compare_4th(inputs5, outputs5, embeddings_norm, vocab)
compare_4th(inputs6, outputs6, embeddings_norm, vocab)

vocab size: 10715
decade 0.35326236
0.065354764
striker 0.41592416
0.1924319
presidency 0.3356225
0.34701788
fan 0.4739089
0.09828656
season 0.47238484
0.18816395
season 0.44921964
0.11644116


In [14]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('politics', 'president', 'sports', embeddings_norm, vocab)
get_analogy('biden', 'democrat', 'trump', embeddings_norm, vocab)
get_analogy('trump', 'republican', 'biden', embeddings_norm, vocab)
get_analogy('election', 'politician', 'cup', embeddings_norm, vocab)
get_analogy('fans', 'tournament', 'voters', embeddings_norm, vocab)
get_analogy('voters', 'election', 'fans', embeddings_norm, vocab)
get_analogy('democrats', 'election', 'team', embeddings_norm, vocab)
get_analogy('republicans', 'election', 'team', embeddings_norm, vocab)

vocab size: 10378
scout 0.4258247
{'pitch': 0.3304557, 'bulls': 0.3169939, 'scout': 0.4258247, 'gaining': 0.31374907, 'household': 0.3362836}

landslide 0.43094638
{'liberals': 0.34583837, 'illegals': 0.34671915, 'dem': 0.40632367, 'landslide': 0.43094638, 'electors': 0.36139715}

lottery 0.3903635
{'lottery': 0.3903635, 'corpse': 0.35305783, 'rig': 0.33654293, 'sympathetic': 0.36366728, 'nutshell': 0.3700594}

pitcher 0.42554766
{'pitcher': 0.42554766, 'contender': 0.35029933, 'cups': 0.3452366, 'commentator': 0.4178512, 'accent': 0.37176338}

polls 0.38245848
{'party': 0.3540228, 'voting': 0.37569758, 'polls': 0.38245848, 'award': 0.37299412, 'debates': 0.3624028}

fan 0.4239778
{'game': 0.41483322, 'fan': 0.4239778, 'coach': 0.3303164, 'fanbase': 0.35963717, 'scientist': 0.3560795}

game 0.4688605
{'game': 0.4688605, 'teams': 0.44293118, 'elections': 0.43620184, 'receiver': 0.4156961, 'tournament': 0.4615939}

roster 0.48409295
{'teams': 0.43678302, 'roster': 0.48409295, 'franchise'

#### Clustering

In [49]:
# Hand-picked word groups, one list per intended concept.
# NOTE(review): the clustering cells visible below probe words one at a time and
# never reference these lists — confirm whether they are still needed.
concept1 = ['red', 'blue', 'green', 'black']
concept2 = ['trump', 'biden', 'president', 'election']
concept3 = ['sports', 'game', 'play', 'score']
concept4 = ['good', 'bad', 'terrible', 'great']
concept5 = ['car', 'bus', 'train']

In [58]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [59]:
# Fit k-means with 4 clusters on every row of embeddings_norm (fixed random_state).
# NOTE(review): five concept lists are defined earlier in this section but
# n_clusters is 4 — confirm which is intended.
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(embeddings_norm)

In [62]:
vec = embeddings_norm[vocab['red']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [63]:
vec = embeddings_norm[vocab['blue']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [64]:
vec = embeddings_norm[vocab['trump']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [67]:
vec = embeddings_norm[vocab['biden']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [60]:
vec = embeddings_norm[vocab['sports']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [65]:
vec = embeddings_norm[vocab['bad']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [66]:
vec = embeddings_norm[vocab['good']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [69]:
vec = embeddings_norm[vocab['car']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [70]:
vec = embeddings_norm[vocab['bus']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [71]:
vec = embeddings_norm[vocab['train']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

#### Outlier detection

In [106]:
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [99]:
get_outlier(['apple', 'orange', 'bus'], embeddings_norm, vocab)

bus 0.021671796683222055


In [100]:
get_outlier(['sports', 'game', 'play', 'president'], embeddings_norm, vocab)

president -0.03133737722722193


In [56]:
get_outlier(['red', 'france', 'green', 'blue'], embeddings_norm, vocab)

france -0.006834672763943672


In [57]:
get_outlier(['trump', 'biden', 'president', 'election', 'apple'], embeddings_norm, vocab)

apple 0.008264641437563114


#### Aggregate evaluation

In [6]:
# Word lists for the two domains; avg_sim and cent_dist are called with these.
# sensitive to choice of words
# how to choose words?

politics_words = ['voters', 'election', 'president', 'democrats', 'republicans', 'politicians', 'media']
sports_words = ['fans', 'race', 'captain', 'team', 'players', 'cup', 'trophy']

##### average pairwise similarity between groups

In [7]:
def avg_sim(sub, l1, l2):
    """
    Print the mean similarity over all cross pairs of two word lists.

    Loads the embeddings for checkpoint `sub`, evaluates get_sim for every
    (word from l1, word from l2) combination, and prints the mean.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    cross_sims = [
        get_sim(first, second, embeddings_norm, vocab)
        for first in l1
        for second in l2
    ]
    print(statistics.mean(cross_sims))

In [8]:
#embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('the_donald_1000_epochs', politics_words, sports_words)

vocab size: 10378
0.03423778


In [9]:
#embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('conservative_1000_epochs', politics_words, sports_words)

vocab size: 9061
0.032325916


In [10]:
#embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('politics_1000_epochs', politics_words, sports_words)

vocab size: 10715
0.02686279


##### average target pair similarity

In [11]:
# Target pairs for avg_map_sim: each key is compared (via get_sim) with every
# word in its list, and all resulting similarities are averaged together.
word_map = {'voters': ['fan', 'fans'],
            'election': ['race', 'cup', 'championship'],
            'trump': ['captain', 'coach'],
            'biden': ['captain', 'coach'], 
            'democrats': ['team', 'teams'],
            'republicans': ['team', 'teams'],
            'politicians': ['player', 'players'],
           }

In [12]:
def avg_map_sim(sub, d):
    """
    Print the mean similarity over all (key, listed word) pairs of a mapping.

    Loads the embeddings for checkpoint `sub`; `d` maps a word to a list of
    words it should be compared with through get_sim.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    mapped_sims = [
        get_sim(source, target, embeddings_norm, vocab)
        for source, targets in d.items()
        for target in targets
    ]
    print(statistics.mean(mapped_sims))

In [13]:
avg_map_sim('the_donald_1000_epochs', word_map)

vocab size: 10378
0.05728969


In [14]:
avg_map_sim('conservative_1000_epochs', word_map)

vocab size: 9061
0.044791397


In [15]:
avg_map_sim('politics_1000_epochs', word_map)

vocab size: 10715
0.050297394


##### centroid distance

In [16]:
def get_cent(word_list, embeddings_norm, vocab):
    """
    Return the centroid (element-wise mean) of the embeddings of `word_list`.

    Prints "Out of vocabulary word" and returns None as soon as a word maps to id 0.
    """
    rows = []
    for token in word_list:
        idx = vocab[token]
        if idx != 0:
            rows.append(embeddings_norm[idx])
            continue
        # id 0 is treated as the out-of-vocabulary slot: abort on the first one.
        print("Out of vocabulary word")
        return
    return np.mean(np.array(rows), axis=0)


def cent_dist(sub, l1, l2):
    """Print the norm of the difference between the get_cent centroids of `l1` and `l2` for checkpoint `sub`."""
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    first_cent, second_cent = (get_cent(words, embeddings_norm, vocab) for words in (l1, l2))
    print(np.linalg.norm(first_cent - second_cent))

In [17]:
#embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('the_donald_1000_epochs', politics_words, sports_words)

vocab size: 10378
0.6262743


In [18]:
#embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('conservative_1000_epochs', politics_words, sports_words)

vocab size: 9061
0.64171845


In [19]:
#embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('politics_1000_epochs', politics_words, sports_words)

vocab size: 10715
0.65180635


#### temporal embeddings

##### politics

In [20]:
# avg pairwise sim

avg_sim('politics_sep_dec_2015', politics_words, sports_words)
avg_sim('politics_sep_dec_2016', politics_words, sports_words)
avg_sim('politics_sep_dec_2018', politics_words, sports_words)
avg_sim('politics_sep_dec_2020', politics_words, sports_words)

vocab size: 6847
0.026348129
vocab size: 14251
0.033180367
vocab size: 14215
0.032419257
vocab size: 16682
0.038351674


In [117]:
# target pair sim

avg_map_sim('politics_sep_dec_2015', word_map)
avg_map_sim('politics_sep_dec_2016', word_map)
avg_map_sim('politics_sep_dec_2018', word_map)
avg_map_sim('politics_sep_dec_2020', word_map)

vocab size: 6847
0.0676941
vocab size: 14251
0.080515206
vocab size: 14215
0.067518756
vocab size: 16682
0.097725645


##### conservative oct

In [27]:
# NOTE(review): this cell re-binds politics_words, sports_words, word_map and the
# inputsN/outputsN probes that earlier cells already define. Relative to the
# earlier word_map, 'coach' is replaced by 'qb' and the 'biden' key by 'president'.
# sensitive to choice of words
# how to choose words?

politics_words = ['voters', 'election', 'president', 'democrats', 'republicans', 'politicians', 'media']
sports_words = ['fans', 'race', 'captain', 'team', 'players', 'cup', 'trophy',]

# do by specific analogies
word_map = {'voters': ['fan', 'fans'],
            'election': ['race', 'cup', 'championship'],
            'trump': ['captain', 'qb'],
            'president': ['captain', 'qb'], 
            'democrats': ['team', 'teams'],
            'republicans': ['team', 'teams'],
            'politicians': ['player', 'players'],
           }

# (a, b, c) analogy triples and their hand-picked reference answers for compare_4th.
inputs1 = ['politics', 'president', 'sports']
outputs1 = ['player', 'captain', 'star', 'coach', 'striker']

inputs2 = ['election', 'politician', 'cup']
outputs2 = ['player', 'captain', 'star', 'coach', 'striker']

inputs3 = ['fans', 'tournament', 'voters']
outputs3 = ['election']

inputs4 = ['voters', 'election', 'fans']
outputs4 = ['tournament', 'cup', 'race']

inputs5 = ['democrats', 'election', 'team']
outputs5 = ['tournament', 'cup', 'race']

inputs6 = ['republicans', 'election', 'team']
outputs6 = ['tournament', 'cup', 'race']

def compare_4th(inputs, outputs, embeddings_norm, vocab):
    """
    Return how similar get_analogy's top-5 answers are to a set of reference answers.

    `inputs` holds the (a, b, c) triple passed to get_analogy. Every word in
    `outputs` is paired with every returned token, get_sim is evaluated for
    each pair, and the mean of those values is returned.
    """
    a, b, c = inputs[0], inputs[1], inputs[2]
    predicted = list(get_analogy(a, b, c, embeddings_norm, vocab, topk=5).keys())
    pair_sims = [
        get_sim(expected, guess, embeddings_norm, vocab)
        for expected in outputs
        for guess in predicted
    ]
    return statistics.mean(pair_sims)

def avg_map_sim(sub, d):
    """
    Print per-key and overall mean similarity for a word mapping.

    Loads the embeddings for checkpoint `sub`. For every key of `d`, get_sim is
    evaluated against each word in its list and the key's mean is printed as
    'key:list = mean'; the mean over all pairs is printed last.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    overall = []
    for source, targets in d.items():
        per_source = [get_sim(source, target, embeddings_norm, vocab) for target in targets]
        overall.extend(per_source)
        print('{}:{} = {}'.format(source, targets, statistics.mean(per_source)))
    print(statistics.mean(overall))

In [28]:
# target pair sim

avg_map_sim('conservative_2015_10_35k_samples', word_map)
avg_map_sim('conservative_2016_10_35k_samples', word_map)
avg_map_sim('conservative_2017_10_35k_samples', word_map)
avg_map_sim('conservative_2018_10_35k_samples', word_map)
avg_map_sim('conservative_2019_10_35k_samples', word_map)
avg_map_sim('conservative_2020_10_35k_samples', word_map)
avg_map_sim('conservative_2021_10_35k_samples', word_map)

vocab size: 5831
voters:['fan', 'fans'] = 0.010612592101097107
election:['race', 'cup', 'championship'] = 0.07564423978328705
trump:['captain', 'qb'] = 0.015264201909303665
president:['captain', 'qb'] = 0.10416129976511002
democrats:['team', 'teams'] = 0.03371289372444153
republicans:['team', 'teams'] = 0.07419521361589432
politicians:['player', 'players'] = -0.016761720180511475
0.04462011
vocab size: 5719
voters:['fan', 'fans'] = 0.004446897655725479
election:['race', 'cup', 'championship'] = 0.0827854797244072
trump:['captain', 'qb'] = -0.005064288154244423
president:['captain', 'qb'] = 0.07635565847158432
democrats:['team', 'teams'] = 0.039617158472537994
republicans:['team', 'teams'] = 0.05513651669025421
politicians:['player', 'players'] = 0.07312791794538498
0.049039744
vocab size: 5787
voters:['fan', 'fans'] = 0.05614768713712692
election:['race', 'cup', 'championship'] = 0.08088939636945724
trump:['captain', 'qb'] = -0.00615290179848671
president:['captain', 'qb'] = 0.08496217

In [96]:
# analogies

# Score the six analogy probes on this checkpoint and average the results.
vals = []
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2015_10_35k_samples')
vals.extend(
    compare_4th(probe, expected, embeddings_norm, vocab)
    for probe, expected in (
        (inputs1, outputs1), (inputs2, outputs2), (inputs3, outputs3),
        (inputs4, outputs4), (inputs5, outputs5), (inputs6, outputs6),
    )
)

sum(vals)/len(vals)

vocab size: 5831


0.07033442457516988

In [97]:
# Score the six analogy probes on this checkpoint and average the results.
vals = []
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2016_10_35k_samples')
vals.extend(
    compare_4th(probe, expected, embeddings_norm, vocab)
    for probe, expected in (
        (inputs1, outputs1), (inputs2, outputs2), (inputs3, outputs3),
        (inputs4, outputs4), (inputs5, outputs5), (inputs6, outputs6),
    )
)

sum(vals)/len(vals)

vocab size: 5719


0.06581845801944534

In [98]:
# Score the six analogy probes on this checkpoint and average the results.
vals = []
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2017_10_35k_samples')
vals.extend(
    compare_4th(probe, expected, embeddings_norm, vocab)
    for probe, expected in (
        (inputs1, outputs1), (inputs2, outputs2), (inputs3, outputs3),
        (inputs4, outputs4), (inputs5, outputs5), (inputs6, outputs6),
    )
)

sum(vals)/len(vals)

vocab size: 5787


0.05818437319248915

In [99]:
# Score the six analogy probes on this checkpoint and average the results.
vals = []
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2018_10_35k_samples')
vals.extend(
    compare_4th(probe, expected, embeddings_norm, vocab)
    for probe, expected in (
        (inputs1, outputs1), (inputs2, outputs2), (inputs3, outputs3),
        (inputs4, outputs4), (inputs5, outputs5), (inputs6, outputs6),
    )
)

sum(vals)/len(vals)

vocab size: 5565


0.07641343896587689

In [100]:
# Score the six analogy probes on this checkpoint and average the results.
vals = []
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2019_10_35k_samples')
vals.extend(
    compare_4th(probe, expected, embeddings_norm, vocab)
    for probe, expected in (
        (inputs1, outputs1), (inputs2, outputs2), (inputs3, outputs3),
        (inputs4, outputs4), (inputs5, outputs5), (inputs6, outputs6),
    )
)

sum(vals)/len(vals)

vocab size: 5812


0.05028908187523484

In [101]:
# Score the six analogy probes on this checkpoint and average the results.
vals = []
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2020_10_35k_samples')
vals.extend(
    compare_4th(probe, expected, embeddings_norm, vocab)
    for probe, expected in (
        (inputs1, outputs1), (inputs2, outputs2), (inputs3, outputs3),
        (inputs4, outputs4), (inputs5, outputs5), (inputs6, outputs6),
    )
)

sum(vals)/len(vals)

vocab size: 5231


0.03772395228346189

In [102]:
# Score the six analogy probes on this checkpoint and average the results.
vals = []
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_2021_10_35k_samples')
vals.extend(
    compare_4th(probe, expected, embeddings_norm, vocab)
    for probe, expected in (
        (inputs1, outputs1), (inputs2, outputs2), (inputs3, outputs3),
        (inputs4, outputs4), (inputs5, outputs5), (inputs6, outputs6),
    )
)

sum(vals)/len(vals)

vocab size: 5172


0.04980272217653692

##### politics oct

In [21]:
# NOTE(review): this cell re-binds politics_words, sports_words, word_map and the
# inputsN/outputsN probes that earlier cells already define. In this word_map the
# 'trump' and 'president' entries are commented out.
# sensitive to choice of words
# how to choose words?

politics_words = ['voters', 'election', 'president', 'democrats', 'republicans', 'politicians', 'media']
sports_words = ['fans', 'race', 'captain', 'team', 'players', 'cup', 'trophy',]

word_map = {'voters': ['fan', 'fans'],
            'election': ['race', 'cup', 'championship'],
            #'trump': ['captain', 'qb'],
            #'president': ['captain', 'qb'], 
            'democrats': ['team', 'teams'],
            'republicans': ['team', 'teams'],
            'politicians': ['player', 'players'],
           }

# (a, b, c) analogy triples and their hand-picked reference answers for compare_4th.
inputs1 = ['politics', 'president', 'sports']
outputs1 = ['player', 'captain', 'star', 'coach', 'striker']

inputs2 = ['election', 'politician', 'cup']
outputs2 = ['player', 'captain', 'star', 'coach', 'striker']

inputs3 = ['fans', 'tournament', 'voters']
outputs3 = ['election']

inputs4 = ['voters', 'election', 'fans']
outputs4 = ['tournament', 'cup', 'race']

inputs5 = ['democrats', 'election', 'team']
outputs5 = ['tournament', 'cup', 'race']

inputs6 = ['republicans', 'election', 'team']
outputs6 = ['tournament', 'cup', 'race']

def compare_4th(inputs, outputs, embeddings_norm, vocab):
    """Score an analogy by how similar its results are to reference words.

    Calls ``get_analogy`` on the first three items of ``inputs`` with
    ``topk=5`` and takes the keys of the mapping it returns. Each reference
    word in ``outputs`` is then paired with each of those keys, ``get_sim``
    is evaluated on every (reference, result) pair, and the arithmetic mean
    of the values is returned.

    Args:
        inputs: indexable whose items 0, 1 and 2 are passed, in that order,
            to ``get_analogy``.
        outputs: iterable of reference words to compare the results against.
        embeddings_norm: forwarded unchanged to ``get_analogy`` and ``get_sim``.
        vocab: forwarded unchanged to ``get_analogy`` and ``get_sim``.

    Returns:
        ``statistics.mean`` of all pairwise ``get_sim`` values.

    Raises:
        statistics.StatisticsError: if there is nothing to average, i.e.
            ``outputs`` is empty or ``get_analogy`` returned no keys.
    """
    analogy_hits = get_analogy(
        inputs[0], inputs[1], inputs[2], embeddings_norm, vocab, topk=5
    )
    candidates = list(analogy_hits.keys())

    # Outer loop over reference words, inner loop over analogy results.
    pair_sims = [
        get_sim(reference, candidate, embeddings_norm, vocab)
        for reference in outputs
        for candidate in candidates
    ]
    return statistics.mean(pair_sims)

In [23]:
# Target-pair similarity for each October politics model, 2015 through 2021.

for run_name in (
    'politics_2015_10_35k_samples',
    'politics_2016_10_35k_samples',
    'politics_2017_10_35k_samples',
    'politics_2018_10_35k_samples',
    'politics_2019_10_35k_samples',
    'politics_2020_10_35k_samples',
    'politics_2021_10_35k_samples',
):
    avg_map_sim(run_name, word_map)

vocab size: 5843
0.070684746
vocab size: 5385
0.040772125
vocab size: 5595
0.034885883
vocab size: 5516
0.050763957
vocab size: 5451
0.057183586
vocab size: 5140
0.06570791
vocab size: 5439
0.05092806


In [33]:
# Target-pair similarity, one model per year (October samples, 2015-2021).

october_runs = [
    'politics_2015_10_35k_samples',
    'politics_2016_10_35k_samples',
    'politics_2017_10_35k_samples',
    'politics_2018_10_35k_samples',
    'politics_2019_10_35k_samples',
    'politics_2020_10_35k_samples',
    'politics_2021_10_35k_samples',
]
for sub_name in october_runs:
    avg_map_sim(sub_name, word_map)

vocab size: 5843
voters:['fan', 'fans'] = 0.045776404440402985
election:['race', 'cup', 'championship'] = 0.11056459695100784
trump:['captain', 'qb'] = -0.03271827846765518
president:['captain', 'qb'] = 0.042103979736566544
democrats:['team', 'teams'] = 0.07359660416841507
republicans:['team', 'teams'] = 0.057892344892024994
politicians:['player', 'players'] = 0.045653846114873886
0.053086907
vocab size: 5385
voters:['fan', 'fans'] = 0.051763538271188736
election:['race', 'cup', 'championship'] = 0.09460938721895218
trump:['captain', 'qb'] = -0.05960177630186081
president:['captain', 'qb'] = 0.047684039920568466
democrats:['team', 'teams'] = -0.01196194626390934
republicans:['team', 'teams'] = 0.004429662600159645
politicians:['player', 'players'] = 0.03810134530067444
0.028310526
vocab size: 5595
voters:['fan', 'fans'] = 0.05185724049806595
election:['race', 'cup', 'championship'] = 0.04281139373779297
trump:['captain', 'qb'] = -0.010958284139633179
president:['captain', 'qb'] = 0.033

##### Politics: September–December samples

In [31]:
# Target-pair similarity for the 100k-sample models; the 2017 entry is disabled.
for run_name in (
    'politics_2015_9_12_100k_samples',
    'politics_2016_9_12_100k_samples',
    #'politics_2017_10_100k_samples',
):
    avg_map_sim(run_name, word_map)

vocab size: 9269
voters:['fan', 'fans'] = 0.057058483362197876
election:['race', 'cup', 'championship'] = 0.154131218791008
trump:['captain', 'qb'] = 0.013377770781517029
president:['captain', 'qb'] = 0.09846999496221542
democrats:['team', 'teams'] = -0.0017117783427238464
republicans:['team', 'teams'] = 0.049411118030548096
politicians:['player', 'players'] = 0.15811745822429657
0.08078932
vocab size: 8817
voters:['fan', 'fans'] = 0.059039320796728134
election:['race', 'cup', 'championship'] = 0.11534001678228378
trump:['captain', 'qb'] = -0.04476126283407211
president:['captain', 'qb'] = 0.062024883925914764
democrats:['team', 'teams'] = 0.026732657104730606
republicans:['team', 'teams'] = 0.11528217047452927
politicians:['player', 'players'] = 0.09242717176675797
0.06450066
