In [1]:
import numpy as np
import pandas as pd
import torch
import sys
import re

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

import seaborn as sns
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from datasets import Dataset
from tqdm.auto import tqdm

from sklearn.cluster import KMeans
import itertools

from collections import deque
from itertools import permutations
import statistics

In [2]:
# Embedding hyper-parameters; CBOW_Model reads both when it is constructed.
EMBED_DIMENSION = 300  # width of each word vector
EMBED_MAX_NORM = 1  # handed to nn.Embedding as max_norm

# Directory holding the "<name>_vocab.pt" / "<name>_model.pt" files.
# Set the CBOW_DATA_DIR environment variable to run on another machine; the
# default is the original location. os.path.join(..., "") guarantees the
# trailing separator that the loader relies on when it concatenates paths.
data_dir = os.path.join(
    os.environ.get("CBOW_DATA_DIR", "/users/ujan/sports-language-in-politics/models/cbow/"),
    "",
)

In [3]:
# Choose the compute device: Apple MPS first, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
class CBOW_Model(nn.Module):
    """
    Continuous bag-of-words (CBOW) network, following
    https://arxiv.org/abs/1301.3781

    An embedding table followed by one linear layer that maps the averaged
    input embeddings to one score per vocabulary entry.
    """

    def __init__(self, vocab_size: int):
        """Build the embedding table and output projection for `vocab_size` tokens."""
        super().__init__()
        self.embeddings = nn.Embedding(
            vocab_size,
            EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs):
        """Embed `inputs`, average over axis 1, and project to vocabulary-sized scores."""
        averaged = self.embeddings(inputs).mean(axis=1)
        return self.linear(averaged)

In [5]:
def get_embeddings_and_vocab(sub_name):
    """
    Load a saved vocabulary and model and return row-normalised embeddings.

    Args:
        sub_name: file-name prefix; data_dir + sub_name + "_vocab.pt" and
            data_dir + sub_name + "_model.pt" are read with torch.load.

    Returns:
        (embeddings_norm, vocab): a NumPy array in which every row of the
        model's first parameter has been scaled to unit L2 norm (all-zero rows
        stay zero), and the loaded vocabulary object.

    Side effects:
        Prints the vocabulary size.
    """
    vocab = torch.load(data_dir + sub_name + "_vocab.pt")
    print('vocab size: {}'.format(len(vocab.get_itos())))

    # torch.load is expected to return the complete model object (its
    # .parameters() is used below), so no throw-away CBOW_Model is built first.
    # NOTE(review): torch.load unpickles arbitrary objects - only load trusted files.
    model = torch.load(data_dir + sub_name + "_model.pt", map_location=device)

    # First parameter of the model; for CBOW_Model this is the embedding table.
    embeddings = list(model.parameters())[0].cpu().detach().numpy()

    norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** (1 / 2)
    norms[norms == 0] = 1  # avoid 0/0 -> NaN for all-zero rows
    embeddings_norm = embeddings / norms

    return embeddings_norm, vocab


def get_top_similar(word: str, embeddings_norm, vocab, topN: int=5):
    """
    Return the `topN` tokens most similar to `word`.

    Similarity is the dot product between rows of `embeddings_norm` (cosine
    similarity when the rows are unit length).

    Args:
        word: query token.
        embeddings_norm: 2-D array indexed by vocabulary id.
        vocab: object supporting `vocab[token]` and `vocab.lookup_token(id)`.
        topN: number of neighbours to return.

    Returns:
        dict mapping token -> similarity, most similar first, never containing
        `word` itself; None (after printing a message) when `word` maps to id 0.
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    dists = np.matmul(embeddings_norm, embeddings_norm[word_id])

    # Remove the query by id instead of assuming it sorts first: with tied
    # scores (e.g. duplicate vectors) the query is not guaranteed to be at
    # position 0 of the ranking.
    ranked_ids = np.argsort(-dists)
    topN_ids = ranked_ids[ranked_ids != word_id][:topN]

    topN_dict = {}
    for sim_word_id in topN_ids:
        topN_dict[vocab.lookup_token(sim_word_id)] = dists[sim_word_id]
    return topN_dict


def get_sim(word1: str, word2: str, embeddings_norm, vocab):
    """
    Return the dot-product similarity between two words.

    Args:
        word1, word2: tokens to compare.
        embeddings_norm: 2-D array indexed by vocabulary id.
        vocab: object supporting `token in vocab` and `vocab[token]`.

    Returns:
        np.dot of the two embedding rows, or None (after printing a message)
        when either word is absent from the vocab or maps to id 0.
    """
    # Ask the vocab directly rather than materialising the full token list
    # with get_itos() and scanning it linearly on every call.
    if word1 not in vocab or word2 not in vocab:
        print('not in vocab')
        return

    word1_id = vocab[word1]
    word2_id = vocab[word2]
    if word1_id == 0 or word2_id == 0:
        print("Out of vocabulary word")
        return

    return np.dot(embeddings_norm[word1_id], embeddings_norm[word2_id])


def get_analogy(a,b,c, embeddings_norm, vocab):
    """
    Print the best completions of the analogy "a is to b as c is to ?".

    Every vocabulary token other than a, b and c is scored by its dot product
    with (b - a + c), computed on the rows of `embeddings_norm`.

    Args:
        a, b, c: analogy words.
        embeddings_norm: 2-D array indexed by vocabulary id.
        vocab: object supporting `get_itos()` and `vocab[token]`.

    Returns:
        None. Prints "<best token> <score>", then a dict of the five
        highest-scoring tokens (best first), then an empty line. Prints a
        message and returns early when any input word is unavailable.
    """
    vocab_list = vocab.get_itos()
    if a not in vocab_list or b not in vocab_list or c not in vocab_list:
        print('not in vocab')
        return
    a_id = vocab[a]
    b_id = vocab[b]
    c_id = vocab[c]
    if a_id == 0 or b_id == 0 or c_id == 0:
        print("Out of vocabulary word")
        return

    # The query vector does not depend on the candidate, so build it once and
    # score the whole vocabulary with a single matrix-vector product.
    query = embeddings_norm[b_id] - embeddings_norm[a_id] + embeddings_norm[c_id]
    scores = np.matmul(embeddings_norm[:len(vocab_list)], query)

    # Stable sort: among tied scores the lowest vocabulary id comes first.
    ranked_ids = np.argsort(-scores, kind="stable")
    ranked_ids = ranked_ids[~np.isin(ranked_ids, [a_id, b_id, c_id])]

    sim = -2
    target = None
    if len(ranked_ids) > 0 and scores[ranked_ids[0]] > sim:
        target = vocab_list[ranked_ids[0]]
        sim = scores[ranked_ids[0]]
    sim_dict = {vocab_list[i]: scores[i] for i in ranked_ids[:5]}

    print(target, sim)
    print(sim_dict)
    print('')


def get_outlier(word_list, embeddings_norm, vocab):
    """
    Print the word whose mean similarity to the other listed words is lowest.

    For every unordered pair of words the dot product of their embedding rows
    is added to both words' totals; each total is then divided by
    len(word_list) - 1. Prints "<word> <mean similarity>" and returns None.
    Prints 'not in vocab' and returns early if any word is missing from
    vocab.get_itos().
    """
    known_tokens = vocab.get_itos()
    if any(candidate not in known_tokens for candidate in word_list):
        print('not in vocab')
        return

    # Accumulate each pairwise similarity onto both members of the pair.
    totals = {candidate: 0 for candidate in word_list}
    for left, right in itertools.combinations(word_list, 2):
        pair_sim = np.dot(embeddings_norm[vocab[left]], embeddings_norm[vocab[right]])
        totals[left] += pair_sim
        totals[right] += pair_sim

    n_others = len(word_list) - 1
    averages = {candidate: total / n_others for candidate, total in totals.items()}

    # Keep the first word with the strictly smallest average.
    outlier, lowest = None, 2
    for candidate, average in averages.items():
        if average < lowest:
            outlier, lowest = candidate, average
    print(outlier, lowest)

#### Similarities

##### the_donald

In [81]:
# Nearest neighbours of 'politics' in the 'the_donald_1000_epochs' embeddings
# (get_top_similar defaults to the 5 highest dot products of normalised rows).
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('politics', embeddings_norm, vocab)

vocab size: 10378


{'hearings': 0.22003981,
 'leaning': 0.2085969,
 'sweden': 0.20754245,
 'reddits': 0.20666334,
 'pen': 0.20566946}

In [20]:
# Nearest neighbours of 'voters' in the 'the_donald_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('voters', embeddings_norm, vocab)

vocab size: 10378


{'dems': 0.27929482,
 'vote': 0.2669506,
 'demographics': 0.26626676,
 'voting': 0.26432395,
 'republicans': 0.26054394}

In [8]:
# Nearest neighbours of 'woman' in the 'the_donald_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('woman', embeddings_norm, vocab)

vocab size: 10378


{'girl': 0.3978128,
 'person': 0.33237395,
 'women': 0.3177452,
 'guy': 0.25549906,
 'husband': 0.2481705}

In [10]:
# Nearest neighbours of 'trump' in the 'the_donald_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('trump', embeddings_norm, vocab)

vocab size: 10378


{'obama': 0.33401677,
 'hillary': 0.33117822,
 'trumps': 0.3227671,
 'geotus': 0.30152357,
 'potus': 0.2929576}

##### conservative

In [12]:
# Nearest neighbours of 'bad' in the 'conservative_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 9061


{'good': 0.34852636,
 'terrible': 0.3032945,
 'shitty': 0.29318285,
 'horrible': 0.27719852,
 'insignificant': 0.24304685}

In [13]:
# Nearest neighbours of 'biden' in the 'conservative_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('biden', embeddings_norm, vocab)

vocab size: 9061


{'trump': 0.32836217,
 'bidens': 0.31359124,
 'obama': 0.28926462,
 'sleepy': 0.28519666,
 'pence': 0.2774164}

In [9]:
# Nearest neighbours of 'captain' in the 'conservative_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
get_top_similar('captain', embeddings_norm, vocab)

vocab size: 9061


{'prosecuted': 0.24044552,
 'directors': 0.22615686,
 'debatable': 0.22474325,
 'classified': 0.22291517,
 'comey': 0.22155952}

##### politics

In [11]:
# Nearest neighbours of 'bad' in the 'politics_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 10715


{'good': 0.4490177,
 'terrible': 0.36272916,
 'awful': 0.2903085,
 'bug': 0.25489658,
 'smart': 0.25289032}

In [12]:
# Nearest neighbours of 'biden' in the 'politics_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('biden', embeddings_norm, vocab)

vocab size: 10715


{'bidens': 0.37397128,
 'bernie': 0.3396699,
 'hillary': 0.3112797,
 'sanders': 0.26909018,
 'trump': 0.26213026}

In [13]:
# Nearest neighbours of 'captain' in the 'politics_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
get_top_similar('captain', embeddings_norm, vocab)

vocab size: 10715


{'respectable': 0.2088904,
 'utility': 0.20452517,
 'finals': 0.20363128,
 'dogshit': 0.20302075,
 'bruins': 0.20159477}

#### Analogies

##### the_donald

In [178]:
# Run the fixed set of (a, b, c) analogy probes on the 'the_donald_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
analogy_probes = [
    ('politics', 'president', 'sports'),
    ('biden', 'democrat', 'trump'),
    ('trump', 'republican', 'biden'),
    ('election', 'politician', 'cup'),
    ('fans', 'tournament', 'voters'),
    ('voters', 'election', 'fans'),
    ('democrats', 'election', 'team'),
    ('republicans', 'election', 'team'),
]
for probe in analogy_probes:
    get_analogy(*probe, embeddings_norm, vocab)

vocab size: 10378
scout 0.4258247
{'pitch': 0.3304557, 'bulls': 0.3169939, 'scout': 0.4258247, 'gaining': 0.31374907, 'household': 0.3362836}

fantasy 0.4194665
{'particular': 0.35942805, 'fantasy': 0.4194665, 'disney': 0.3151315, 'tor': 0.32250527, 'frequent': 0.33008647}

landslide 0.43094638
{'liberals': 0.34583837, 'illegals': 0.34671915, 'dem': 0.40632367, 'landslide': 0.43094638, 'electors': 0.36139715}

lottery 0.3903635
{'lottery': 0.3903635, 'corpse': 0.35305783, 'rig': 0.33654293, 'sympathetic': 0.36366728, 'nutshell': 0.3700594}

pitcher 0.42554766
{'pitcher': 0.42554766, 'contender': 0.35029933, 'cups': 0.3452366, 'commentator': 0.4178512, 'accent': 0.37176338}

polls 0.38245848
{'party': 0.3540228, 'voting': 0.37569758, 'polls': 0.38245848, 'award': 0.37299412, 'debates': 0.3624028}

fan 0.4239778
{'game': 0.41483322, 'fan': 0.4239778, 'coach': 0.3303164, 'fanbase': 0.35963717, 'scientist': 0.3560795}

game 0.4688605
{'game': 0.4688605, 'teams': 0.44293118, 'elections': 0.

##### conservative

In [176]:
# Run the fixed set of (a, b, c) analogy probes on the 'conservative_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
analogy_probes = [
    ('politics', 'president', 'sports'),
    ('biden', 'democrat', 'trump'),
    ('trump', 'republican', 'biden'),
    ('election', 'politician', 'cup'),
    ('fans', 'tournament', 'voters'),
    ('voters', 'election', 'fans'),
    ('democrats', 'election', 'team'),
    ('republicans', 'election', 'team'),
]
for probe in analogy_probes:
    get_analogy(*probe, embeddings_norm, vocab)

vocab size: 9061
athlete 0.36311758
{'player': 0.34608072, 'potus': 0.3311136, 'athlete': 0.36311758, 'ucl': 0.3347901, 'umbrella': 0.32594782}

republican 0.41996822
{'conservative': 0.38603973, 'republican': 0.41996822, 'nyc': 0.32983807, 'quiet': 0.31968936, 'counsel': 0.31419033}

gop 0.34579286
{'gone': 0.30106226, 'gop': 0.34579286, 'democratic': 0.3361963, 'pence': 0.33597142, 'master': 0.29881886}

serie 0.41513517
{'teacher': 0.35261863, 'quarterback': 0.3486863, 'striker': 0.39161766, 'edmonton': 0.35449815, 'serie': 0.41513517}

votes 0.39529595
{'vote': 0.3623533, 'votes': 0.39529595, 'voter': 0.3764603, 'nomination': 0.35263157, 'rigging': 0.37245345}

fan 0.4524206
{'fan': 0.4524206, 'series': 0.36706093, 'cup': 0.362842, 'announce': 0.35837916, 'desperation': 0.3587853}

roster 0.51683754
{'game': 0.43009406, 'player': 0.42423913, 'teams': 0.45016965, 'roster': 0.51683754, 'superbowl': 0.45683464}

roster 0.54680926
{'game': 0.46875653, 'player': 0.43579563, 'roster': 0.

##### politics

In [177]:
# Run the fixed set of (a, b, c) analogy probes on the 'politics_1000_epochs' embeddings.
embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
analogy_probes = [
    ('politics', 'president', 'sports'),
    ('biden', 'democrat', 'trump'),
    ('trump', 'republican', 'biden'),
    ('election', 'politician', 'cup'),
    ('fans', 'tournament', 'voters'),
    ('voters', 'election', 'fans'),
    ('democrats', 'election', 'team'),
    ('republicans', 'election', 'team'),
]
for probe in analogy_probes:
    get_analogy(*probe, embeddings_norm, vocab)

vocab size: 10715
decade 0.35326236
{'candidate': 0.34323546, 'decade': 0.35326236, 'threshold': 0.3335488, 'coached': 0.31434968, 'zach': 0.32353944}

republican 0.40329498
{'republican': 0.40329498, 'politician': 0.39520502, 'reaction': 0.32067028, 'tool': 0.31063023, 'buffoon': 0.34742776}

gop 0.43358126
{'gop': 0.43358126, 'dem': 0.40755415, 'impeach': 0.36443967, 'bidens': 0.35762668, 'burrow': 0.35972166}

striker 0.41592416
{'player': 0.37873068, 'billionaire': 0.37589896, 'striker': 0.41592416, 'winger': 0.39610106, 'bless': 0.3746383}

presidency 0.3356225
{'vote': 0.33554313, 'election': 0.32857814, 'gop': 0.33106822, 'presidency': 0.3356225, 'camps': 0.31006607}

fan 0.4739089
{'game': 0.3628464, 'fan': 0.4739089, 'sub': 0.4320428, 'elections': 0.4158188, 'excuse': 0.35385194}

season 0.47238484
{'game': 0.40681168, 'season': 0.47238484, 'teams': 0.4163164, 'roster': 0.40146235, 'tournament': 0.40953067}

season 0.44921964
{'game': 0.4001895, 'season': 0.44921964, 'election

In [None]:
# TODO / open questions:
# - non-gendered examples
# - dem subs
# - list of analogy words -> compare aggregate values, dem vs rep
# - similarity values -> compare dem vs rep
# - only [sport, sports] in wsd

#### Clustering

In [49]:
# Hand-picked word groups (colours, politics, sports, sentiment, vehicles).
# Several of these words are probed one by one against the KMeans model in the
# cells below; the lists themselves are not referenced in the visible cells.
concept1 = ['red', 'blue', 'green', 'black']
concept2 = ['trump', 'biden', 'president', 'election']
concept3 = ['sports', 'game', 'play', 'score']
concept4 = ['good', 'bad', 'terrible', 'great']
concept5 = ['car', 'bus', 'train']

In [58]:
# Embeddings and vocab used by the clustering cells that follow.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [59]:
# Fit 4 clusters on all normalised embedding rows; random_state is fixed at 0.
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(embeddings_norm)

In [62]:
# Cluster assigned to 'red'; reshape(1, -1) turns the row into a one-sample batch.
vec = embeddings_norm[vocab['red']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [63]:
# Cluster assigned to 'blue'.
vec = embeddings_norm[vocab['blue']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [64]:
# Cluster assigned to 'trump'.
vec = embeddings_norm[vocab['trump']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [67]:
# Cluster assigned to 'biden'.
vec = embeddings_norm[vocab['biden']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [60]:
# Cluster assigned to 'sports'.
vec = embeddings_norm[vocab['sports']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [65]:
# Cluster assigned to 'bad'.
vec = embeddings_norm[vocab['bad']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [66]:
# Cluster assigned to 'good'.
vec = embeddings_norm[vocab['good']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [69]:
# Cluster assigned to 'car'.
vec = embeddings_norm[vocab['car']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [70]:
# Cluster assigned to 'bus'.
vec = embeddings_norm[vocab['bus']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [71]:
# Cluster assigned to 'train'.
vec = embeddings_norm[vocab['train']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

#### Outlier detection

In [106]:
# Embeddings and vocab used by the outlier-detection cells that follow.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [99]:
# Prints the word with the lowest mean similarity to the others, and that mean.
get_outlier(['apple', 'orange', 'bus'], embeddings_norm, vocab)

bus 0.021671796683222055


In [100]:
# Three sports-related words plus 'president'.
get_outlier(['sports', 'game', 'play', 'president'], embeddings_norm, vocab)

president -0.03133737722722193


In [56]:
# Three colour words plus 'france'.
get_outlier(['red', 'france', 'green', 'blue'], embeddings_norm, vocab)

france -0.006834672763943672


In [57]:
# Four politics-related words plus 'apple'.
get_outlier(['trump', 'biden', 'president', 'election', 'apple'], embeddings_norm, vocab)

apple 0.008264641437563114


#### Aggregate evaluation

In [7]:
# Word lists compared by avg_sim and cent_dist in the cells below.
# NOTE: the aggregate metrics are sensitive to which words are chosen here;
# how to choose them in a principled way is still an open question.

politics_words = ['voters', 'election', 'president', 'democrats', 'republicans', 'politicians', 'media']
sports_words = ['fans', 'race', 'captain', 'team', 'players', 'cup', 'trophy']

##### average pairwise similarity between groups

In [8]:
def avg_sim(sub, l1, l2):
    """
    Print the mean similarity over every (word from l1, word from l2) pair.

    Args:
        sub: name passed to get_embeddings_and_vocab to load embeddings.
        l1, l2: lists of words; all len(l1) * len(l2) cross pairs are scored
            with get_sim.

    Returns:
        None. Prints statistics.mean of the pair similarities. Pairs for which
        get_sim returns None (a word is unavailable) are left out of the mean,
        because statistics.mean cannot average None; if no pair is usable a
        message is printed instead.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    sim_list = []
    for p in l1:
        for s in l2:
            sim = get_sim(p, s, embeddings_norm, vocab)
            if sim is not None:
                sim_list.append(sim)
    if not sim_list:
        print('no valid pairs')
        return
    print(statistics.mean(sim_list))

In [9]:
# Disabled alternative: derive both word lists from the top-100 neighbours of
# 'politics' / 'sports' in this embedding space instead of the hand-picked lists.
#embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('the_donald_1000_epochs', politics_words, sports_words)

vocab size: 10378
0.03423778


In [10]:
# Disabled alternative: derive both word lists from the top-100 neighbours of
# 'politics' / 'sports' in this embedding space instead of the hand-picked lists.
#embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs') # freq 20, sample 100000
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('conservative_1000_epochs', politics_words, sports_words)

vocab size: 9061
0.032325916


In [11]:
# Disabled alternative: derive both word lists from the top-100 neighbours of
# 'politics' / 'sports' in this embedding space instead of the hand-picked lists.
#embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs') # freq 20, sample 100000
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())

avg_sim('politics_1000_epochs', politics_words, sports_words)

vocab size: 10715
0.02686279


##### average target pair similarity

In [184]:
# Politics word -> sports word pairs scored by avg_map_sim.
# A dict holds one value per key, so only one 'election' mapping can be active;
# the alternatives are kept commented out.
word_map = {'voters': 'fans',
            'election': 'race',
            #'election': 'cup',
            #'election': 'trophy',
            'trump': 'captain',
            'biden': 'captain', 
            'democrats': 'team',
            'republicans': 'team',
            'politicians': 'players',
           }

In [12]:
def avg_map_sim(sub, d):
    """
    Print the mean similarity over the (key, value) word pairs of `d`.

    Args:
        sub: name passed to get_embeddings_and_vocab to load embeddings.
        d: dict mapping one word to the word it is compared with.

    Returns:
        None. Prints statistics.mean of the pair similarities. Pairs for which
        get_sim returns None (a word is unavailable) are left out of the mean,
        because statistics.mean cannot average None; if no pair is usable a
        message is printed instead.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    sim_list = []
    for key, val in d.items():
        sim = get_sim(key, val, embeddings_norm, vocab)
        if sim is not None:
            sim_list.append(sim)
    if not sim_list:
        print('no valid pairs')
        return
    print(statistics.mean(sim_list))

In [None]:
# Mean similarity of the word_map pairs in the 'the_donald_1000_epochs' embeddings.
avg_map_sim('the_donald_1000_epochs', word_map)

In [None]:
# Mean similarity of the word_map pairs in the 'conservative_1000_epochs' embeddings.
avg_map_sim('conservative_1000_epochs', word_map)

In [188]:
# Mean similarity of the word_map pairs in the 'politics_1000_epochs' embeddings.
avg_map_sim('politics_1000_epochs', word_map)

vocab size: 10715
0.08029996


##### centroid distance

In [16]:
def get_cent(word_list, embeddings_norm, vocab):
    """
    Return the centroid (element-wise mean) of the embedding rows of `word_list`.

    Words are looked up in order; as soon as one maps to id 0 a message is
    printed and None is returned.
    """
    rows = []
    for token in word_list:
        token_id = vocab[token]
        if token_id == 0:
            print("Out of vocabulary word")
            return
        rows.append(embeddings_norm[token_id])
    return np.mean(np.array(rows), axis=0)


def cent_dist(sub, l1, l2):
    """
    Print the Euclidean distance between the centroids of two word lists.

    Args:
        sub: name passed to get_embeddings_and_vocab to load embeddings.
        l1, l2: lists of words whose centroids are computed with get_cent.

    Returns:
        None. Prints np.linalg.norm of the centroid difference. When get_cent
        returns None for either list (it does so for an out-of-vocabulary
        word) a message is printed instead of failing on `None - array`.
    """
    embeddings_norm, vocab = get_embeddings_and_vocab(sub)
    l1_cent = get_cent(l1, embeddings_norm, vocab)
    l2_cent = get_cent(l2, embeddings_norm, vocab)
    if l1_cent is None or l2_cent is None:
        print('centroid unavailable')
        return
    print(np.linalg.norm(l1_cent - l2_cent))

In [111]:
# Disabled alternative: derive both word lists from the top-100 neighbours of
# 'politics' / 'sports' in this embedding space instead of the hand-picked lists.
#embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('the_donald_1000_epochs', politics_words, sports_words)

vocab size: 10378
0.6262743


In [112]:
# Disabled alternative: derive both word lists from the top-100 neighbours of
# 'politics' / 'sports' in this embedding space instead of the hand-picked lists.
#embeddings_norm, vocab = get_embeddings_and_vocab('conservative_1000_epochs')
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('conservative_1000_epochs', politics_words, sports_words)

vocab size: 9061
0.64171845


In [110]:
# Disabled alternative: derive both word lists from the top-100 neighbours of
# 'politics' / 'sports' in this embedding space instead of the hand-picked lists.
#embeddings_norm, vocab = get_embeddings_and_vocab('politics_1000_epochs')
#politics_words = list(get_top_similar('politics', embeddings_norm, vocab, topN=100).keys())
#sports_words = list(get_top_similar('sports', embeddings_norm, vocab, topN=100).keys())
cent_dist('politics_1000_epochs', politics_words, sports_words)

vocab size: 10715
0.65180635


#### Temporal embeddings

##### politics

In [17]:
# avg pairwise sim, one call per 'politics_sep_dec_<year>' snapshot

for snapshot in (
    'politics_sep_dec_2015',
    'politics_sep_dec_2016',
    'politics_sep_dec_2018',
    'politics_sep_dec_2020',
):
    avg_sim(snapshot, politics_words, sports_words)

vocab size: 6847
0.026348129
vocab size: 14251
0.033180367
vocab size: 14215
0.032419257
vocab size: 16682
0.038351674


In [None]:
# target pair sim
## wrong ##
# NOTE(review): the author flagged this cell as wrong and it has no recorded
# output; the reason is not stated here - TODO document it or remove the cell.
avg_map_sim('politics_sep_dec_2015', word_map)
avg_map_sim('politics_sep_dec_2016', word_map)
avg_map_sim('politics_sep_dec_2018', word_map)

In [18]:
# centroid distance, one call per 'politics_sep_dec_<year>' snapshot

for snapshot in (
    'politics_sep_dec_2015',
    'politics_sep_dec_2016',
    'politics_sep_dec_2018',
    'politics_sep_dec_2020',
):
    cent_dist(snapshot, politics_words, sports_words)

vocab size: 6847
0.6188757
vocab size: 14251
0.6433563
vocab size: 14215
0.648723
vocab size: 16682
0.62493944


In [172]:
# analogies: the fixed (a, b, c) probes on the 'politics_sep_dec_2015' embeddings

embeddings_norm, vocab = get_embeddings_and_vocab('politics_sep_dec_2015')
analogy_probes = [
    ('politics', 'president', 'sports'),
    ('biden', 'democrat', 'trump'),
    ('trump', 'republican', 'biden'),
    ('election', 'politician', 'cup'),
    ('fans', 'tournament', 'voters'),
    ('voters', 'election', 'fans'),
    ('democrats', 'election', 'team'),
    ('republicans', 'election', 'team'),
]
for probe in analogy_probes:
    get_analogy(*probe, embeddings_norm, vocab)

vocab size: 6847
star 0.3942173
{'star': 0.3942173, 'horrible': 0.28759834, 'host': 0.30244356, 'hc': 0.3155215, 'managers': 0.32538742}

republican 0.45222697
{'republican': 0.45222697, 'democratic': 0.32463315, 'families': 0.32871872, 'presidents': 0.3472922, 'pope': 0.34725454}

coalition 0.42763084
{'gop': 0.34682125, 'democrat': 0.35124308, 'remaining': 0.31504303, 'coalition': 0.42763084, 'council': 0.31250295}

permanently 0.37165612
{'player': 0.33940136, 'score': 0.31871927, 'dirty': 0.30277368, 'channel': 0.33898503, 'permanently': 0.37165612}

administration 0.36267522
{'biden': 0.32655776, 'administration': 0.36267522, 'electoral': 0.34558263, 'blog': 0.31259945, 'civilization': 0.34654787}

edge 0.3373633
{'subreddit': 0.32959747, 'edge': 0.3373633, 'offseason': 0.32533637, 'nets': 0.3231932, 'superbowl': 0.33314326}

game 0.51544607
{'game': 0.51544607, 'season': 0.4621518, 'roster': 0.3477487, 'scheme': 0.40618673, 'agent': 0.40981454}

game 0.44547305
{'game': 0.4454730

In [173]:
# analogies: the fixed (a, b, c) probes on the 'politics_sep_dec_2016' embeddings

embeddings_norm, vocab = get_embeddings_and_vocab('politics_sep_dec_2016')
analogy_probes = [
    ('politics', 'president', 'sports'),
    ('biden', 'democrat', 'trump'),
    ('trump', 'republican', 'biden'),
    ('election', 'politician', 'cup'),
    ('fans', 'tournament', 'voters'),
    ('voters', 'election', 'fans'),
    ('democrats', 'election', 'team'),
    ('republicans', 'election', 'team'),
]
for probe in analogy_probes:
    get_analogy(*probe, embeddings_norm, vocab)

vocab size: 14251
presidents 0.48879406
{'presidents': 0.48879406, 'governor': 0.3835699, 'potus': 0.42715505, 'legend': 0.39075565, 'athletes': 0.4321336}

republican 0.41744873
{'bernie': 0.35638192, 'republican': 0.41744873, 'millennial': 0.3637199, 'gallop': 0.36542752, 'monger': 0.36174294}

democrat 0.42793575
{'gop': 0.41351008, 'democratic': 0.40115157, 'democrat': 0.42793575, 'supporter': 0.3918796, 'dem': 0.40685156}

beater 0.5198802
{'player': 0.41866234, 'manager': 0.41692528, 'barcelona': 0.3780677, 'striker': 0.37592408, 'beater': 0.5198802}

final 0.41620874
{'vote': 0.36761516, 'votes': 0.35698056, 'final': 0.41620874, 'margin': 0.35829577, 'header': 0.35105205}

fan 0.5108575
{'game': 0.3837049, 'fan': 0.5108575, 'rivalry': 0.3858642, 'leicester': 0.38390812, 'flairs': 0.4287348}

championship 0.4812957
{'draft': 0.46907192, 'roster': 0.4434315, 'championship': 0.4812957, 'site': 0.45911494, 'offence': 0.47118452}

site 0.5098346
{'player': 0.45931816, 'sub': 0.454684

In [174]:
# analogies: the fixed (a, b, c) probes on the 'politics_sep_dec_2018' embeddings

embeddings_norm, vocab = get_embeddings_and_vocab('politics_sep_dec_2018')
analogy_probes = [
    ('politics', 'president', 'sports'),
    ('biden', 'democrat', 'trump'),
    ('trump', 'republican', 'biden'),
    ('election', 'politician', 'cup'),
    ('fans', 'tournament', 'voters'),
    ('voters', 'election', 'fans'),
    ('democrats', 'election', 'team'),
    ('republicans', 'election', 'team'),
]
for probe in analogy_probes:
    get_analogy(*probe, embeddings_norm, vocab)

vocab size: 14215
presidents 0.5019555
{'presidents': 0.5019555, 'potus': 0.36818692, 'writer': 0.39954984, 'hc': 0.34357688, 'viewer': 0.3484116}

republican 0.5534355
{'republican': 0.5534355, 'trumps': 0.38642782, 'conservative': 0.42181188, 'dem': 0.38724598, 'texan': 0.41954777}

democrat 0.47602326
{'democratic': 0.47310188, 'democrat': 0.47602326, '3rd': 0.38512272, 'dem': 0.43025726, 'supporter': 0.46940717}

athlete 0.39989597
{'person': 0.3839544, 'series': 0.38575244, 'leader': 0.39466912, 'championship': 0.39448422, 'athlete': 0.39989597}

vote 0.4847435
{'vote': 0.4847435, 'voting': 0.3383669, 'votes': 0.39002547, 'voter': 0.3343304, 'primaries': 0.4022371}

fan 0.5051737
{'game': 0.38119093, 'fan': 0.5051737, 'elections': 0.45777228, 'joining': 0.34786144, 'presser': 0.32704985}

game 0.5242634
{'game': 0.5242634, 'teams': 0.4663511, 'offense': 0.4370786, 'elections': 0.4353964, 'franchise': 0.42877886}

teams 0.50897205
{'game': 0.45571238, 'player': 0.45282543, 'teams':