In [73]:
import numpy as np
import pandas as pd
import torch
import sys
import re

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

import seaborn as sns
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from datasets import Dataset
from tqdm.auto import tqdm

from sklearn.cluster import KMeans
import itertools

In [10]:
# Width of each word vector (the `embedding_dim` of the CBOW embedding layer).
EMBED_DIMENSION = 300
# Passed as `max_norm` to the CBOW embedding layer.
EMBED_MAX_NORM = 1

# Directory holding the saved "<name>_vocab.pt" / "<name>_model.pt" files.
# Set the CBOW_DATA_DIR environment variable to point somewhere else; the
# default is the original location. os.path.join(..., "") guarantees a
# trailing separator because the loaders build paths by string concatenation.
data_dir = os.path.join(
    os.environ.get(
        "CBOW_DATA_DIR",
        "/users/ujan/sports-language-in-politics/models/cbow/",
    ),
    "",
)

In [11]:
# Choose the compute device in priority order: Apple MPS, then CUDA, then CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [12]:
class CBOW_Model(nn.Module):
    """
    Continuous bag-of-words (CBOW) network, following
    https://arxiv.org/abs/1301.3781

    An embedding table followed by one linear layer that maps the averaged
    embedding of the input tokens to one score per vocabulary entry.
    """

    def __init__(self, vocab_size: int):
        """Create the embedding table and output layer for `vocab_size` tokens."""
        super().__init__()
        # Attribute names and creation order are unchanged so that previously
        # saved models / parameter lists still line up with this class.
        self.embeddings = nn.Embedding(
            vocab_size,
            EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs):
        """
        Embed `inputs`, average over axis 1, and project to vocabulary scores.

        Assumes `inputs` is a (batch, context_size) tensor of token ids --
        TODO confirm against the training code.
        """
        averaged = self.embeddings(inputs).mean(axis=1)
        return self.linear(averaged)

In [98]:
def get_embeddings_and_vocab(sub_name):
    """
    Load a saved vocabulary and CBOW model and return unit-length embeddings.

    Reads `<data_dir><sub_name>_vocab.pt` and `<data_dir><sub_name>_model.pt`,
    prints the vocabulary size, and L2-normalises every row of the model's
    first parameter (the embedding matrix of `CBOW_Model`).

    Args:
        sub_name: file-name prefix of the saved vocab/model pair.

    Returns:
        (embeddings_norm, vocab): a 2-D numpy array with one row per embedding,
        and the loaded vocabulary object.
    """
    # SECURITY: torch.load unpickles arbitrary objects -- only load files you
    # trust. Both files hold whole pickled objects, so the classes they
    # reference (e.g. CBOW_Model) must already be defined when this runs.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # rejects fully pickled objects; weights_only=False may be needed there.
    vocab = torch.load(data_dir + sub_name + "_vocab.pt")
    print('vocab size: {}'.format(len(vocab.get_itos())))

    # The saved file already contains the complete module, so no placeholder
    # CBOW_Model needs to be constructed first.
    model = torch.load(data_dir + sub_name + "_model.pt", map_location=device)

    # First registered parameter == embedding weights for CBOW_Model.
    embeddings = next(model.parameters()).cpu().detach().numpy()

    # Row-wise L2 norm, kept as a column so the division broadcasts per row.
    norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** (1 / 2)
    # An all-zero row would otherwise turn into NaN; leave it as zeros instead.
    norms[norms == 0] = 1
    embeddings_norm = embeddings / norms

    return embeddings_norm, vocab


def get_top_similar(word: str, embeddings_norm, vocab, topN: int=5):
    """
    Return the `topN` tokens whose embeddings are most similar to `word`.

    Similarity is the dot product between rows of `embeddings_norm`, which is
    the cosine similarity when the rows are unit length.

    Args:
        word: query token.
        embeddings_norm: 2-D array, one (normalised) embedding per token id.
        vocab: object supporting `vocab[token] -> id` and `lookup_token(id)`.
        topN: number of neighbours to return.

    Returns:
        dict mapping neighbour token -> similarity, ordered from most to least
        similar; `None` (after printing a message) when `word` maps to id 0,
        which this notebook treats as the out-of-vocabulary id.
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = np.reshape(embeddings_norm[word_id], (-1, 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    # Stable sort so that ties come out in id order on every run.
    ranked_ids = np.argsort(-dists, kind="stable")
    # Drop the query by id rather than by assuming it sits at rank 0: with
    # tied/duplicate vectors the query may not be first, and slicing [1:]
    # would then return the query itself and discard a real neighbour.
    ranked_ids = ranked_ids[ranked_ids != word_id][:max(topN, 0)]

    return {
        vocab.lookup_token(int(sim_word_id)): dists[sim_word_id]
        for sim_word_id in ranked_ids
    }


def get_sim(word1: str, word2: str, embeddings_norm, vocab):
    """
    Print the dot product of the embeddings of `word1` and `word2`.

    Prints 'not in vocab' if either word is missing from `vocab.get_itos()`,
    or "Out of vocabulary word" if either word maps to id 0. Always returns
    None; the similarity is only printed.
    """
    for token in (word1, word2):
        if token not in vocab.get_itos():
            print('not in vocab')
            return

    token_ids = []
    for token in (word1, word2):
        token_id = vocab[token]
        if token_id == 0:
            print("Out of vocabulary word")
            return
        token_ids.append(token_id)

    first_vec, second_vec = (embeddings_norm[token_id] for token_id in token_ids)
    print(np.dot(first_vec, second_vec))


def get_analogy(a,b,c, embeddings_norm, vocab):
    """
    Print the token that best completes the analogy "a is to b as c is to ?".

    Every vocabulary token other than `a`, `b` and `c` is scored by the dot
    product of its embedding with `b - a + c`, and the best token is printed
    together with its score. The query vector is not re-normalised, so the
    printed score is not a cosine similarity.

    Prints 'not in vocab' if any word is missing from `vocab.get_itos()`, or
    "Out of vocabulary word" if any word maps to id 0. Returns None.
    """
    vocab_list = vocab.get_itos()
    known = set(vocab_list)
    if a not in known or b not in known or c not in known:
        print('not in vocab')
        return
    a_id = vocab[a]
    b_id = vocab[b]
    c_id = vocab[c]
    if a_id == 0 or b_id == 0 or c_id == 0:
        print("Out of vocabulary word")
        return

    # The query does not depend on the candidate, so build it once.
    query = embeddings_norm[b_id] - embeddings_norm[a_id] + embeddings_norm[c_id]

    # Score all candidates in one matrix-vector product. This relies on
    # vocab[vocab_list[i]] == i, i.e. token ids index both the itos list and
    # the embedding rows.
    scores = np.matmul(embeddings_norm[:len(vocab_list)], query)
    # The three input words are never valid answers; NaN scores never win.
    scores[[a_id, b_id, c_id]] = -np.inf
    scores[np.isnan(scores)] = -np.inf

    best_id = int(np.argmax(scores))  # first maximum == first token in vocab order
    if not np.isfinite(scores[best_id]):
        # No usable candidate at all (e.g. the vocabulary holds only a, b, c).
        print(None, -2)
        return

    # No fixed lower bound on the score: |b - a + c| can be as large as 3, so
    # a legitimate best score may lie below -2.
    sim = np.dot(embeddings_norm[best_id], query)
    print(vocab_list[best_id], sim)

def get_outlier(word_list, embeddings_norm, vocab):
    """
    Print the word in `word_list` that is least similar to the others.

    For each word, the dot products between its embedding and the embedding of
    every other word in the list are averaged; the word with the lowest
    average is printed together with that average.

    Prints 'not in vocab' if any word is missing from `vocab.get_itos()`, and
    'need at least two words' when there is nothing to compare against.
    Returns None.
    """
    known = set(vocab.get_itos())  # set: O(1) membership test per word
    for word in word_list:
        if word not in known:
            print('not in vocab')
            return

    # With fewer than two words the average would divide by zero.
    if len(word_list) < 2:
        print('need at least two words')
        return

    # Sum of similarities between each word and all the others.
    totals = {w: 0 for w in word_list}
    for first, second in itertools.combinations(word_list, 2):
        sim = np.dot(embeddings_norm[vocab[first]], embeddings_norm[vocab[second]])
        totals[first] += sim
        totals[second] += sim

    n_others = len(word_list) - 1
    word = None
    value = 2  # above any average of dot products of unit-length vectors
    for key, total in totals.items():
        mean_sim = total / n_others
        if mean_sim < value:
            word = key
            value = mean_sim
    print(word, value)

#### Similarities

In [37]:
# Load the 'the_donald_conservative' embeddings and list the nearest
# neighbours of 'bad' (default topN=5).
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_conservative') # freq 20
get_top_similar('bad', embeddings_norm, vocab)

vocab size: 12813


{'terrible': 0.36026692,
 'horrible': 0.3512573,
 'shitty': 0.28138334,
 'good': 0.25139344,
 'awful': 0.23673256}

In [25]:
# Switch to the 'the_donald_1000_epochs' embeddings and list the nearest
# neighbours of 'hate'. This rebinds the notebook-wide embeddings_norm / vocab.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs') # freq 20, sample 100000
get_top_similar('hate', embeddings_norm, vocab)

vocab size: 10378


{'despise': 0.4057723,
 'dislike': 0.357977,
 'hates': 0.33946168,
 'love': 0.31733787,
 'hated': 0.26700807}

#### Analogies

In [46]:
# Analogy "woman : queen :: man : ?" -- scores tokens against queen - woman + man.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('woman', 'queen', 'man', embeddings_norm, vocab)

vocab size: 10378
father 0.34129798


In [42]:
# Analogy "politics : trump :: sports : ?" -- scores tokens against trump - politics + sports.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')
get_analogy('politics', 'trump', 'sports', embeddings_norm, vocab)

vocab size: 10378
bowls 0.34564084


#### Clustering

In [49]:
# Hand-picked word groups, one list per intended concept. No other cell
# visible in this notebook reads these lists; the cluster checks below look
# individual words up by hand.
concept1 = ['red', 'blue', 'green', 'black']
concept2 = ['trump', 'biden', 'president', 'election']
concept3 = ['sports', 'game', 'play', 'score']
concept4 = ['good', 'bad', 'terrible', 'great']
concept5 = ['car', 'bus', 'train']

In [52]:
# (Re)load the embeddings that the clustering cells below operate on.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [57]:
# Fit k-means with 4 clusters on every embedding row; random_state is fixed so
# the fit is repeatable.
# NOTE(review): five concept lists are defined above but only 4 clusters are
# requested -- confirm this is intentional.
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(embeddings_norm)

In [58]:
# Cluster index assigned to each embedding row during the fit.
kmeans.labels_

array([1, 1, 3, ..., 1, 1, 0], dtype=int32)

In [62]:
# Cluster assigned to 'red'; reshape(1, -1) turns the vector into a one-row
# 2-D array before it is passed to predict.
vec = embeddings_norm[vocab['red']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [63]:
# Cluster assigned to 'blue'.
vec = embeddings_norm[vocab['blue']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [64]:
# Cluster assigned to 'trump'.
vec = embeddings_norm[vocab['trump']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [67]:
# Cluster assigned to 'biden'.
vec = embeddings_norm[vocab['biden']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [65]:
# Cluster assigned to 'bad'.
vec = embeddings_norm[vocab['bad']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [66]:
# Cluster assigned to 'good'.
vec = embeddings_norm[vocab['good']].reshape(1, -1)
kmeans.predict(vec)

array([3], dtype=int32)

In [68]:
# Cluster assigned to 'sports'.
vec = embeddings_norm[vocab['sports']].reshape(1, -1)
kmeans.predict(vec)

array([1], dtype=int32)

In [69]:
# Cluster assigned to 'car'.
vec = embeddings_norm[vocab['car']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [70]:
# Cluster assigned to 'bus'.
vec = embeddings_norm[vocab['bus']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

In [71]:
# Cluster assigned to 'train'.
vec = embeddings_norm[vocab['train']].reshape(1, -1)
kmeans.predict(vec)

array([0], dtype=int32)

#### Outlier detection

In [92]:
# (Re)load the embeddings used by the outlier-detection cells below.
embeddings_norm, vocab = get_embeddings_and_vocab('the_donald_1000_epochs')

vocab size: 10378


In [99]:
# Prints the word with the lowest average similarity to the other two.
get_outlier(['apple', 'orange', 'bus'], embeddings_norm, vocab)

bus 0.021671796683222055


In [100]:
# Prints the word with the lowest average similarity to the other three.
get_outlier(['sports', 'game', 'play', 'president'], embeddings_norm, vocab)

president -0.03133737722722193
