In [193]:
import numpy as np
import pandas as pd
import torch
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

In [194]:
# Width of each embedding vector; CBOW_Model uses it for the embedding table
# and as the input width of its linear layer.
EMBED_DIMENSION = 300
# Passed as `max_norm` to nn.Embedding inside CBOW_Model.
EMBED_MAX_NORM = 1

In [195]:
# Load the serialized vocabulary object stored in the same directory as the CBOW model.
# NOTE(review): torch.load unpickles arbitrary objects — only point this at trusted files.
# NOTE(review): absolute, user-specific path; consider a configurable base directory.
vocab = torch.load("/users/ujan/sports-language-in-politics/models/cbow/vocab.pt")

In [196]:
# Number of tokens in the loaded vocabulary (length of its index-to-token list).
len(vocab.get_itos())

9904

In [197]:
class CBOW_Model(nn.Module):
    """
    Continuous bag-of-words (CBOW) network from the word2vec paper:
    https://arxiv.org/abs/1301.3781

    An embedding table feeds a single linear layer that emits one score per
    vocabulary entry.
    """

    def __init__(self, vocab_size: int):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of rows in the embedding table and number of
                scores produced by the linear layer.
        """
        super().__init__()
        # The embedding table is registered before the linear layer, so its
        # weight is the first entry yielded by `parameters()`.
        self.embeddings = nn.Embedding(
            vocab_size,
            EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs):
        """Embed `inputs`, average the embeddings over axis 1, and project to scores."""
        averaged = self.embeddings(inputs).mean(dim=1)
        return self.linear(averaged)

In [198]:
# Build a freshly initialised CBOW model sized to the vocabulary.
# NOTE(review): `model` is reassigned by the torch.load call in the following
# cell, so this instance is discarded.
model = CBOW_Model(vocab_size=len(vocab.get_itos()))  ## change vocab size ##

In [199]:
# Use the GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## change file path ##
# Load the saved model object, remapping its tensors onto `device`.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
model = torch.load("/users/ujan/sports-language-in-politics/models/cbow/model.pt", map_location=device)

In [200]:
# The first parameter of the model is taken as the embedding matrix and
# converted to a NumPy array.
embeddings = next(model.parameters()).detach().cpu().numpy()

# L2-normalise each row so that a dot product between two rows is their
# cosine similarity.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
# Guard against all-zero rows: dividing by a zero norm would produce NaN/inf
# and poison every similarity computed against that row.
safe_norms = norms.copy()
safe_norms[safe_norms == 0] = 1
embeddings_norm = embeddings / safe_norms
embeddings_norm.shape

(9904, 300)

In [201]:
# Tokens of interest; used later to filter the projected embeddings.
imp_tokens = [
    'biden', 'trump', 'coach', 'politician', 'fan',
    'voter', 'election', 'party', 'team', 'race',
    'democrats', 'republicans',
]

# Print, one per line, every token of interest that the vocabulary lacks.
known_tokens = vocab.get_itos()
missing_tokens = [tok for tok in imp_tokens if tok not in known_tokens]
if missing_tokens:
    print(*missing_tokens, sep="\n")

In [202]:
# Un-normalised embeddings as a DataFrame: one row per vocabulary entry,
# labelled with the token string.
embeddings_df = pd.DataFrame(embeddings, index=vocab.get_itos())

In [203]:
# Project the embeddings to two dimensions with t-SNE.
# t-SNE is stochastic: a fixed random_state keeps the layout reproducible
# between runs instead of changing on every execution.
TSNE_SEED = 42
tsne = TSNE(n_components=2, random_state=TSNE_SEED)
embeddings_df_trans = pd.DataFrame(
    tsne.fit_transform(embeddings_df),
    index=vocab.get_itos(),  # label rows with their tokens, in vocabulary order
)

In [204]:
# Keep only the rows whose label is one of the tokens of interest.
embeddings_df_trans = embeddings_df_trans.filter(items=imp_tokens, axis=0)
# Flag the rows that belong to `imp_tokens`.
# NOTE(review): after the filter above every remaining label is in
# `imp_tokens`, so this mask looks like it is always all True — confirm intent.
is_imp = np.array([label in imp_tokens for label in embeddings_df_trans.index.to_list()])

In [205]:
# Blue for flagged tokens, red for everything else.
color = np.where(is_imp, "blue", "red")

# Draw every token as a text label positioned at its two projected coordinates.
token_labels = go.Scatter(
    x=embeddings_df_trans[0],
    y=embeddings_df_trans[1],
    mode="text",
    text=embeddings_df_trans.index,
    textposition="middle center",
    textfont=dict(color=color),
)

fig = go.Figure()
fig.add_trace(token_labels)
# Written relative to the current working directory.
fig.write_html("word2vec_visualization.html")

In [186]:
def get_top_similar(word: str, topN: int = 10):
    """Return the `topN` vocabulary tokens most similar to `word`.

    Similarity is the dot product between rows of the module-level
    `embeddings_norm` matrix.

    Args:
        word: query token, looked up in the module-level `vocab`.
        topN: maximum number of neighbours to return.

    Returns:
        A dict mapping each neighbour token to its similarity with `word`,
        ordered from most to least similar. Returns `None` (after printing a
        message) when `word` maps to index 0.
    """
    word_id = vocab[word]
    # NOTE(review): index 0 is treated as the unknown-token slot — this assumes
    # the vocabulary's default index is 0; confirm against how vocab was built.
    if word_id == 0:
        print("Out of vocabulary word")
        return

    # Similarity of the query row against every row of the matrix.
    dists = embeddings_norm @ embeddings_norm[word_id]

    # Rank all ids by descending similarity, then drop the query by id.
    # Slicing off rank 0 is not safe: with tied or duplicate vectors the query
    # is not guaranteed to sort first, so it could be returned as its own
    # neighbour while a genuine neighbour is discarded.
    ranked_ids = np.argsort(-dists)
    topN_ids = ranked_ids[ranked_ids != word_id][: max(topN, 0)]

    return {vocab.lookup_token(int(idx)): dists[idx] for idx in topN_ids}

In [187]:
# List the nearest neighbours of "trump" with their similarity to three decimals.
neighbours = get_top_similar("trump")
for token, score in neighbours.items():
    print(f"{token}: {score:.3f}")

obama: 0.410
hillary: 0.382
biden: 0.357
sanders: 0.351
trumps: 0.350
lie: 0.349
bernie: 0.342
pelosi: 0.334
china: 0.330
registered: 0.315


In [213]:
def get_sim(word1: str, word2: str):
    """Print the similarity between two vocabulary words.

    The similarity is the dot product of the two words' rows in the
    module-level `embeddings_norm` matrix. Nothing is returned; a message is
    printed instead of a score when either word is missing from the
    vocabulary's token list or maps to index 0.
    """
    pair = (word1, word2)

    # Both words have to appear in the vocabulary's token list.
    if any(w not in vocab.get_itos() for w in pair):
        print('not in vocab')
        return

    # Resolve the ids in argument order, stopping at the first index 0.
    ids = []
    for w in pair:
        idx = vocab[w]
        if idx == 0:
            print("Out of vocabulary word")
            return
        ids.append(idx)

    first_vec, second_vec = embeddings_norm[ids[0]], embeddings_norm[ids[1]]
    print(np.dot(first_vec, second_vec))

In [207]:
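# conservative
# NOTE(review): presumably the similarities below come from the model trained on
# conservative subreddits — confirm that the vocab.pt/model.pt load cells pointed
# at that model when these cells were run.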

In [208]:
get_sim('trump', 'coach')

0.025071468


In [209]:
get_sim('biden', 'coach')

-0.10821757


In [210]:
get_sim('race', 'election')

0.12638268


In [211]:
get_sim('voter', 'fan')

0.114419945


In [None]:
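# democrats
# NOTE(review): the outputs below carry execution counts 189-192, i.e. they were
# produced before the current vocab/model load cells (195-199) ran — presumably
# with a different model.pt loaded. Re-run after pointing the load cells at the
# intended model.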

In [189]:
get_sim('trump', 'coach')

0.17658761


In [190]:
get_sim('biden', 'coach')

0.020108037


In [191]:
get_sim('race', 'election')

0.42546162


In [192]:
get_sim('voter', 'fan')

0.17199346


In [2]:
import polars as pl

In [3]:
# Read the processed politics sample and drop every row that contains a null.
# NOTE(review): the name `df` is reassigned to the sports sample further down,
# so the per-subreddit counts that follow depend on which load cell ran last.
df = pl.read_csv('/users/ujan/sports-language-in-politics/data/processed/politics_sample.csv').drop_nulls()

In [None]:
df['subreddit'].unique().to_list()

In [11]:
len(df.filter(pl.col('subreddit') == 'Conservative'))

86348

In [20]:
len(df.filter(pl.col('subreddit') == 'Donald_Trump'))

184

In [22]:
len(df.filter(pl.col('subreddit') == 'donaldtrump'))

1147

In [23]:
len(df.filter(pl.col('subreddit') == 'AskTrumpSupporters'))

28454

In [24]:
len(df.filter(pl.col('subreddit') == 'The_Donald'))

419292

In [21]:
len(df.filter(pl.col('subreddit') == 'JoeBiden'))

7632

In [16]:
len(df.filter(pl.col('subreddit') == 'AskALiberal'))

12888

In [17]:
len(df.filter(pl.col('subreddit') == 'democrats'))  # 238 biden, 1063 trump (> 50 chars)

6308

In [18]:
len(df.filter(pl.col('subreddit') == 'socialism'))

14020

In [19]:
len(df.filter(pl.col('subreddit') == 'DemocraticSocialism'))

3027

In [3]:
# Read the processed sports sample and drop every row that contains a null.
# NOTE(review): this reuses the name `df` that held the politics sample earlier
# in the notebook; a distinct name would keep both frames available.
df = pl.read_csv('/users/ujan/sports-language-in-politics/data/processed/sports_sample.csv').drop_nulls()

In [4]:
df['subreddit'].unique().to_list()

['DallasStars',
 'kings',
 'Seahawks',
 'Chargers',
 '49ers',
 'Browns',
 'canucks',
 'mlb',
 'FloridaPanthers',
 'buccos',
 'warriors',
 'rbny',
 'NewYorkIslanders',
 'rangers',
 'minnesotaunited',
 'NewYorkMets',
 'UtahJazz',
 'whitecapsfc',
 'TexasRangers',
 'hockey',
 'penguins',
 'AnaheimDucks',
 'angelsbaseball',
 'leafs',
 'nhl',
 'NYYankees',
 'GoNets',
 'ripcity',
 'LAGalaxy',
 'devils',
 'DetroitPistons',
 'Mariners',
 'fcdallas',
 'AustinFC',
 'denvernuggets',
 'DenverBroncos',
 'canes',
 'falcons',
 'ColoradoAvalanche',
 'Habs',
 'Jaguars',
 'CharlotteFootballClub',
 'cowboys',
 'Braves',
 'wildhockey',
 'tfc',
 'SanJoseSharks',
 'NOLAPelicans',
 'ColoradoRockies',
 'OaklandAthletics',
 'Reds',
 'Tennesseetitans',
 'hawks',
 'Predators',
 'Basketball',
 'redsox',
 'detroitlions',
 'phillies',
 'NYCFC',
 'LAClippers',
 'minnesotavikings',
 'caps',
 'football',
 'whitesox',
 'Padres',
 'LosAngelesRams',
 'chicagobulls',
 'clevelandcavs',
 'losangeleskings',
 'BlueJackets',
 '