In [11]:
import numpy as np
import pandas as pd
import torch
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

In [13]:
# Width of each embedding vector; passed as `embedding_dim` to nn.Embedding and
# as `in_features` to nn.Linear inside CBOW_Model.
EMBED_DIMENSION = 300
# Passed as `max_norm` to nn.Embedding inside CBOW_Model.
EMBED_MAX_NORM = 1

In [15]:
# Load the serialized vocabulary object (edit this path for your own checkout).
# NOTE(review): torch.load unpickles arbitrary Python objects; only load files you trust.
vocab = torch.load("/users/ujan/sports-language-in-politics/models/cbow/vocab.pt")

In [18]:
# Vocabulary size: length of the index-to-token list.
len(vocab.get_itos())

1561

In [19]:
class CBOW_Model(nn.Module):
    """
    Continuous bag-of-words (CBOW) network, following the paper
    "Efficient Estimation of Word Representations in Vector Space":
    https://arxiv.org/abs/1301.3781

    The network is an embedding table followed by one linear layer that maps
    the averaged embedding back to a score per vocabulary entry.
    """

    def __init__(self, vocab_size: int):
        """
        Args:
            vocab_size: number of rows in the embedding table and number of
                output scores produced by the linear layer.
        """
        super().__init__()
        # Registered first on purpose: code that reads the first entry of
        # `parameters()` gets the embedding weights.
        self.embeddings = nn.Embedding(
            vocab_size,
            EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs):
        """
        Embed `inputs`, average over axis 1, and project to vocabulary scores.

        NOTE(review): presumably `inputs` is a (batch, context) tensor of token
        ids, so the mean runs over the context words -- confirm against the
        training code.
        """
        averaged = self.embeddings(inputs).mean(axis=1)
        return self.linear(averaged)

In [20]:
# Build a freshly initialized CBOW_Model sized to the loaded vocabulary.
# NOTE(review): the following cell rebinds `model` to the object returned by
# torch.load, so this instance is discarded.
model = CBOW_Model(vocab_size=len(vocab.get_itos()))  # vocab_size must match the vocabulary in use

In [21]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the serialized model onto `device` (edit this path for your own checkout).
# NOTE(review): torch.load unpickles arbitrary Python objects; only load files you trust.
model = torch.load(
    "/users/ujan/sports-language-in-politics/models/cbow/model.pt",
    map_location=device,
)

In [70]:
# The first parameter yielded by the model is taken as the embedding matrix
# (for CBOW_Model that is `embeddings.weight`, the first module registered).
embeddings = next(iter(model.parameters())).cpu().detach().numpy()

# L2-normalize each row; `keepdims=True` keeps a column of norms so the
# division broadcasts row-wise.
norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** 0.5
embeddings_norm = embeddings / norms
embeddings_norm.shape

(1561, 300)

In [63]:
# Tokens of interest; later cells use this list to select rows of the
# t-SNE projection.
imp_tokens = [
    'biden', 'trump', 'coach', 'politician', 'fan',
    'voter', 'election', 'party', 'team', 'race',
    'democrats', 'republicans',
]

# Print every token of interest that is missing from the vocabulary.
# The token list is materialized once as a set, so each membership test is a
# hash lookup instead of rebuilding and scanning the list per iteration.
vocab_tokens = set(vocab.get_itos())
for token in imp_tokens:
    if token not in vocab_tokens:
        print(token)

In [77]:
# One row per vocabulary token (un-normalized embeddings), labelled by token.
embeddings_df = pd.DataFrame(embeddings, index=vocab.get_itos())
# embeddings_df.head()  # uncomment to preview the first rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
<unk>,0.054493,0.026717,-0.083857,-0.02548,-0.030166,-0.003006,-0.024241,-0.001474,-0.049852,0.010876,...,-0.035062,0.099281,0.027973,0.011006,-0.029408,0.021452,0.131082,-0.103689,-0.014844,-0.182686
the,0.532822,0.507532,-0.001684,0.521379,-0.021458,-0.037755,-0.010519,0.047359,0.003725,-0.01818,...,0.048543,-0.047803,0.007469,-0.008154,0.013043,0.013928,0.014812,-0.031773,0.024262,0.032028
to,-0.001062,0.028484,-0.032488,-0.000496,0.072757,0.048156,0.051688,0.062093,0.055935,0.033592,...,0.047163,0.037665,0.036785,0.003008,0.054632,0.018457,-0.410992,0.418069,-0.364954,-0.413116
a,0.374503,-0.087086,-0.513799,0.456457,-0.019827,-0.007753,-0.055422,0.048995,-0.012768,-0.024422,...,0.093427,-0.061046,0.048407,0.022885,-0.021112,0.024027,0.026016,-0.038199,0.009593,-0.02757
and,0.021249,0.035081,-0.023988,0.014344,-0.012067,-0.041881,0.020403,0.022845,0.044986,-0.027736,...,0.022556,-0.020934,0.010186,-0.039694,0.015937,0.028191,-0.364327,-0.487388,-0.553411,-0.304283


In [78]:
# t-SNE transform: project the embedding vectors down to 2-D for plotting.
# t-SNE is stochastic; fixing `random_state` makes the layout reproducible
# when the notebook is re-run from a fresh kernel.
tsne = TSNE(n_components=2, random_state=42)
embeddings_df_trans = pd.DataFrame(
    tsne.fit_transform(embeddings_df),
    index=vocab.get_itos(),  # label rows with the vocabulary's token order
)

In [81]:
# Keep only the projected rows whose label is one of the tokens of interest.
# NOTE(review): this rebinds `embeddings_df_trans`, so the full projection is
# no longer available after this cell runs.
embeddings_df_trans = embeddings_df_trans.filter(items=imp_tokens, axis=0)

# NOTE(review): the mask is computed after the filter above, so every
# remaining row is in `imp_tokens` and the mask is all True. If the intent was
# to highlight these tokens among the whole vocabulary, compute the mask on
# the unfiltered frame instead.
is_imp = np.array([label in imp_tokens for label in embeddings_df_trans.index])

In [85]:
# Per-row label colour: blue where `is_imp` is True, red otherwise.
color = np.where(is_imp, "blue", "red")

# Each row of the projection is drawn as a text label at its 2-D coordinates.
label_trace = go.Scatter(
    x=embeddings_df_trans[0],
    y=embeddings_df_trans[1],
    mode="text",
    text=embeddings_df_trans.index,
    textposition="middle center",
    textfont={"color": color},
)

fig = go.Figure()
fig.add_trace(label_trace)

# Save the figure as an HTML file in the current working directory.
fig.write_html("word2vec_visualization.html")

In [93]:
def get_top_similar(word: str, topN: int = 10):
    """
    Return the `topN` vocabulary tokens most similar to `word`.

    Similarity is the dot product between rows of the module-level
    `embeddings_norm` matrix, which is the cosine similarity provided those
    rows are L2-normalized.

    Args:
        word: query token, looked up through the module-level `vocab`.
        topN: maximum number of neighbours to return (values below 1 give an
            empty result).

    Returns:
        A dict mapping neighbour token -> similarity, ordered from most to
        least similar. Returns None, after printing a message, when `word`
        maps to index 0, which this function treats as out-of-vocabulary.
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    sims = np.matmul(embeddings_norm, embeddings_norm[word_id])
    ranked_ids = np.argsort(-sims)
    # Remove the query token by id rather than assuming it sorts first: when
    # another row ties with it, slicing off position 0 could return the query
    # word itself and drop a real neighbour.
    neighbour_ids = ranked_ids[ranked_ids != word_id][: max(topN, 0)]

    # int(...) hands lookup_token a plain Python int instead of a NumPy integer.
    return {vocab.lookup_token(int(idx)): sims[idx] for idx in neighbour_ids}

In [94]:
# Show each neighbour of "trump" with its similarity rounded to 3 decimals.
for word, sim in get_top_similar("trump").items():
    print(f"{word}: {sim:.3f}")

obama: 0.727
biden: 0.705
bernie: 0.685
sanders: 0.662
donald: 0.621
clinton: 0.584
joe: 0.579
popular: 0.575
mr: 0.573
cruz: 0.568


In [91]:
def get_distance(word1: str, word2: str):
    """
    Print the dot product of the `embeddings_norm` rows for two words.

    Despite the name, the printed value is a similarity (cosine similarity
    when the rows of `embeddings_norm` are L2-normalized), not a distance:
    larger means closer.

    Args:
        word1: first token, looked up through the module-level `vocab`.
        word2: second token, looked up through the module-level `vocab`.

    Returns:
        None. The result, or a diagnostic message when a word is missing from
        the vocabulary or maps to index 0, is printed.
    """
    # Ask the vocabulary directly instead of materializing the full token
    # list twice on every call.
    if word1 not in vocab or word2 not in vocab:
        print('not in vocab')
        return

    word_ids = []
    for word in (word1, word2):
        word_id = vocab[word]
        # Index 0 is treated as the out-of-vocabulary slot.
        if word_id == 0:
            print("Out of vocabulary word")
            return
        word_ids.append(word_id)

    word1_vec = embeddings_norm[word_ids[0]]
    word2_vec = embeddings_norm[word_ids[1]]
    print(np.dot(word1_vec, word2_vec))

In [92]:
# Prints the similarity (dot product of the normalized embeddings) of 'trump' and 'biden'.
get_distance('trump', 'biden')

0.7053479


In [95]:
# Prints the similarity (dot product of the normalized embeddings) of 'trump' and 'election'.
get_distance('trump', 'election')

0.3212096


In [2]:
import polars as pl

In [3]:
# Politics sample: read the CSV and discard every row that contains a null.
df = (
    pl.read_csv('/users/ujan/sports-language-in-politics/data/processed/politics_sample.csv')
    .drop_nulls()
)

In [4]:
# Distinct subreddit names present in the loaded sample.
df.get_column('subreddit').unique().to_list()

['Israel_Palestine',
 'AltFacts',
 'TrumpForPrison',
 'Shitstatistssay',
 'CanadianPolitics',
 'CPC',
 'UnbiasedCanada',
 'GreenPartyOfCanada',
 'PAForSanders',
 'RanktheVote',
 'TheMajorityReport',
 'NationalSocialism',
 'ALJAZEERAauto',
 'Socialism_101',
 'EmergingRisks',
 'calexit',
 'brealism',
 'The_DonaldUnleashed',
 'True_AskAConservative',
 'SandersForPresident',
 'BlueMidterm2018',
 'thenewcoldwar',
 'distributism',
 'FloridaForSanders',
 'ModelCentralState',
 'Minarchy',
 'donaldtrump',
 'EndlessWar',
 'ArabIsraeliConflict',
 'qualitynews',
 'DebateCommunism',
 'China_Flu',
 'explainlikedonald',
 'EndFPTP',
 'MedicareForAll',
 'governmentoppression',
 'libertarianmeme',
 'Our_Politics',
 'ThanksObama',
 'AustraliaSimUpper',
 'PoliticalPerspectives',
 'SocialismAndVeganism',
 'CA2NWO',
 'CornbreadLiberals',
 'USCensus2020',
 'worldpolitics',
 'The_Redacted',
 'CoronavirusCT',
 'nyspolitics',
 'ConspiracyFacts',
 'ReddLineNews',
 'WesternTerrorism',
 'NOWTTYG',
 'illinoispoliti

In [11]:
# Number of rows whose subreddit is 'Conservative'.
df.filter(pl.col('subreddit') == 'Conservative').height

86348

In [20]:
# Number of rows whose subreddit is 'Donald_Trump'.
df.filter(pl.col('subreddit') == 'Donald_Trump').height

184

In [22]:
# Number of rows whose subreddit is 'donaldtrump'.
df.filter(pl.col('subreddit') == 'donaldtrump').height

1147

In [23]:
# Number of rows whose subreddit is 'AskTrumpSupporters'.
df.filter(pl.col('subreddit') == 'AskTrumpSupporters').height

28454

In [24]:
# Number of rows whose subreddit is 'The_Donald'.
df.filter(pl.col('subreddit') == 'The_Donald').height

419292

In [21]:
# Number of rows whose subreddit is 'JoeBiden'.
df.filter(pl.col('subreddit') == 'JoeBiden').height

7632

In [16]:
# Number of rows whose subreddit is 'AskALiberal'.
df.filter(pl.col('subreddit') == 'AskALiberal').height

12888

In [17]:
# Number of rows whose subreddit is 'democrats'.
df.filter(pl.col('subreddit') == 'democrats').height  # 238 biden, 1063 trump (> 50 chars)

6308

In [18]:
# Number of rows whose subreddit is 'socialism'.
df.filter(pl.col('subreddit') == 'socialism').height

14020

In [19]:
# Number of rows whose subreddit is 'DemocraticSocialism'.
df.filter(pl.col('subreddit') == 'DemocraticSocialism').height

3027

In [3]:
# Sports sample: read the CSV and discard every row that contains a null.
# NOTE(review): this reuses the name `df` that the politics sample cell also
# binds; distinct names would avoid mixing the two up.
df = (
    pl.read_csv('/users/ujan/sports-language-in-politics/data/processed/sports_sample.csv')
    .drop_nulls()
)

In [4]:
# Distinct subreddit names present in the loaded sample.
df.get_column('subreddit').unique().to_list()

['DallasStars',
 'kings',
 'Seahawks',
 'Chargers',
 '49ers',
 'Browns',
 'canucks',
 'mlb',
 'FloridaPanthers',
 'buccos',
 'warriors',
 'rbny',
 'NewYorkIslanders',
 'rangers',
 'minnesotaunited',
 'NewYorkMets',
 'UtahJazz',
 'whitecapsfc',
 'TexasRangers',
 'hockey',
 'penguins',
 'AnaheimDucks',
 'angelsbaseball',
 'leafs',
 'nhl',
 'NYYankees',
 'GoNets',
 'ripcity',
 'LAGalaxy',
 'devils',
 'DetroitPistons',
 'Mariners',
 'fcdallas',
 'AustinFC',
 'denvernuggets',
 'DenverBroncos',
 'canes',
 'falcons',
 'ColoradoAvalanche',
 'Habs',
 'Jaguars',
 'CharlotteFootballClub',
 'cowboys',
 'Braves',
 'wildhockey',
 'tfc',
 'SanJoseSharks',
 'NOLAPelicans',
 'ColoradoRockies',
 'OaklandAthletics',
 'Reds',
 'Tennesseetitans',
 'hawks',
 'Predators',
 'Basketball',
 'redsox',
 'detroitlions',
 'phillies',
 'NYCFC',
 'LAClippers',
 'minnesotavikings',
 'caps',
 'football',
 'whitesox',
 'Padres',
 'LosAngelesRams',
 'chicagobulls',
 'clevelandcavs',
 'losangeleskings',
 'BlueJackets',
 '