In [1]:
import numpy as np
import pandas as pd
import torch
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

import argparse
import os
from os.path import dirname, abspath
from functools import partial
import json
import yaml
import numpy as np

import torch
import torch.nn as nn
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset

In [3]:
# Names suggest context-window sizes for the two word2vec variants.
# Neither is referenced in the visible cells — presumably consumed by the
# training pipeline; TODO confirm.
CBOW_N_WORDS = 4
SKIPGRAM_N_WORDS = 4

# Names suggest vocabulary / sequence limits. Not referenced in the visible
# cells — presumably applied when the vocab and batches were built; TODO confirm.
MIN_WORD_FREQUENCY = 50
MAX_SEQUENCE_LENGTH = 256

# Used by CBOW_Model as nn.Embedding's embedding_dim and nn.Linear's in_features.
EMBED_DIMENSION = 300
# Used by CBOW_Model as nn.Embedding's max_norm.
EMBED_MAX_NORM = 1

In [None]:
class CBOW_Model(nn.Module):
    """
    Implementation of CBOW model described in paper:
    https://arxiv.org/abs/1301.3781

    The network is an embedding lookup, a mean over dimension 1 of the
    looked-up vectors, and a linear projection to ``vocab_size`` scores.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output scores produced by the linear layer.
        embed_dim: width of each embedding vector. ``None`` (the default)
            falls back to the module-level ``EMBED_DIMENSION``.
        max_norm: forwarded to ``nn.Embedding(max_norm=...)``. ``None``
            (the default) falls back to the module-level ``EMBED_MAX_NORM``.
    """

    def __init__(self, vocab_size: int, embed_dim=None, max_norm=None):
        super().__init__()
        # Resolve the fallbacks at call time so existing callers that pass
        # only `vocab_size` keep getting the notebook-wide configuration.
        if embed_dim is None:
            embed_dim = EMBED_DIMENSION
        if max_norm is None:
            max_norm = EMBED_MAX_NORM

        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            max_norm=max_norm,
        )
        self.linear = nn.Linear(
            in_features=embed_dim,
            out_features=vocab_size,
        )

    def forward(self, inputs):
        """
        Score every vocabulary entry for each row of ``inputs``.

        Args:
            inputs: integer token ids; assumed to be shaped
                (batch, context) — TODO confirm against the data loader.

        Returns:
            Tensor of un-normalised scores with last dimension ``vocab_size``.
        """
        x = self.embeddings(inputs)
        # Average the looked-up vectors along dimension 1.
        x = x.mean(dim=1)
        return self.linear(x)

In [None]:
# NOTE(review): `model` is reassigned by the torch.load(...) call in the next
# cell, so this freshly initialised instance is not the one analysed below.
VOCAB_SIZE = 4099  ## change vocab size ##
model = CBOW_Model(vocab_size=VOCAB_SIZE)

In [None]:
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## change file path ##
# Single place to edit: both artefacts are read from this directory.
weights_dir = "/content/word2vec-pytorch/weights/cbow_WikiText2"

# SECURITY: torch.load unpickles arbitrary Python objects — only point this at
# checkpoints you trust.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# refuses fully pickled objects like these; pass weights_only=False there if
# loading fails — confirm against the installed version.
model = torch.load(f"{weights_dir}/model.pt", map_location=device)
vocab = torch.load(f"{weights_dir}/vocab.pt")

In [None]:
# Embedding matrix = the first parameter the model yields.
# (For CBOW_Model that is `embeddings.weight`, the first layer it registers;
# presumably the loaded checkpoint is that class — verify.)
embeddings = next(model.parameters()).detach().cpu().numpy()

# Row-wise L2 normalisation; keepdims=True leaves `norms` as a column vector
# so the division broadcasts across each row.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings_norm = embeddings / norms
embeddings_norm.shape

In [None]:
# get embeddings (the un-normalised matrix, one row per token id)
embeddings_df = pd.DataFrame(embeddings)

# t-SNE transform — t-SNE is stochastic, so pin the seed to make the 2-D
# layout identical on every Restart & Run All.
TSNE_SEED = 42
tsne = TSNE(n_components=2, random_state=TSNE_SEED)

# Label each projected row with its token: vocab.get_itos() is used as the
# index, so position i in that list names row i.
embeddings_df_trans = pd.DataFrame(
    tsne.fit_transform(embeddings_df),
    index=vocab.get_itos(),
)

# if token is a number
is_numeric = embeddings_df_trans.index.str.isnumeric()

In [None]:
# Numeric tokens are drawn in green, every other token in black.
color = np.where(is_numeric, "green", "black")

# One text-only trace: each token is printed at its 2-D t-SNE coordinates
# (columns 0 and 1 of the projected frame).
token_labels = go.Scatter(
    x=embeddings_df_trans[0],
    y=embeddings_df_trans[1],
    mode="text",
    text=embeddings_df_trans.index,
    textposition="middle center",
    textfont={"color": color},
)

fig = go.Figure()
fig.add_trace(token_labels)

# Persist the interactive figure as a standalone HTML file.
fig.write_html("word2vec_visualization.html")

In [None]:
def get_top_similar(word: str, topN: int = 10):
    """
    Return the ``topN`` vocabulary tokens most similar to ``word``.

    Similarity is the dot product between rows of the module-level
    ``embeddings_norm`` matrix (row-normalised earlier in the notebook, so
    the dot product is the cosine similarity).

    Args:
        word: query token, looked up through the module-level ``vocab``.
        topN: maximum number of neighbours to return.

    Returns:
        dict mapping neighbour token -> similarity, in descending order of
        similarity; the query word itself is never included.
        ``None`` (after printing a message) when ``word`` maps to id 0.
    """
    word_id = vocab[word]
    # NOTE(review): assumes id 0 is what the vocab returns for unknown
    # tokens — confirm against how the vocab was built.
    if word_id == 0:
        print("Out of vocabulary word")
        return

    # (vocab, dim) @ (dim,) -> one similarity score per vocabulary row.
    dists = embeddings_norm @ embeddings_norm[word_id]

    ranked_ids = np.argsort(-dists)
    # Drop the query by id rather than by assuming it sorts first: with tied
    # scores (e.g. duplicate vectors) it can land at any of the top ranks.
    topN_ids = ranked_ids[ranked_ids != word_id][:topN]

    # int(...) hands lookup_token a plain Python int instead of a numpy scalar.
    return {vocab.lookup_token(int(i)): dists[i] for i in topN_ids}

In [None]:
# get_top_similar returns None for an out-of-vocabulary word; fall back to an
# empty dict so this cell prints nothing instead of raising AttributeError.
for word, sim in (get_top_similar("red") or {}).items():
    print("{}: {:.3f}".format(word, sim))

In [2]:
import polars as pl

In [3]:
# Politics sample; rows containing any null value are dropped on load.
politics_csv = '/users/ujan/sports-language-in-politics/data/processed/politics_sample.csv'
df = pl.read_csv(politics_csv).drop_nulls()

In [4]:
# Distinct subreddit names present in the loaded frame.
subreddit_names = df['subreddit'].unique()
subreddit_names.to_list()

['Israel_Palestine',
 'AltFacts',
 'TrumpForPrison',
 'Shitstatistssay',
 'CanadianPolitics',
 'CPC',
 'UnbiasedCanada',
 'GreenPartyOfCanada',
 'PAForSanders',
 'RanktheVote',
 'TheMajorityReport',
 'NationalSocialism',
 'ALJAZEERAauto',
 'Socialism_101',
 'EmergingRisks',
 'calexit',
 'brealism',
 'The_DonaldUnleashed',
 'True_AskAConservative',
 'SandersForPresident',
 'BlueMidterm2018',
 'thenewcoldwar',
 'distributism',
 'FloridaForSanders',
 'ModelCentralState',
 'Minarchy',
 'donaldtrump',
 'EndlessWar',
 'ArabIsraeliConflict',
 'qualitynews',
 'DebateCommunism',
 'China_Flu',
 'explainlikedonald',
 'EndFPTP',
 'MedicareForAll',
 'governmentoppression',
 'libertarianmeme',
 'Our_Politics',
 'ThanksObama',
 'AustraliaSimUpper',
 'PoliticalPerspectives',
 'SocialismAndVeganism',
 'CA2NWO',
 'CornbreadLiberals',
 'USCensus2020',
 'worldpolitics',
 'The_Redacted',
 'CoronavirusCT',
 'nyspolitics',
 'ConspiracyFacts',
 'ReddLineNews',
 'WesternTerrorism',
 'NOWTTYG',
 'illinoispoliti

In [11]:
# Number of rows posted in r/Conservative.
in_conservative = pl.col('subreddit') == 'Conservative'
len(df.filter(in_conservative))

86348

In [20]:
# Number of rows posted in r/Donald_Trump.
in_donald_trump = pl.col('subreddit') == 'Donald_Trump'
len(df.filter(in_donald_trump))

184

In [22]:
# Number of rows posted in r/donaldtrump (distinct from r/Donald_Trump).
in_donaldtrump = pl.col('subreddit') == 'donaldtrump'
len(df.filter(in_donaldtrump))

1147

In [23]:
# Number of rows posted in r/AskTrumpSupporters.
in_ask_trump_supporters = pl.col('subreddit') == 'AskTrumpSupporters'
len(df.filter(in_ask_trump_supporters))

28454

In [24]:
# Number of rows posted in r/The_Donald.
in_the_donald = pl.col('subreddit') == 'The_Donald'
len(df.filter(in_the_donald))

419292

In [21]:
# Number of rows posted in r/JoeBiden.
in_joe_biden = pl.col('subreddit') == 'JoeBiden'
len(df.filter(in_joe_biden))

7632

In [16]:
# Number of rows posted in r/AskALiberal.
in_ask_a_liberal = pl.col('subreddit') == 'AskALiberal'
len(df.filter(in_ask_a_liberal))

12888

In [17]:
# Number of rows posted in r/democrats.
# Author's note, kept verbatim: 238 biden, 1063 trump (> 50 chars)
in_democrats = pl.col('subreddit') == 'democrats'
len(df.filter(in_democrats))

6308

In [18]:
# Number of rows posted in r/socialism.
in_socialism = pl.col('subreddit') == 'socialism'
len(df.filter(in_socialism))

14020

In [19]:
# Number of rows posted in r/DemocraticSocialism.
in_democratic_socialism = pl.col('subreddit') == 'DemocraticSocialism'
len(df.filter(in_democratic_socialism))

3027

In [3]:
# Sports sample; rows containing any null value are dropped on load.
# This rebinds `df`, replacing the frame loaded from politics_sample.csv.
sports_csv = '/users/ujan/sports-language-in-politics/data/processed/sports_sample.csv'
df = pl.read_csv(sports_csv).drop_nulls()

In [4]:
# Distinct subreddit names present in the loaded frame.
subreddit_names = df['subreddit'].unique()
subreddit_names.to_list()

['DallasStars',
 'kings',
 'Seahawks',
 'Chargers',
 '49ers',
 'Browns',
 'canucks',
 'mlb',
 'FloridaPanthers',
 'buccos',
 'warriors',
 'rbny',
 'NewYorkIslanders',
 'rangers',
 'minnesotaunited',
 'NewYorkMets',
 'UtahJazz',
 'whitecapsfc',
 'TexasRangers',
 'hockey',
 'penguins',
 'AnaheimDucks',
 'angelsbaseball',
 'leafs',
 'nhl',
 'NYYankees',
 'GoNets',
 'ripcity',
 'LAGalaxy',
 'devils',
 'DetroitPistons',
 'Mariners',
 'fcdallas',
 'AustinFC',
 'denvernuggets',
 'DenverBroncos',
 'canes',
 'falcons',
 'ColoradoAvalanche',
 'Habs',
 'Jaguars',
 'CharlotteFootballClub',
 'cowboys',
 'Braves',
 'wildhockey',
 'tfc',
 'SanJoseSharks',
 'NOLAPelicans',
 'ColoradoRockies',
 'OaklandAthletics',
 'Reds',
 'Tennesseetitans',
 'hawks',
 'Predators',
 'Basketball',
 'redsox',
 'detroitlions',
 'phillies',
 'NYCFC',
 'LAClippers',
 'minnesotavikings',
 'caps',
 'football',
 'whitesox',
 'Padres',
 'LosAngelesRams',
 'chicagobulls',
 'clevelandcavs',
 'losangeleskings',
 'BlueJackets',
 '