In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Switch the working directory to the parent folder.
# NOTE(review): presumably this is the repository root so the project packages
# can be imported — confirm. The path is relative, so re-running this cell
# moves up one more level each time.
%cd '..'

In [None]:
import json

import numpy as np

from word_partisanship.utils import (
    logodds_with_prior,
)
from preprocessing.utils import (
    split_by_party,
    load_event_comments,
    build_vocab,
    build_term_vector,
)
from preprocessing.constants import OUTPUT_DIR

In [None]:
# Election year to analyse.
YEAR: int = 2016

# Event identifier: used to load the event's comments and as the prefix of
# the JSON files written at the end of the notebook.
EVENT_NAME: str = f"us_elections_{YEAR}"

# Number of top-ranked tokens printed and saved for each party.
N_DISPLAY: int = 50

In [None]:
# Load the comments of the configured event through the polars backend.
event_comments = load_event_comments(EVENT_NAME, backend="polars")


In [None]:
# Partition the event comments into a Democrat frame and a Republican frame
# (returned in that order), still using the polars backend.
dem_comments, rep_comments = split_by_party(event_comments, backend="polars")


In [None]:
# Sanity check: shape of each party's frame, Democrat first, one per line.
print(dem_comments.shape, rep_comments.shape, sep="\n")

In [None]:
# Convert the polars frames to pandas for the vocabulary / term-vector steps.
# The same names are rebound, so the conversion is guarded: a frame that no
# longer exposes `to_pandas` (i.e. was already converted by a previous run of
# this cell) is left untouched. Without the guard, re-running the cell raises
# AttributeError because pandas DataFrames have no `to_pandas` method.
event_comments, dem_comments, rep_comments = (
    frame.to_pandas() if hasattr(frame, "to_pandas") else frame
    for frame in (event_comments, dem_comments, rep_comments)
)


In [None]:
# Vocabulary is built from the tokens of ALL comments so that both parties
# share the same term indexing.
print("Building vocabulary...")
vocab = build_vocab(event_comments["tokens"], min_comment_freq=1)

# One term vector per corpus, in the order: whole event, Democrat, Republican.
print("Building term vectors...")
term_vec, dem_term_vec, rep_term_vec = (
    build_term_vector(comments["tokens"], vocab)
    for comments in (event_comments, dem_comments, rep_comments)
)

# Per-term log-odds of Democrat vs. Republican usage; the whole-event vector
# is passed first (presumably it acts as the prior — confirm against
# logodds_with_prior).
print("Calculating loggodds...")
logodds = logodds_with_prior(term_vec, dem_term_vec, rep_term_vec)

In [None]:
# Term positions ordered from the lowest to the highest log-odds value.
sorted_ind = np.argsort(logodds)

# Vocabulary tokens arranged by their mapped value in `vocab` (presumably the
# term's position in the term vectors — confirm), so that position i of this
# array lines up with position i of `logodds`.
sorted_vocab = np.array(
    [token for token, _ in sorted(vocab.items(), key=lambda item: item[1])]
)

In [None]:
# The N_DISPLAY tokens with the HIGHEST log-odds values, taken from the end of
# the ascending ranking (so the highest-scoring token comes last in the list).
# `.tolist()` converts the NumPy scalars to native Python strings: with
# `list(...)` the elements stay `np.str_`, which NumPy >= 2 prints as
# `np.str_('token')` and clutters the output. The JSON content is unchanged.
print("Democrat tokens")
dem_idiosyncratic_tokens = sorted_vocab[sorted_ind[-N_DISPLAY:]].tolist()
print(dem_idiosyncratic_tokens)

# Persist the list for later use.
with open(f"{OUTPUT_DIR}/{EVENT_NAME}_dem_idiosyncratic_tokens.json", "w") as f:
    json.dump(dem_idiosyncratic_tokens, f)


In [None]:
# The N_DISPLAY tokens with the LOWEST log-odds values, taken from the start of
# the ascending ranking (so the lowest-scoring token comes first in the list).
# `.tolist()` converts the NumPy scalars to native Python strings: with
# `list(...)` the elements stay `np.str_`, which NumPy >= 2 prints as
# `np.str_('token')` and clutters the output. The JSON content is unchanged.
print("Republican tokens")
rep_idiosyncratic_tokens = sorted_vocab[sorted_ind[:N_DISPLAY]].tolist()
print(rep_idiosyncratic_tokens)

# Persist the list for later use.
with open(f"{OUTPUT_DIR}/{EVENT_NAME}_rep_idiosyncratic_tokens.json", "w") as f:
    json.dump(rep_idiosyncratic_tokens, f)