In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell execution, so edits to the project's .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory up one level. The relative log path used later
# in this notebook ("data/logs/...") is resolved against this directory.
# NOTE(review): presumably this is also what makes the `preprocessing` and
# `word_partisanship` packages importable - confirm.
# NOTE(review): this is not idempotent - re-running the cell moves up one
# more level each time, so run it exactly once per kernel session.
%cd '..'

In [None]:
import json

import numpy as np
from tqdm import tqdm
# Register tqdm's progress-bar variants of the pandas apply methods
# (e.g. `progress_apply`) on pandas objects for this kernel session.
# NOTE(review): no `progress_*` call is visible in this part of the notebook;
# presumably the imported project helpers rely on it - confirm before removing.
tqdm.pandas()

from word_partisanship.utils import (
    logodds_with_prior,
)
from preprocessing.utils import (
    split_by_party,
    load_event_comments,
    build_term_vector,
    load_event_vocab,
)
from preprocessing.constants import OUTPUT_DIR


In [None]:
import logging
import os
import sys

# Log file location, relative to the current working directory.
LOG_FILE = "data/logs/word_partisanship.log"

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so make sure the
# directory exists before configuring logging.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Send INFO-and-above records to both the log file and stdout (so they show
# up in the notebook output), using a "timestamp [LEVEL] message" layout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

In [None]:
# Theme passed as `theme=` to the event loaders in the analysis loop.
THEME = "elections"

# Events processed one after another by the analysis loop.
EVENT_NAMES = ["us_elections_2016", "us_midterms_2018"]

# Number of tokens kept from each end of the log-odds ranking.
N_DISPLAY = 50

# Handed to build_term_vector as `ngram_range`.
# NOTE(review): presumably (min_n, max_n), i.e. unigrams and bigrams - confirm.
NGRAM_RANGE = (1, 2)

In [None]:
# For every configured event: load its comments and vocabulary, build term
# vectors for all / Democrat / Republican comments, score every vocabulary
# entry with logodds_with_prior, and save the N_DISPLAY highest- and
# lowest-scoring tokens to a JSON file in OUTPUT_DIR.
for event_name in EVENT_NAMES:
    # Read event data
    logging.info(f"Loading event data {event_name}...")
    event_comments = load_event_comments(theme=THEME, event_name=event_name)
    event_vocab = load_event_vocab(theme=THEME, event_name=event_name)

    # Invert the vocabulary so that a position in the log-odds array can be
    # mapped back to its token (the vocabulary is used here as token -> index).
    ind_to_token = {v: k for k, v in event_vocab.items()}

    dem_comments, rep_comments = split_by_party(
        comments=event_comments,
    )

    logging.info(dem_comments.shape)
    logging.info(rep_comments.shape)

    tokens = event_comments["tokens"]
    dem_tokens = dem_comments["tokens"]
    rep_tokens = rep_comments["tokens"]

    # All three vectors share the same vocabulary, so their entries line up.
    logging.info("Building overall term vector...")
    overall_term_vec = build_term_vector(
        tokens, ngram_range=NGRAM_RANGE, vocab=event_vocab
    )
    logging.info("Building dem term vector...")
    dem_term_vec = build_term_vector(
        dem_tokens, ngram_range=NGRAM_RANGE, vocab=event_vocab
    )
    logging.info("Building rep term vector...")
    rep_term_vec = build_term_vector(
        rep_tokens, ngram_range=NGRAM_RANGE, vocab=event_vocab
    )

    logging.info("Calculating loggodds...")
    logodds = logodds_with_prior(
        overall_term_vec,
        dem_term_vec,
        rep_term_vec,
        zscore=True,
    )

    # Vocabulary indices ordered from lowest to highest score.
    # NOTE(review): the code below treats the highest scores as Democrat and
    # the lowest as Republican; this relies on the sign convention of
    # logodds_with_prior - confirm against its implementation.
    sorted_logodds_indices = np.argsort(logodds)

    # Scores are converted to plain Python floats: json.dump cannot serialize
    # NumPy scalar types other than float64, and plain floats keep the logged
    # lists readable.
    logging.info("Democrat tokens")
    # Reverse first, then take the first N_DISPLAY entries. Slicing with
    # [-N_DISPLAY:] would return the whole array when N_DISPLAY is 0.
    dem_idiosyncratic_tokens = [
        (ind_to_token[index], float(logodds[index]))
        for index in sorted_logodds_indices[::-1][:N_DISPLAY]
    ]
    logging.info(dem_idiosyncratic_tokens)

    logging.info("Republican tokens")
    rep_idiosyncratic_tokens = [
        (ind_to_token[index], float(logodds[index]))
        for index in sorted_logodds_indices[:N_DISPLAY]
    ]
    logging.info(rep_idiosyncratic_tokens)

    # Each entry is a (token, score) pair; JSON stores the pairs as lists.
    idiosyncratic_tokens = {
        "dem": dem_idiosyncratic_tokens,
        "rep": rep_idiosyncratic_tokens,
    }

    with open(f"{OUTPUT_DIR}/{event_name}_idiosyncratic_tokens.json", "w") as f:
        json.dump(idiosyncratic_tokens, f)
