In [1]:
# Enable the IPython autoreload extension in mode 2 so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Step one directory up (presumably to the project root, so that the package
# imports and the relative "data/..." paths used below resolve — confirm).
# NOTE: the target is relative, so re-running this cell moves up one more
# level each time; run it exactly once per kernel session.
%cd '..'

/dss/dsshome1/03/di93fup/polarization_reddit


In [2]:
import json

import numpy as np

from word_partisanship.utils import (
    logodds_with_prior,
)
from preprocessing.utils import (
    split_by_party,
    load_event_comments,
    load_event_vocab,
    build_vocab,
    build_term_vector,
)
from preprocessing.constants import OUTPUT_DIR, MIN_OCCURENCE_FOR_VOCAB

In [3]:
import logging
import sys

# Send every log record both to the persistent log file and to the
# notebook's stdout stream.
_log_handlers = [
    logging.FileHandler("data/logs/word_partisanship.log"),
    logging.StreamHandler(stream=sys.stdout),
]

# Root logger configuration: INFO and above, timestamped messages.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)

In [4]:
# Election year under analysis; only used to build EVENT_NAME.
YEAR = 2016

# Event identifier: passed to load_event_comments and used as the prefix of
# the JSON output file names written at the end of the notebook.
EVENT_NAME = f"us_elections_{YEAR}"

# Number of tokens taken from each end of the log-odds ranking.
N_DISPLAY = 50

In [5]:
# Read the comments of the configured event through the project loader.
# Later cells read event_comments["tokens"], so the result is presumably a
# DataFrame-like object with a "tokens" column — TODO confirm.
event_comments = load_event_comments(event_name=EVENT_NAME)


In [6]:
# Partition the event comments into the two party subsets used below.
dem_comments, rep_comments = split_by_party(comments=event_comments)


In [7]:
# Log the shape of each party subset (Democrat first, then Republican).
for _party_comments in (dem_comments, rep_comments):
    logging.info(_party_comments.shape)

2023-06-08 16:41:41,345 [INFO] (2782950, 9)
2023-06-08 16:41:41,346 [INFO] (1526884, 9)


In [8]:
# Build the vocabulary from the tokens of ALL event comments.
# ngram_range=(1, 2) presumably means unigrams and bigrams, and min_df the
# minimum occurrence threshold for a term to be kept — confirm against
# preprocessing.utils.build_vocab.
event_vocab = build_vocab(event_comments["tokens"], ngram_range=(1, 2), min_df=MIN_OCCURENCE_FOR_VOCAB)

# Term vectors over the shared vocabulary: one for the whole corpus and one
# per party, so all three are indexed by the same vocabulary.
#
# NOTE(review): in the recorded run this cell raised MemoryError right after
# the "Building overall term vector..." message, i.e. inside the first
# build_term_vector call; none of the later steps executed and the cells
# below have no execution count. TODO: reduce the memory footprint of this
# step (e.g. inside build_term_vector) before relying on the cells below.
logging.info("Building overall term vector...")
term_vec = build_term_vector(event_comments["tokens"], event_vocab)
logging.info("Building dem term vector...")
dem_term_vec = build_term_vector(dem_comments["tokens"], event_vocab)
logging.info("Building rep term vector...")
rep_term_vec = build_term_vector(rep_comments["tokens"], event_vocab)

# Per-term log-odds computed from the overall, Democrat and Republican term
# vectors (in that argument order). The overall vector is presumably the
# prior — verify against word_partisanship.utils.logodds_with_prior.
logging.info("Calculating loggodds...")
logodds = logodds_with_prior(
    term_vec,
    dem_term_vec,
    rep_term_vec,
)


2023-06-08 16:41:41,436 [INFO] Building overall term vector...


MemoryError: 

In [None]:
# Indices that sort the log-odds in ascending order: the lowest scores come
# first and the highest last. Both ends of this ordering are used below.
sorted_logodds_indices = np.argsort(logodds)

In [None]:
# Array of vocabulary tokens ordered by their vocabulary index, so that the
# indices produced from the log-odds vector can be used to look tokens up
# (assumes the indices are contiguous from 0 — confirm against build_vocab).
vocab_tokens = np.array(sorted(event_vocab, key=event_vocab.get))


In [None]:
# Report and persist the N_DISPLAY tokens with the HIGHEST log-odds.
# NOTE(review): labelling these as Democrat assumes logodds_with_prior scores
# Democrat usage as positive — confirm against its implementation.
logging.info("Democrat tokens")

# Reverse, take the first N_DISPLAY, reverse back: this keeps the same
# ascending order as `[-N_DISPLAY:]`, but yields an empty selection instead
# of the whole vocabulary when N_DISPLAY is 0.
_top_dem_indices = sorted_logodds_indices[::-1][:N_DISPLAY][::-1]

# .tolist() converts the NumPy string scalars to built-in str, so the logged
# list stays readable (NumPy >= 2 reprs them as np.str_('...')) and the value
# written to disk is plain JSON data.
dem_idiosyncratic_tokens = vocab_tokens[_top_dem_indices].tolist()
logging.info(dem_idiosyncratic_tokens)

# Persist the tokens as a JSON array next to the other event outputs.
with open(f"{OUTPUT_DIR}/{EVENT_NAME}_dem_idiosyncratic_tokens.json", "w") as f:
    json.dump(dem_idiosyncratic_tokens, f)


In [None]:
# Report and persist the N_DISPLAY tokens with the LOWEST log-odds.
# NOTE(review): labelling these as Republican assumes logodds_with_prior
# scores Republican usage as negative — confirm against its implementation.
logging.info("Republican tokens")

# .tolist() converts the NumPy string scalars to built-in str, so the logged
# list stays readable (NumPy >= 2 reprs them as np.str_('...')) and the value
# written to disk is plain JSON data.
rep_idiosyncratic_tokens = vocab_tokens[sorted_logodds_indices[:N_DISPLAY]].tolist()
logging.info(rep_idiosyncratic_tokens)

# Persist the tokens as a JSON array next to the other event outputs.
with open(f"{OUTPUT_DIR}/{EVENT_NAME}_rep_idiosyncratic_tokens.json", "w") as f:
    json.dump(rep_idiosyncratic_tokens, f)
