In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the project's .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Change the working directory to the parent folder — presumably so that the
# relative paths and package imports used further down (data/logs/...,
# load.utils, preprocessing.utils) resolve; confirm against the repo layout.
# NOTE(review): not idempotent — re-running this cell moves up one more level.
%cd '..'

In [None]:
import logging
import sys
from pathlib import Path

# Log file for this notebook; the path is relative to the working directory.
LOG_FILE = Path("data/logs/user_partisanship.log")

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so create the
# directory up front (no-op when it already exists).
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

# Send every record at INFO level or above both to the log file and to the
# cell output (stdout).
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers; if no records show up, consider force=True (Python 3.8+).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

In [None]:
# import polars as pl
import pandas as pd
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; the groupby in the
# user-affiliation loop below relies on it to show a progress bar.
tqdm.pandas()

from load.utils import (
    load_df_from_parquet,
    save_df_as_parquet,
)
from preprocessing.utils import (
    calculate_user_party,
)

In [None]:
# Years for which user partisanship is computed further down.
YEARS = [
    2016,
]

# Record the configured years in the run log.
logging.info(YEARS)

### User Affiliation

In [None]:
# For each configured year: read that year's partisan comments, reduce them to
# a per-author result via calculate_user_party, and store it as parquet.
for year in YEARS:
    logging.info(f"Loading partisan comments {year}")
    partisan_comments_year = load_df_from_parquet(
        file_name=f"partisan_comments_{year}.parquet",
    )

    logging.info(f"Calculating user partisanship {year}")
    # progress_apply is the tqdm-wrapped variant of groupby.apply: same result,
    # plus a progress bar while the groups are processed.
    user_party_year: pd.DataFrame = (
        partisan_comments_year
        .groupby("author")
        .progress_apply(calculate_user_party)  # type: ignore
    )

    # Cast the party label to pandas' "string" dtype, then move the group key
    # out of the index so that "author" can be read as a column below.
    user_party_year = (
        user_party_year
        .astype({"party": "string"})
        .reset_index()
        .copy()
    )

    # The raw comments are no longer needed; release the reference before saving.
    del partisan_comments_year

    logging.info(f"Saving user partisanship {year}")
    save_df_as_parquet(target_file=f"user_party_{year}.parquet", data=user_party_year)

    # Summary for the log: frame dimensions and the number of authors per party.
    logging.info(user_party_year.shape)
    logging.info(user_party_year.groupby("party")["author"].count())
