In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from betabinomial import pval_adj
from scipy.stats import fisher_exact

In [2]:
# Read the shared project configuration. Later cells use its 'us_51_state'
# entry to restrict the analysis to the listed states.
with open("/src/main_configs.json", 'r') as f:
    main_configs = json.loads(f.read())

# Loading Data

In [4]:
# Directory that holds the pre-computed count tables; also used as the output
# directory for the state-level results written further down.
dataframes_path = "/data/processed/data_frames"

# Long-format sentiment counts: the following cell pivots these on the
# 'state' / 'created_at' key, the 'label' column and the 'counts' values.
state_and_sentiment_counts = pd.read_csv(
    f"{dataframes_path}/state_and_sentiment_counts.csv"
)
date_and_sentiment_counts = pd.read_csv(
    f"{dataframes_path}/date_and_sentiment_counts.csv"
)

In [6]:
# Keep only the states listed in the project config, then reshape from long
# format (one row per state/label) to wide format (one row per state, one
# column per sentiment label). Missing state/label combinations become 0.
#
# The integer cast is part of the chain on purpose: assigning through
# `frame.iloc[:, 1:] = frame.iloc[:, 1:].astype(int)` writes into the existing
# float columns on pandas >= 2.0, so the columns silently stay float there.
#
# NOTE(review): pivot_table aggregates with the mean by default. This assumes
# exactly one row per (state, label) in the CSV -- confirm, otherwise pass
# aggfunc='sum'.
# NOTE: this cell rebinds its own inputs, so re-run the loading cell before
# re-running it (the pivoted frame no longer has a 'label' column).
state_and_sentiment_counts = (
    state_and_sentiment_counts[
        state_and_sentiment_counts['state'].isin(main_configs['us_51_state'])
    ]
    .pivot_table(values='counts', index='state', columns='label')
    .fillna(0)
    .astype(int)
    .reset_index()
)

# Same long-to-wide reshape for the per-date counts, keyed by 'created_at'.
date_and_sentiment_counts = (
    date_and_sentiment_counts
    .pivot_table(values='counts', index='created_at', columns='label')
    .fillna(0)
    .astype(int)
    .reset_index()
)

# Fisher's Exact Test: Pro/Anti Counts per State vs. All Other States

In [8]:
def state_table(state, counts=None):
    """Build the 2x2 Pro/Anti contingency table for one state versus the rest.

    Note: a later cell in this notebook defines another
    ``state_table(state, topic)``; once that cell has run it replaces this
    function in the kernel, so run the state-level cells before it.

    Parameters
    ----------
    state
        Value matched against the 'state' column.
    counts : pandas.DataFrame, optional
        Wide table with 'state', 'Pro' and 'Anti' columns. Defaults to the
        notebook-level ``state_and_sentiment_counts`` so existing calls keep
        working; pass a frame explicitly to use the function without
        relying on notebook state.

    Returns
    -------
    numpy.ndarray
        Shape (2, 2): row 0 is ``[Pro, Anti]`` of the first row matching
        ``state``; row 1 is ``[Pro, Anti]`` summed over all other rows.

    Raises
    ------
    IndexError
        If no row matches ``state``.
    """
    if counts is None:
        counts = state_and_sentiment_counts

    is_state = (counts['state'] == state).to_numpy()
    # Column order is [Pro, Anti]; it fixes the orientation of the odds ratio.
    sentiment = counts[["Pro", "Anti"]].to_numpy()

    state_count = sentiment[is_state][0]
    rest = sentiment[~is_state].sum(axis=0)

    return np.array([state_count, rest])

In [None]:
# Two-sided Fisher's exact test for every state, in alphabetical order,
# against the pooled counts of all other states.
fisher_values = [
    fisher_exact(state_table(state_name), alternative='two-sided')
    for state_name in sorted(state_and_sentiment_counts['state'].values)
]

In [None]:
# Each fisher_exact result is indexed as (odds ratio, p-value); split the
# results into two parallel lists that share the order of `fisher_values`.
odd_values = [result[0] for result in fisher_values]
p_values = [result[1] for result in fisher_values]

In [None]:
# Assemble the state-level results, one row per state in alphabetical order
# (both the state list and the tweet totals are sorted by state so they align).
#   'padj'       : -log10 of the values returned by pval_adj
#                  (presumably multiple-testing-adjusted p-values -- confirm)
#   'odd_ratios' : natural log of the odds ratios
df = pd.DataFrame({
    'state': sorted(state_and_sentiment_counts['state'].values),
    'tweet_counts': (
        state_and_sentiment_counts
        .sort_values('state')[['Anti', 'Pro']]
        .sum(axis=1)
        .values
    ),
    'padj': -np.log10(pval_adj(np.array(p_values))),
    'odd_ratios': np.log(odd_values),
})

In [None]:
# Persist the state-level test results next to the input tables.
df.to_parquet(
    f"{dataframes_path}/fisher_exact_test_results_state.parquet",
    index=False,
)

### CTM topics: Fisher's exact test per topic (state vs. all other states)

In [None]:
# Per-state topic counts (columns used below: 'state', 'topic', 'counts'),
# restricted to the states listed in the project config.
us_tweets_ctm = (
    pd.read_parquet(f"{dataframes_path}/state_topic_count_8_200_0.2.parquet")
    .loc[lambda frame: frame['state'].isin(main_configs['us_51_state'])]
    .reset_index(drop=True)
)

In [None]:
def state_table(state, topic, topic_counts=None):
    """Build the 2x2 contingency table for one (state, topic) pair.

    Note: this reuses the name of the state-level ``state_table(state)``
    defined earlier in the notebook and replaces it in the kernel.

    Parameters
    ----------
    state
        Value matched against the 'state' column.
    topic
        Value matched against the 'topic' column.
    topic_counts : pandas.DataFrame, optional
        Long table with 'state', 'topic' and 'counts' columns. Defaults to
        the notebook-level ``us_tweets_ctm`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        Shape (2, 2)::

            [[state & topic,      state & other topics],
             [other states & topic, other states & other topics]]

        Every cell is a sum of 'counts'; a combination with no rows
        contributes 0.
    """
    if topic_counts is None:
        topic_counts = us_tweets_ctm

    counts = topic_counts["counts"].to_numpy()
    in_state = (topic_counts['state'] == state).to_numpy()
    in_topic = (topic_counts['topic'] == topic).to_numpy()

    # Summing an empty selection yields 0, so a missing (state, topic) pair
    # needs no try/except. Summing also keeps this cell consistent with the
    # other three if a pair ever appears on more than one row.
    state_topic = counts[in_state & in_topic].sum()
    state_rest = counts[in_state & ~in_topic].sum()
    rest_topic = counts[~in_state & in_topic].sum()
    rest_rest = counts[~in_state & ~in_topic].sum()

    return np.array([[state_topic, state_rest],
                     [rest_topic, rest_rest]])

In [None]:
# Total count per state across all sentiment-label columns.
# The columns to add up are selected by name (everything except 'state' and a
# previously written 'sum'), so re-running this cell does not fold the old
# total into the new one.
state_and_sentiment_counts['sum'] = (
    state_and_sentiment_counts
    .drop(columns=['state', 'sum'], errors='ignore')
    .to_numpy()
    .sum(axis=1)
)

In [None]:
# Topic ids 1..8 (presumably the 8 in the topic-count file name -- confirm).
topics = list(range(1, 9))

# States present in the topic table, in alphabetical order; `tweet_counts`
# holds the summed 'counts' of each state in that same order.
states = sorted(us_tweets_ctm['state'].unique())
tweet_counts = np.array([
    us_tweets_ctm.loc[us_tweets_ctm['state'] == state_name, "counts"].values.sum()
    for state_name in states
])

In [None]:
# One result frame per topic: for each topic, test every state against all
# other states, adjust that topic's p-values together, and collect the frame.
# Note: this reuses (and overwrites) `fisher_values`, `p_values`, `odd_values`
# and `df` from the state-level analysis above.
dfs = []
for topic in tqdm(topics):
    # Rows follow `states`; column 0 is the odds ratio, column 1 the p-value.
    fisher_values = np.array([
        fisher_exact(state_table(state_name, topic), alternative='two-sided')
        for state_name in states
    ])
    odd_values = fisher_values[:, 0]
    p_values = fisher_values[:, 1]

    df = pd.DataFrame({
        'state': states,
        'tweet_counts': tweet_counts,
        'padj': -np.log10(pval_adj(np.array(p_values))),
        'odd_ratios': np.log(odd_values),
        'topic': np.full(len(states), topic),
    })

    dfs.append(df)

100%|██████████| 8/8 [00:05<00:00,  1.50it/s]


In [None]:
# Stack the per-topic frames into one table and save it.
# `dfs_saving_path` is not defined in any visible cell of this notebook, so a
# fresh kernel would fail here; write to `dataframes_path`, the same directory
# the state-level results are saved to.
# NOTE(review): if the CTM results were meant to go to a different directory,
# define that path in the loading cell and use it here instead.
# This rebinds `dfs` from a list to a DataFrame, so re-run the loop cell before
# re-running this one.
dfs = pd.concat(dfs, axis=0, ignore_index=True)
dfs.to_parquet(f"{dataframes_path}/fisher_exact_test_results_ctm.parquet", index=False)