In [None]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import fisher_exact

from src.scripts.read_data import ReadData

In [None]:
# Input directory of raw world CSVs and the directory that holds the
# intermediate / output parquet frames used throughout this notebook.
world_data_path = "/data/raw/daily_world_en_csv"
dataframes_path = "/data/processed/dataframes"

# Project-wide settings, parsed from JSON into plain Python objects.
with open("/src/main_configs.json", 'r') as config_file:
    main_configs = json.loads(config_file.read())

In [None]:
# Sentiment score tables for the world and US tweet sets. Later cells read
# the 'id', 'Pro', 'Anti' and 'Rest' columns from both of them.
world_data_sentiments_raw, us_tweets_sentiments_raw = (
    pd.read_parquet(parquet_file)
    for parquet_file in (
        f"{dataframes_path}/world_data_sentiments_raw.parquet",
        f"{dataframes_path}/us_tweets_sentiments_raw.parquet",
    )
)

# Load the raw world CSVs through the project reader, requesting 'id' and
# 'text'. NOTE(review): 'text' is presumably only needed by the reader's
# filter_tweets step -- confirm in ReadData. It is dropped right afterwards.
read_data_world = ReadData(world_data_path, ['id', 'text'], filter_tweets=True)
read_data_world.read_csvs_and_combine_data()
read_data_world.data = read_data_world.data.drop(labels='text', axis=1)

  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
100%|██████████| 73/73 [01:07<00:00,  1.07it/s]


In [None]:
# Constant float column of ones, one per row. It is removed again right after
# the joins in the next cell.
# NOTE(review): presumably a placeholder so the frame has a non-key column
# during the join -- confirm it is still needed.
read_data_world.data['dummy_value'] = np.ones(len(read_data_world.data))

In [None]:
def _confident_labels(sentiments, threshold=0.99):
    """Keep confidently classified tweets and attach a binary stance label.

    Parameters
    ----------
    sentiments : pandas.DataFrame
        Frame with at least the score columns 'Pro', 'Anti' and 'Rest'.
    threshold : float, default 0.99
        A row is kept when the larger of its 'Pro' and 'Anti' scores is at
        least this value.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the kept rows, with the three score columns
        removed and a 'label' column added: 0 when 'Anti' is the larger score
        (or on a tie), 1 when 'Pro' is.
    """
    is_confident = np.max(sentiments.loc[:, ['Pro', 'Anti']].values, axis=1) >= threshold
    # Explicit copy: assigning a new column to a frame sliced from another
    # frame is what triggers pandas' SettingWithCopyWarning.
    confident = sentiments.loc[is_confident, :].copy()
    confident['label'] = np.argmax(confident.loc[:, ['Anti', 'Pro']].values, axis=1)
    return confident.drop(columns=['Rest', 'Pro', 'Anti'])


# Left-join the labelled tweets onto the loaded world tweet ids. dropna()
# then discards every row that has a missing value in any column (this
# includes ids with no confident label), and the helper column is removed.
filtered_us = _confident_labels(us_tweets_sentiments_raw)
us_ready = read_data_world.data.join(filtered_us.set_index("id"), on='id').dropna().drop(columns=['dummy_value'])
us_ready = us_ready.reset_index(drop=True)

filtered_world = _confident_labels(world_data_sentiments_raw)
world_ready = read_data_world.data.join(filtered_world.set_index("id"), on='id').dropna().drop(columns=['dummy_value'])
world_ready = world_ready.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [None]:
# Store the stance label as an integer column in both frames.
# NOTE(review): the column is presumably float here because the earlier left
# join introduced NaNs before dropna() -- confirm.
for labelled_frame in (us_ready, world_ready):
    labelled_frame['label'] = labelled_frame['label'].astype(int)

# Independent (deep) copy of the US frame, used for the per-day analysis.
daily_us_ready = us_ready.copy()

In [None]:
# Truncate each 'created_at' string to a fixed-length prefix and parse it:
# 7 characters for the monthly frames, 11 for the daily frame.
# NOTE(review): presumably the strings are ISO-like ('YYYY-MM-DD...'), in
# which case a date is 10 characters and the 11th is the separator -- confirm
# the format and whether 11 is intended.
for dated_frame, prefix_length in ((us_ready, 7), (world_ready, 7), (daily_us_ready, 11)):
    dated_frame['created_at'] = pd.to_datetime(dated_frame['created_at'].str[:prefix_length])

In [None]:
# Collapse every frame to one row per (created_at, label) pair holding the
# number of tweets in 'counts', ordered by date with a fresh 0..n-1 index.
us_ready, world_ready, daily_us_ready = (
    tweets.value_counts(['created_at', 'label'])
    .reset_index(name='counts')
    .sort_values('created_at')
    .reset_index(drop=True)
    for tweets in (us_ready, world_ready, daily_us_ready)
)

In [None]:
def _pivot_label_counts(counts):
    """Reshape long (created_at, label, counts) rows into one row per date.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame with the columns 'created_at', 'label' and 'counts'.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'created_at', 'Anti' and 'Pro'. A date that has
        no tweets for one of the labels gets a count of 0 instead of NaN.
    """
    pivoted = counts.pivot_table(values='counts', index='created_at', columns='label', fill_value=0)
    # Labels come from an argmax over ['Anti', 'Pro'], so 0 = Anti and 1 = Pro.
    # Reindexing guarantees both columns exist in that order even when one
    # label never occurs, which a positional rename would not survive.
    pivoted = pivoted.reindex(columns=[0, 1], fill_value=0)
    pivoted.columns = ['Anti', 'Pro']
    return pivoted.reset_index()


us_ready = _pivot_label_counts(us_ready)
world_ready = _pivot_label_counts(world_ready)
daily_us_ready = _pivot_label_counts(daily_us_ready)

In [None]:
# Date values of the monthly and the daily US frames, as NumPy arrays; the
# Fisher test cells below iterate over them.
unique_dates, daily_unique_dates = (
    pivoted['created_at'].values for pivoted in (us_ready, daily_us_ready)
)

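### Fisher's Exact Test

For each date, the Pro/Anti tweet counts on that date are compared against the combined counts of all other dates using a two-sided Fisher's exact test.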

In [None]:
def date_table(table, all_dates):
    """Build one 2x2 contingency table per date: that date versus all others.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with the columns 'created_at', 'Pro' and 'Anti'. Missing counts
        (NaN) are treated as 0.
    all_dates : iterable
        Dates to build tables for; each must occur in ``table['created_at']``.

    Returns
    -------
    list of numpy.ndarray
        One array per entry of ``all_dates``, laid out as
        ``[[Pro on date, Anti on date], [Pro elsewhere, Anti elsewhere]]``.

    Raises
    ------
    IndexError
        If a requested date does not occur in ``table['created_at']``.
    """
    dates = table['created_at']
    # A NaN count means "no tweets with that label"; left as NaN it would
    # propagate into the complement row of every table.
    counts = table[["Pro", "Anti"]].fillna(0).to_numpy()
    # Column totals are loop-invariant, so the complement of a date is just
    # total minus that date's rows -- no second scan per date.
    totals = counts.sum(axis=0)

    fisher_tables = []
    for date in all_dates:
        on_date = counts[(dates == date).to_numpy()]
        if on_date.shape[0] == 0:
            raise IndexError(f"date {date!r} not found in table['created_at']")

        # First matching row is the date's own counts; every matching row is
        # excluded from the complement.
        date_count = on_date[0]
        rest = totals - on_date.sum(axis=0)

        fisher_tables.append(np.array([date_count, rest]))

    return fisher_tables

In [None]:
# Two-sided Fisher's exact test on every per-date 2x2 table. Each result is
# indexable as (odds ratio, p-value).
us_fisher_values = [fisher_exact(table, alternative='two-sided') for table in date_table(us_ready, unique_dates)]
# The world frame is iterated over its own dates rather than the US dates,
# so a month present in only one of the two data sets can neither be skipped
# nor looked up in a frame that does not contain it.
world_fisher_values = [fisher_exact(table, alternative='two-sided') for table in date_table(world_ready, world_ready['created_at'].values)]

daily_us_fisher_values = [fisher_exact(table, alternative='two-sided') for table in date_table(daily_us_ready, daily_unique_dates)]

In [None]:
def _fisher_results_frame(counts, fisher_values):
    """Assemble the per-date result table for one data set.

    Parameters
    ----------
    counts : pandas.DataFrame
        Per-date frame with the columns 'created_at', 'Anti' and 'Pro'.
    fisher_values : sequence
        One Fisher test result per row of ``counts``, each indexable as
        (odds ratio, p-value).

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'tweet_counts' (Anti + Pro), 'p_val' and 'odd_ratios'.
        'odd_ratios' holds the natural log of the odds ratio, so an odds ratio
        of 0 or inf is stored as -inf or inf.
    """
    odds_ratios = [result[0] for result in fisher_values]
    p_values = [result[1] for result in fisher_values]
    # Dates and tweet counts are read from the same frame so each row's date
    # always belongs to that row's counts.
    return pd.DataFrame({'date': counts['created_at'].values,
                         'tweet_counts': counts[['Anti', 'Pro']].sum(axis=1).values,
                         'p_val': p_values,
                         'odd_ratios': np.log(odds_ratios)})


us_df = _fisher_results_frame(us_ready, us_fisher_values)
world_df = _fisher_results_frame(world_ready, world_fisher_values)
daily_us_df = _fisher_results_frame(daily_us_ready, daily_us_fisher_values)

In [None]:
# Persist the three result tables as parquet files, without the row index.
for result_frame, destination in (
    (us_df, f"{dataframes_path}/monthly_fisher_exact_test_results_for_us_tweets.parquet"),
    (world_df, f"{dataframes_path}/monthly_fisher_exact_test_results_for_world_tweets.parquet"),
    (daily_us_df, f"{dataframes_path}/daily_fisher_exact_test_results_for_us_tweets.parquet"),
):
    result_frame.to_parquet(destination, index=False)