In [None]:
import json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

from src.scripts.tools import Tools
from src.scripts.incremental_imputer import Imputer

In [None]:
def date_range(start, end):
    """Return one value per whole day from `start` up to, but excluding, `end`.

    The number of entries is the `.days` component of `end - start`, so a
    trailing partial day is ignored and a non-positive span yields an empty list.
    """
    whole_days = (end - start).days
    collected = []
    for offset in range(whole_days):
        collected.append(start + timedelta(days=offset))
    return collected

In [None]:
# Project helpers imported from src.scripts.
# `tools` is not referenced again in the cells shown in this notebook;
# `incremental_imputer.impute_data` is called further down on the
# 'unemployment' and 'people_vaccinated_per_hundred' columns.
tools = Tools()
incremental_imputer = Imputer()

In [None]:
# Folders used by the cells below.
# NOTE(review): these are hard-coded absolute paths — adjust if the data lives elsewhere.
dataframes_path = "/data/processed/data_frames"  # Fisher test results are read from here; the final tables are written here
covid_data_path = "/data/raw/covid_data"  # owid-covid-data.csv and us_state_vaccinations.parquet
socio_economic_params_path = "/data/raw/socio_economic_params"  # date_base_unemployment.parquet

In [None]:
# Project configuration; the cells below read its 'us_states_and_abbreviations'
# and 'us_51_state' entries.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open("/src/main_configs.json", 'r', encoding='utf-8') as f:
    main_configs = json.load(f)

### Date-based covid data and ATV

In [None]:
# Fisher exact test results keyed by `created_at`; the column is parsed into
# datetimes right after loading.
fisher_exact_test_results_date = (
    pd.read_parquet(f"{dataframes_path}/fisher_exact_test_results_date.parquet")
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)

# Raw unemployment table; it is trimmed and renamed in a later cell.
date_base_unemployment = pd.read_parquet(f"{socio_economic_params_path}/date_base_unemployment.parquet")

In [None]:
# Load the OWID export and keep only the USA rows and the columns used below.
covid_data = pd.read_csv(f"{covid_data_path}/owid-covid-data.csv")
# .copy() turns the filtered selection into an independent frame, so the
# assignments below do not write into a slice of the raw CSV frame
# (which is what triggers pandas' SettingWithCopyWarning).
covid_data = covid_data.loc[
    covid_data['iso_code'] == "USA",
    ['date', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'people_vaccinated_per_hundred'],
].copy()

# Missing new_cases become 1.0 (presumably so the later log10 transform maps
# them to 0); missing deaths and vaccination values become 0.0.
# total_cases is intentionally left untouched, as in the original cell.
covid_data = covid_data.fillna({
    'new_cases': 1.0,
    'total_deaths': 0.0,
    'new_deaths': 0.0,
    'people_vaccinated_per_hundred': 0.0,
})

# Parse dates before concatenating so the merged 'date' column never holds a
# mix of strings and datetime objects.
covid_data['date'] = pd.to_datetime(covid_data['date'])

# Zero-filled rows for 2020-01-01 .. 2020-01-21 (date_range excludes its end).
# NOTE(review): assumes the USA series starts on 2020-01-22 — TODO confirm
# against the CSV, otherwise these rows would duplicate existing dates.
padding_dates = date_range(datetime(2020, 1, 1), datetime(2020, 1, 22))
dummy_values = [0.0] * len(padding_dates)
# Building the frame column by column keeps the numeric columns numeric; going
# through a single mixed-type np.array would force every column to object dtype.
dummy_df = pd.DataFrame(
    {column: padding_dates if column == 'date' else dummy_values for column in covid_data.columns}
)

covid_data = pd.concat([dummy_df, covid_data], axis=0).reset_index(drop=True)

# Keep everything strictly before 2022; .copy() again so later cells can assign
# columns on this frame without a SettingWithCopyWarning.
covid_data = covid_data[covid_data['date'] < '2022-01-01'].copy()

In [None]:
# Align the Fisher results with the daily COVID table on 'date' and drop the
# adjusted p-value column, which is not kept in the merged table.
# The rename is re-bound instead of done in place so the statement does not
# mutate a frame created by another cell behind its back.
fisher_exact_test_results_date = fisher_exact_test_results_date.rename(columns={'created_at': 'date'})
covid_data = covid_data.join(fisher_exact_test_results_date.set_index('date'), on='date').drop(columns=['padj'])

# Columns are addressed by name rather than by position (previously
# iloc[:, 1:-3] and iloc[:, -3]) so the transform cannot silently hit the wrong
# columns if the joined frame gains or loses a column.
log_scaled_columns = ['total_cases', 'new_cases', 'total_deaths', 'new_deaths']

# log10(0) is -inf and NumPy reports it as a divide-by-zero RuntimeWarning.
# Those -inf values are deliberately mapped to 0 just below, so the warning is
# silenced for this statement only.
with np.errstate(divide='ignore'):
    covid_data.loc[:, log_scaled_columns] = np.log10(
        covid_data[log_scaled_columns].values.astype(np.float32)
    )

# Rescale the per-hundred vaccination figure to a fraction.
covid_data.loc[:, 'people_vaccinated_per_hundred'] = covid_data['people_vaccinated_per_hundred'].values / 100

# Applies to every column of the frame, exactly as before.
covid_data = covid_data.replace(-np.inf, 0)

  after removing the cwd from sys.path.


In [None]:
def _reorder_date_tokens(raw_date):
    """Rebuild a '/'-separated date as '<last>-<first>-<second>'.

    Presumably turns month/day/year into year-month-day — verify against the raw table.
    """
    tokens = raw_date.split('/')
    return '-'.join([tokens[-1], tokens[0], tokens[1]])


# Skip the first row, keep the first and the third-from-last column, give them
# readable names and discard incomplete rows.
date_base_unemployment = (
    date_base_unemployment
    .iloc[1:, [0, -3]]
    .rename(columns={'Unnamed: 0':'date', 'I.U.R':'unemployment'})
    .dropna()
)
date_base_unemployment['date'] = pd.to_datetime(date_base_unemployment['date'].apply(_reorder_date_tokens))

# Only observations from 2020 onwards are used.
from_2020_onwards = date_base_unemployment['date'] >= '2020-01-01'
date_base_unemployment = date_base_unemployment[from_2020_onwards]

date_base_unemployment['unemployment'] = date_base_unemployment['unemployment'].astype(np.float32)

In [None]:
# One row per day from 2020-01-01 through 2021-12-31 (date_range excludes its end).
days = date_range(datetime(2020, 1, 1), datetime(2022, 1, 1))

# Attach the known unemployment values to the daily calendar; days without an
# observation end up as NaN in 'unemployment'.
date_base_unemployment_all = pd.DataFrame({'date': days}).join(
    date_base_unemployment.set_index('date'), on='date'
)

# The return value is not used here, so this presumably fills the column in
# place — see src.scripts.incremental_imputer to confirm.
incremental_imputer.impute_data(date_base_unemployment_all, 'unemployment')

# Add the daily unemployment column to the merged COVID table.
covid_data = covid_data.join(date_base_unemployment_all.set_index('date'), on='date')
covid_data.head()

Unnamed: 0,date,total_cases,new_cases,total_deaths,new_deaths,people_vaccinated_per_hundred,tweet_counts,odd_ratios,unemployment
0,2020-01-01,0.0,0.0,0.0,0.0,0.0,965,-0.084875,1.5
1,2020-01-02,0.0,0.0,0.0,0.0,0.0,1569,0.294055,1.5
2,2020-01-03,0.0,0.0,0.0,0.0,0.0,1370,0.306792,1.5
3,2020-01-04,0.0,0.0,0.0,0.0,0.0,1000,0.0829,1.5
4,2020-01-05,0.0,0.0,0.0,0.0,0.0,989,-0.161164,1.5


In [None]:
# Persist the merged daily table (COVID columns, Fisher test columns and
# unemployment) to the processed-data folder, without writing the index.
covid_data.to_parquet(f"{dataframes_path}/date_based_covid_data_ATV_and_unemployment.parquet", index=False)

### State-based Vaccination Percentage and ATV

In [None]:
# State-level vaccination series: keep date, location and the per-hundred figure.
vaccination_percentage = pd.read_parquet(f"{covid_data_path}/us_state_vaccinations.parquet")[['date', 'location', 'people_vaccinated_per_hundred']]
vaccination_percentage['date'] = pd.to_datetime(vaccination_percentage['date'])

# Restrict to rows strictly before 2022 and renumber the rows.
before_2022 = vaccination_percentage['date'] < '2022-01-01'
vaccination_percentage = vaccination_percentage[before_2022].reset_index(drop=True)

# Normalise the one location spelled differently, then translate each location
# through the configured mapping (presumably full state name -> abbreviation).
# Locations missing from the mapping become None.
state_lookup = main_configs['us_states_and_abbreviations']
vaccination_percentage['location'] = (
    vaccination_percentage['location']
    .replace({'New York State': 'New York'})
    .apply(lambda full_name: state_lookup.get(full_name))
)

In [None]:
# Impute each state's vaccination series separately (the imputer returns a new
# frame because inplace=False), then stack the per-state results again.
imputed_dfs = []
for state in main_configs['us_51_state']:
    state_rows = vaccination_percentage[vaccination_percentage['location'] == state]
    imputed_dfs.append(
        incremental_imputer.impute_data(state_rows, 'people_vaccinated_per_hundred', inplace=False)
    )
vaccination_percentage = pd.concat(imputed_dfs, axis=0).reset_index(drop=True)

In [None]:
# State-level Fisher exact test results, limited to the configured states and
# ordered by state so they can be lined up with the sorted state list later.
fisher_exact_test_results_state = (
    pd.read_parquet(f"{dataframes_path}/fisher_exact_test_results_state.parquet")
    .loc[lambda frame: frame['state'].isin(main_configs['us_51_state'])]
    .sort_values('state')
)

In [None]:
# Build one row per state: the highest vaccination figure observed for the
# state plus the state's Fisher test columns.
states = sorted(main_configs['us_51_state'])

# The Fisher columns are attached by position, so the Fisher frame must list
# exactly these states in exactly this order. Fail loudly instead of pairing
# values with the wrong state.
fisher_states = fisher_exact_test_results_state['state'].tolist()
if fisher_states != states:
    raise ValueError(
        "fisher_exact_test_results_state does not line up with the sorted state list: "
        f"expected {len(states)} states, got {len(fisher_states)} rows "
        f"(first mismatch near {next((pair for pair in zip(states, fisher_states) if pair[0] != pair[1]), None)})"
    )

# Kept under its own name so `vaccination_percentage` is never temporarily
# rebound to a bare array.
max_vaccination_by_state = np.array([
    np.max(vaccination_percentage.loc[vaccination_percentage['location'] == state,
                                      'people_vaccinated_per_hundred'].values)
    for state in states
])

vaccination_percentage = pd.DataFrame({'state': np.array(states),
                                       'vaccination_percentage': max_vaccination_by_state,
                                       'odd_ratios': fisher_exact_test_results_state['odd_ratios'].values,
                                       'tweet_counts': fisher_exact_test_results_state['tweet_counts'].values})

In [None]:
# Persist the per-state table (state, vaccination_percentage, odd_ratios,
# tweet_counts) as CSV, without writing the index.
vaccination_percentage.to_csv(f"{dataframes_path}/correlation_df_VaccinationPercentage_ATV.csv", index=False)