In [None]:
from calculating_risk_for_single_community import calculating_risk_for_single_community as single_calculate
import numpy as np
import pandas as pd
import math
import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
'''
Method
1) Daily new cases
2) Model outputs
    a) Aggregate RPI data to communities
    b) Unify the format
3) Convert to Value between -1,0,1
4) Calculate correlation coefficient
'''


In [None]:
# input data into dataframes
# `population` is read from a local CSV next to the notebook; every other table
# is downloaded from GitHub, so this cell needs network access.
population = pd.read_csv('processed_population.csv')

url_infe = "https://raw.githubusercontent.com/ANRGUSC/covid19_risk_estimation/master/data/Covid-19.csv"
infection = pd.read_csv(url_infe)

url_usc = "https://raw.githubusercontent.com/ANRGUSC/covid19_risk_estimation/master/data/Covid-19-R.csv"
risk_usc = pd.read_csv(url_usc)

url_rpi = "https://raw.githubusercontent.com/Yueyang-Li-Elfa/Risk-Score-RPI-Solver/master/Latest%20Risk%20Score/risk_score.csv"
risk_rpi = pd.read_csv(url_rpi)

url_umich1 = "https://raw.githubusercontent.com/skasralikar/Risk-Score-1-UMichZJU/master/data/output/LA_daily_out.csv"
# Malformed rows in this feed are dropped instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`; try the current keyword first and fall back for old installs.
try:
    risk_umich1 = pd.read_csv(url_umich1, on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`.
    risk_umich1 = pd.read_csv(url_umich1, error_bad_lines=False)

# url_lmu = "https://raw.githubusercontent.com/wujj0326/COVID_Risk_Competition/master/Data/final_risk_score.csv"
# risk_lmu = pd.read_csv(url_lmu)

url_umich3 = "https://raw.githubusercontent.com/sli525/rmds-lab-team3-project/master/output%20file/risk_level.csv"
risk_umich3 = pd.read_csv(url_umich3)


In [None]:
# Change all regions/locations to Title Style
# Vectorised string ops replace the row-wise apply(axis=1): same text
# transformation, but missing region names stay NaN instead of raising.
# NOTE(review): the final replace looks for lowercase 'city of ' *after*
# title-casing, when the text reads 'City Of ', so it can never match. It is
# kept as-is so region keys do not change; confirm whether the prefix was meant
# to be stripped (and whether the risk tables use the stripped names).
infection['Region'] = (
    infection['Region']
    .str.replace('UNINCORPORATED - ', '', regex=False)
    .str.title()
    .str.replace('city of ', '', regex=False)
)

print(population.head(),'\n',infection.head(),'\n',risk_usc.head(),'\n',risk_umich1.head(),'\n',risk_umich3.head())

In [None]:
# Shared analysis window as [latest start date, earliest end date].
# It starts as the whole of 2020; cells further down overwrite the two entries.
date_inter = [
    datetime.datetime.strptime(bound, '%m-%d-%Y')
    for bound in ("01-01-2020", "12-31-2020")
]

In [None]:
# Organize all model outputs to the format as ["Region","cases","risk","first_date","last_date"]
# USC
la_regions = risk_usc.Region.unique()

# Both tables are parsed with the "MM-DD-YYYY" format this cell already used
# for the date bounds. Parsing matters because such strings only sort
# chronologically inside a single calendar year.
usc_stamp_format = '%m-%d-%Y'
infection_when = pd.to_datetime(infection["Time Stamp"], format=usc_stamp_format)
risk_usc_when = pd.to_datetime(risk_usc["Time Stamp"], format=usc_stamp_format)

# sort_values() returns a new frame, so calling it without keeping the result
# sorted nothing. Build explicitly ordered views instead (stable sort: rows
# sharing a date keep their file order) and leave the raw tables untouched.
infection_by_time = infection.iloc[np.argsort(infection_when.values, kind="mergesort")]
risk_usc_by_time = risk_usc.iloc[np.argsort(risk_usc_when.values, kind="mergesort")]

# The date bounds describe the whole tables, not one region, so compute them
# once. Each pair is [infection bound, risk bound], stored as text in the
# source format because column_slice() parses them again later.
usc_first_pair = [infection_when.min().strftime(usc_stamp_format), risk_usc_when.min().strftime(usc_stamp_format)]
usc_last_pair = [infection_when.max().strftime(usc_stamp_format), risk_usc_when.max().strftime(usc_stamp_format)]

# One row per region; every list-valued cell is filled when the frame is
# created, avoiding chained assignment (df[col][i] = ...), which is unreliable.
model_usc = pd.DataFrame(
    {
        "Region": la_regions,
        "cases": [
            infection_by_time.loc[infection_by_time['Region'] == region, "Number of cases"].tolist()
            for region in la_regions
        ],
        "risk": [
            risk_usc_by_time.loc[risk_usc_by_time['Region'] == region, "Risk-Score"].tolist()
            for region in la_regions
        ],
        "first_date": [list(usc_first_pair) for _ in la_regions],
        "last_date": [list(usc_last_pair) for _ in la_regions],
    },
    columns=["Region", "cases", "risk", "first_date", "last_date"],
)

# Update the time interval
usc_first = datetime.datetime.strptime(usc_first_pair[1], usc_stamp_format)
usc_last = datetime.datetime.strptime(usc_last_pair[1], usc_stamp_format)
# Narrow the shared window to what this model covers.
date_inter[0] = max(date_inter[0], usc_first)
date_inter[1] = min(date_inter[1], usc_last)
print(model_usc)

In [None]:
# Organize all model outputs to the format as ["Region","cases","risk","first_date","last_date"]
# umich1
la_regions = risk_umich1.Region.unique()

# Infection stamps are "MM-DD-YYYY"; this model's stamps are "YYYY-MM-DD" and
# are stripped of surrounding whitespace before parsing (the bounds were
# already stripped in this cell). Parsing gives a true chronological order.
infection_stamp_format = '%m-%d-%Y'
umich1_stamp_format = '%Y-%m-%d'
infection_when = pd.to_datetime(infection["Time Stamp"], format=infection_stamp_format)
risk_umich1_when = pd.to_datetime(risk_umich1["Timestamp"].str.strip(), format=umich1_stamp_format)

# sort_values() returns a new frame, so calling it without keeping the result
# sorted nothing. Build explicitly ordered views instead (stable sort: rows
# sharing a date keep their file order) and leave the raw tables untouched.
infection_by_time = infection.iloc[np.argsort(infection_when.values, kind="mergesort")]
risk_umich1_by_time = risk_umich1.iloc[np.argsort(risk_umich1_when.values, kind="mergesort")]

# The date bounds describe the whole tables, not one region, so compute them
# once. Each pair is [infection bound, risk bound], stored as text in each
# table's own format because column_slice() parses them again later.
umich1_first_pair = [infection_when.min().strftime(infection_stamp_format), risk_umich1_when.min().strftime(umich1_stamp_format)]
umich1_last_pair = [infection_when.max().strftime(infection_stamp_format), risk_umich1_when.max().strftime(umich1_stamp_format)]

# One row per region; every list-valued cell is filled when the frame is
# created, avoiding chained assignment (df[col][i] = ...), which is unreliable.
model_umich1 = pd.DataFrame(
    {
        "Region": la_regions,
        "cases": [
            infection_by_time.loc[infection_by_time['Region'] == region, "Number of cases"].tolist()
            for region in la_regions
        ],
        "risk": [
            risk_umich1_by_time.loc[risk_umich1_by_time['Region'] == region, "Risk_score"].tolist()
            for region in la_regions
        ],
        "first_date": [list(umich1_first_pair) for _ in la_regions],
        "last_date": [list(umich1_last_pair) for _ in la_regions],
    },
    columns=["Region", "cases", "risk", "first_date", "last_date"],
)

# Update the time interval
umich1_first = datetime.datetime.strptime(umich1_first_pair[1], umich1_stamp_format)
umich1_last = datetime.datetime.strptime(umich1_last_pair[1], umich1_stamp_format)
# Narrow the shared window to what this model covers.
date_inter[0] = max(date_inter[0], umich1_first)
date_inter[1] = min(date_inter[1], umich1_last)

print(model_umich1)

In [None]:
# Organize all model outputs to the format as ["Region","cases","risk","first_date","last_date"]
# umich3
la_regions = risk_umich3.Region.unique()

# Infection stamps are "MM-DD-YYYY"; this model's stamps are "YYYY-MM-DD" and
# are stripped of surrounding whitespace before parsing (the bounds were
# already stripped in this cell). Parsing gives a true chronological order.
infection_stamp_format = '%m-%d-%Y'
umich3_stamp_format = '%Y-%m-%d'
infection_when = pd.to_datetime(infection["Time Stamp"], format=infection_stamp_format)
risk_umich3_when = pd.to_datetime(risk_umich3["Time Stamp"].str.strip(), format=umich3_stamp_format)

# sort_values() returns a new frame, so calling it without keeping the result
# sorted nothing. Build explicitly ordered views instead (stable sort: rows
# sharing a date keep their file order) and leave the raw tables untouched.
infection_by_time = infection.iloc[np.argsort(infection_when.values, kind="mergesort")]
risk_umich3_by_time = risk_umich3.iloc[np.argsort(risk_umich3_when.values, kind="mergesort")]

# The date bounds describe the whole tables, not one region, so compute them
# once. Each pair is [infection bound, risk bound], stored as text in each
# table's own format because column_slice() parses them again later.
umich3_first_pair = [infection_when.min().strftime(infection_stamp_format), risk_umich3_when.min().strftime(umich3_stamp_format)]
umich3_last_pair = [infection_when.max().strftime(infection_stamp_format), risk_umich3_when.max().strftime(umich3_stamp_format)]

# One row per region; every list-valued cell is filled when the frame is
# created, avoiding chained assignment (df[col][i] = ...), which is unreliable.
model_umich3 = pd.DataFrame(
    {
        "Region": la_regions,
        "cases": [
            infection_by_time.loc[infection_by_time['Region'] == region, "Number of cases"].tolist()
            for region in la_regions
        ],
        "risk": [
            risk_umich3_by_time.loc[risk_umich3_by_time['Region'] == region, "risk_score"].tolist()
            for region in la_regions
        ],
        "first_date": [list(umich3_first_pair) for _ in la_regions],
        "last_date": [list(umich3_last_pair) for _ in la_regions],
    },
    columns=["Region", "cases", "risk", "first_date", "last_date"],
)


# Update the time interval
umich3_first = datetime.datetime.strptime(umich3_first_pair[1], umich3_stamp_format)
umich3_last = datetime.datetime.strptime(umich3_last_pair[1], umich3_stamp_format)
# Narrow the shared window to what this model covers.
date_inter[0] = max(date_inter[0], umich3_first)
date_inter[1] = min(date_inter[1], umich3_last)

print(model_umich3)

In [None]:
# Sanity check: length of the risk series stored in the row labelled 1.
print(len(model_umich3.loc[1, 'risk']))

In [None]:
# We don't have long time series of RPI data so will not be included in the model this time
# Organize all model outputs to the format as ["Region","cases","risk","first_date","last_date"]
# RPI
# NOTE(review): nothing visible later in the notebook reads model_rpi; this
# cell looks exploratory. The column names used in the loop below differ from
# the ones used in the aggregation step, so verify them before relying on it.

la_regions = risk_rpi["community"].unique()
print(la_regions)
print(risk_rpi.dropna())
# Keep only rows with at least 26 non-missing values. This rebinds risk_rpi,
# so the raw download is no longer available under that name afterwards.
risk_rpi = risk_rpi.dropna(thresh=26)
# Mean risk_score (cast to float) for every (community, weekday) pair.
comm_rpi = risk_rpi.groupby(["community","weekday"])["risk_score"].apply(lambda x : x.astype(float).mean()).reset_index()

la_regions = comm_rpi["community"].unique()
print(la_regions)


# One row per community; the list-valued columns start as empty lists.
model_rpi = pd.DataFrame()
model_rpi["Region"] = la_regions
model_rpi["cases"] = np.empty((len(model_rpi), 0)).tolist()
model_rpi["risk"] = np.empty((len(model_rpi), 0)).tolist()
model_rpi["first_date"] = np.empty((len(model_rpi), 0)).tolist()
model_rpi["last_date"] = np.empty((len(model_rpi), 0)).tolist()

# NOTE(review): the results of these two calls are not assigned and
# sort_values() is not in-place by default, so neither table is reordered here.
infection.sort_values(by="Time Stamp")
risk_rpi.sort_values(by="Time Stamp")

# NOTE(review): the loop reads risk_rpi (not the aggregated comm_rpi) and uses
# 'Region' / 'Time Stamp' columns, while the lines above address the same table
# through 'community' -- presumably a KeyError unless those columns also exist;
# confirm against the RPI file's schema.
for index, row in model_rpi.iterrows():
    case_r = infection.loc[infection['Region'] == row["Region"]]
    risk_r = risk_rpi.loc[risk_rpi['Region'] == row["Region"]]
    model_rpi["cases"][index]= case_r["Number of cases"].tolist()
    model_rpi["risk"][index]= risk_r["risk_score"].tolist()
    
    # Get min and max Time stamp
    # Table-wide bounds (identical for every row): [infection, RPI].
    model_rpi["first_date"][index]= [np.min(infection['Time Stamp']),np.min(risk_rpi['Time Stamp'])]
    model_rpi["last_date"][index]= [np.max(infection['Time Stamp']),np.max(risk_rpi['Time Stamp'])]
    
print(model_rpi)

In [None]:
# Choose intersected data based on dates
def column_slice(df,data_column,last_date_column,case_risk,first_d,last_d):
    """Trim every list in ``df[data_column]`` to the window ending on ``last_d``.

    Each cell of ``data_column`` holds a list whose last element is taken to
    belong to the date stored in ``last_date_column``. The window keeps
    ``(last_d - first_d).days`` consecutive entries, the last of which is the
    entry for ``last_d``. The column is overwritten in place; nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the list-valued column and the date-pair column.
    data_column : str
        Name of the list-valued column to trim.
    last_date_column : str
        Name of a column whose cells are two-element sequences of date
        strings; only the first row is consulted.
    case_risk : int
        0 selects the first date of the pair, parsed as ``%m-%d-%Y``;
        1 selects the second date, parsed as ``%Y-%m-%d``.
    first_d, last_d : datetime.datetime
        Bounds of the requested window.

    Raises
    ------
    ValueError
        If ``first_d`` lies after ``last_d``.

    Notes
    -----
    If the stored last date is earlier than ``last_d`` the window cannot be cut
    from the data and the column is left unchanged, as before.
    """
    case_len = (first_d - last_d).days
    if case_len > 0:
        raise ValueError("first_d must not be later than last_d")
    if len(df) == 0:
        return
    window_days = -case_len

    format_date = ["%m-%d-%Y","%Y-%m-%d"]
    # Positional access: the frame's index need not contain the label 0.
    series_end = datetime.datetime.strptime(df[last_date_column].iloc[0][case_risk],format_date[case_risk])
    # <= 0 when the stored series reaches (or runs past) last_d.
    case_l_delta = (last_d - series_end).days
    if case_l_delta > 0:
        return

    sliced = []
    for values in df[data_column].tolist():
        # Explicit non-negative bounds: a zero-length window yields [] rather
        # than the whole list, and over-long offsets clamp at the list start.
        stop = max(len(values) + case_l_delta, 0)
        start = max(stop - window_days, 0)
        sliced.append(values[start:stop])

    df[data_column] = sliced

In [None]:
# Ensemble

# merge two or more dataframes into one
# Inner joins on Region: the first merge suffixes the clashing columns with
# _x (USC) and _y (umich1); the umich3 columns arrive unsuffixed.
df_ensemble = model_usc.merge(model_umich1,on='Region').merge(model_umich3,on='Region')
df_ensemble = df_ensemble[["Region","cases","risk_x","risk_y","risk","first_date_x","last_date_x","first_date_y","last_date_y","first_date","last_date"]]
df_ensemble = df_ensemble.rename(columns={
    'risk_x': 'risk_usc',
    'risk_y': 'risk_umich1',
    'risk': 'risk_umich3',
    'first_date_x': 'first_date_usc',
    'first_date_y': 'first_date_umich1',
    'first_date': 'first_date_umich3',
    'last_date_x': 'last_date_usc',
    'last_date_y': 'last_date_umich1',
    'last_date': 'last_date_umich3',
})

# Use the intersection generated from all the data.
# Keep an independent, unsliced snapshot: column_slice() overwrites columns of
# the frame it is given, so a plain alias would be trimmed together with
# df_ensemble and could not be sliced again for a different window.
df_merged = df_ensemble.copy()
first_d = date_inter[0]
last_d = date_inter[1]

# Temporarily change first day to July 12th, and define the last day to be 3-day less
first_d = datetime.datetime.strptime('2020-7-12','%Y-%m-%d')
last_d = last_d - datetime.timedelta(days=3)

# (column to trim, column holding its last-date pair, case_risk selector)
# NOTE(review): selector 0 picks the first date of the pair for both "cases"
# and "risk_usc", i.e. the infection table's last date -- confirm that the USC
# risk series ends on the same day.
slice_plan = (
    ("cases", "last_date_usc", 0),            # cases portion
    ("risk_usc", "last_date_usc", 0),         # usc portion
    ("risk_umich1", "last_date_umich1", 1),   # umich1 portion
    ("risk_umich3", "last_date_umich3", 1),   # umich3 portion
)
for data_column, last_date_column, case_risk in slice_plan:
    column_slice(df_ensemble,data_column,last_date_column,case_risk,first_d,last_d)

print(len(df_ensemble['risk_usc'][100]),len(df_ensemble['cases'][100]),len(df_ensemble['risk_umich1'][0]),len(df_ensemble['risk_umich3'][0]))


# .copy() so that adding a column below works on a real frame, not on a
# selection of the wider one.
df_ensemble = df_ensemble[['Region','cases','risk_usc','risk_umich1','risk_umich3']].copy()
# NOTE(review): this records date_inter, not the [first_d, last_d] window that
# was actually used for slicing above -- confirm which one is intended.
df_ensemble["first_last"] = [date_inter for i in df_ensemble.index]
df_ensemble

In [None]:
def n_case(n_ahead, case):
    """Min-max normalised ``n_ahead``-day increase of a case series.

    For every position ``i >= n_ahead`` the increase is
    ``case[i] - case[i - n_ahead]``; the first ``n_ahead`` positions are 0.
    The increases are then rescaled to [0, 1] and rounded to 6 decimals.

    Parameters
    ----------
    n_ahead : int
        Lag, in positions, over which the increase is measured.
    case : sequence of numbers
        The case counts, oldest first.

    Returns
    -------
    list of float
        Same length as ``case``. An empty input gives ``[]``; a series whose
        increases are all equal (including one shorter than ``n_ahead``)
        gives all zeros instead of NaN from a 0/0 division.
    """
    counts = np.asarray(case, dtype=float)
    n = counts.size
    if n == 0:
        return []

    new_infe = np.zeros(n)
    if 0 < n_ahead < n:
        new_infe[n_ahead:] = counts[n_ahead:] - counts[:n - n_ahead]

    lowest = np.min(new_infe)
    span = np.max(new_infe) - lowest
    if span == 0:
        # Flat series: there is no range to scale by.
        return [0.0] * n

    return np.round((new_infe - lowest) / span, 6).tolist()

def n_risk(risk):
    """Min-max normalise a risk series to [0, 1], rounded to 6 decimals.

    Parameters
    ----------
    risk : sequence of numbers
        Raw risk scores.

    Returns
    -------
    list of float
        Same length as ``risk``. An empty input gives ``[]``; a constant
        series gives all zeros instead of NaN from a 0/0 division.
    """
    scores = np.asarray(risk, dtype=float)
    if scores.size == 0:
        return []

    lowest = np.min(scores)
    span = np.max(scores) - lowest
    if span == 0:
        # Constant series: there is no range to scale by.
        return [0.0] * scores.size

    return np.round((scores - lowest) / span, 6).tolist()

In [None]:
# Placeholder columns: 0 means "not computed"; object dtype lets a single cell
# hold a whole list. '7-case' is created but not filled (7-day variant is off).
for placeholder in ['3-case', '7-case', 'n-usc', 'n-umich1', 'n-umich3']:
    df_ensemble[placeholder] = 0
    df_ensemble[placeholder] = df_ensemble[placeholder].astype('object')

for index, row in df_ensemble.iterrows():
    # Local names deliberately differ from the raw input frames (risk_usc,
    # risk_umich1, risk_umich3) so running this cell does not overwrite them.
    case_series = np.array(row['cases'], dtype = 'float')
    usc_series = np.array(row['risk_usc'], dtype = 'float')
    umich1_series = np.array(row['risk_umich1'], dtype = 'float')
    umich3_series = np.array(row['risk_umich3'], dtype = 'float')

    # Regions whose USC risk sums to zero or less keep the 0 placeholders.
    if sum(usc_series)<=0:
        continue
    # DataFrame.set_value() was removed in pandas 1.0; .at is its replacement.
    df_ensemble.at[index, '3-case'] = n_case(3, case_series)
    #df_ensemble.at[index, '7-case'] = n_case(7, case_series)
    df_ensemble.at[index, 'n-usc'] = n_risk(usc_series)
    df_ensemble.at[index, 'n-umich1'] = n_risk(umich1_series)
    df_ensemble.at[index, 'n-umich3'] = n_risk(umich3_series)

In [None]:
df_ensemble

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

def lin_reg(X,y):
    """Fit a linear regression on an 80/20 split and summarise it.

    The split uses ``test_size=0.2`` and ``random_state=9``, so it is
    reproducible for a given input.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per observation.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    numpy.ndarray
        Object array of three items: the fitted coefficients, the intercept,
        and the RMSE measured on the held-out 20 %.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=9)
    lin_reg_mod = LinearRegression()
    lin_reg_mod.fit(X_train, y_train)
    pred = lin_reg_mod.predict(X_test)
    # RMSE
    test_set_rmse = (np.sqrt(mean_squared_error(y_test, pred)))

    # The three items have different shapes (coefficient vector + two scalars);
    # dtype=object is required, recent NumPy refuses to build a ragged array
    # implicitly.
    output = np.array([lin_reg_mod.coef_,lin_reg_mod.intercept_,test_set_rmse], dtype=object)
    return output

In [None]:
# Drop the regions whose '3-case' cell still holds the 0 placeholder, then
# save the remaining rows.
has_case_series = df_ensemble["3-case"] != 0
df_ensemble = df_ensemble[has_case_series]
df_ensemble.to_csv("ensemble_pre.csv")

In [None]:
# 0 means "no fit"; object dtype lets a cell hold the whole fit summary.
df_ensemble['ensemble'] = 0
df_ensemble['ensemble'] = df_ensemble['ensemble'].astype('object')

# Ensemble to do 3-day prediction
for index, row in df_ensemble.iterrows():
    try:
        y = np.array(row['3-case'], dtype = 'object')
        x_usc = np.array(row['n-usc'], dtype = 'object')
        x_umich = np.array(row['n-umich1'], dtype = 'object')
        x_rmds3 = np.array(row['n-umich3'], dtype = 'object')
        X = pd.DataFrame({'n_usc': x_usc, 'n_umich1': x_umich,'n_umich3': x_rmds3})
        fit_summary = lin_reg(X,y)
    except Exception as err:
        # Still best-effort (the row keeps its 0 placeholder), but say which
        # region failed and why instead of hiding every error. print() is used
        # because the notebook silences warnings globally.
        print("ensemble fit skipped for {}: {!r}".format(row['Region'], err))
        continue
    # DataFrame.set_value() was removed in pandas 1.0; .at is its replacement.
    df_ensemble.at[index, 'ensemble'] = fit_summary

In [None]:
# Write the frame, including the per-region 'ensemble' column, to disk.
df_ensemble.to_csv(path_or_buf="ensemble_result.csv")

In [None]:
# Use the last 3 days data to do prediction and evaluation (in progress)

last_d = date_inter[1]
first_d = last_d - datetime.timedelta(days=3)
# Work on a copy: column_slice() overwrites the columns of the frame it is
# given, so slicing df_merged itself would alter it and make this cell give a
# different result on every re-run.
# NOTE(review): column_slice() measures offsets from the stored last dates, so
# df_merged must still hold the full, untrimmed series here -- confirm.
df_test = df_merged.copy()

# (column to trim, column holding its last-date pair, case_risk selector)
test_slice_plan = (
    ("cases", "last_date_usc", 0),            # cases portion
    ("risk_usc", "last_date_usc", 0),         # usc portion
    ("risk_umich1", "last_date_umich1", 1),   # umich1 portion
    ("risk_umich3", "last_date_umich3", 1),   # umich3 portion
)
for data_column, last_date_column, case_risk in test_slice_plan:
    column_slice(df_test,data_column,last_date_column,case_risk,first_d,last_d)




