In [98]:
import pandas as pd
import numpy as np
from utils import separate_xy, train_test_split

In [99]:
# Code
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from models import log_loss

In [100]:
def l1_loss(y_pred, y_true):
    y_true =  np.array(y_true)
    y_pred =  np.array(y_pred)
    assert(len(y_true) == len(y_pred))

    return np.mean(np.abs(y_true -y_pred))

In [107]:
# function to add the one-hot interaction terms
COL = 'county'
DEFAULT_COLS = [
    'chng_smoothed_adj_outpatient_cli',
       'chng_smoothed_adj_outpatient_covid', 'chng_smoothed_outpatient_cli',
       'chng_smoothed_outpatient_covid', 'doctor-visits_smoothed_adj_cli',
       'doctor-visits_smoothed_cli', 'fb-survey_smoothed_cli',
       'fb-survey_smoothed_hh_cmnty_cli', 'fb-survey_smoothed_ili',
       'fb-survey_smoothed_nohh_cmnty_cli',
       'fb-survey_smoothed_travel_outside_state_5d', 'fb-survey_smoothed_wcli',
       'fb-survey_smoothed_whh_cmnty_cli', 'fb-survey_smoothed_wili',
       'fb-survey_smoothed_wnohh_cmnty_cli',
       'fb-survey_smoothed_wtravel_outside_state_5d',
       'hospital-admissions_smoothed_adj_covid19_from_claims',
       'hospital-admissions_smoothed_covid19_from_claims',
       'quidel_covid_ag_smoothed_pct_positive', 'safegraph_bars_visit_num',
       'safegraph_bars_visit_prop', 'safegraph_completely_home_prop',
       'safegraph_completely_home_prop_7dav', 'safegraph_full_time_work_prop',
       'safegraph_full_time_work_prop_7dav',
       'safegraph_median_home_dwell_time',
       'safegraph_median_home_dwell_time_7dav',
       'safegraph_part_time_work_prop', 'safegraph_part_time_work_prop_7dav',
       'safegraph_restaurants_visit_num', 'safegraph_restaurants_visit_prop',
]
DEFAULT_COLS_WITH_SHIFTED = DEFAULT_COLS + [f'SHIFT_{c}' for c in DEFAULT_COLS]

COLS = [
    'hospital-admissions_smoothed_adj_covid19_from_claims',
    'quidel_covid_ag_smoothed_pct_positive'
]
def add_one_hot_and_interactions(df, interaction_cols=DEFAULT_COLS):
    counties = df['county'].unique().tolist()
    df = pd.get_dummies(df, prefix=[COL], columns=[COL])

    for col in interaction_cols:
        for c in counties:
            colname = f'county_{c}'
            df[f'county_{c}_{col}'] = df[col] * df[colname]

    return df

In [108]:
df = pd.read_csv('original_train_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,date,county,chng_smoothed_adj_outpatient_cli,chng_smoothed_adj_outpatient_covid,chng_smoothed_outpatient_cli,chng_smoothed_outpatient_covid,doctor-visits_smoothed_adj_cli,doctor-visits_smoothed_cli,fb-survey_smoothed_cli,...,safegraph_completely_home_prop_7dav,safegraph_full_time_work_prop,safegraph_full_time_work_prop_7dav,safegraph_median_home_dwell_time,safegraph_median_home_dwell_time_7dav,safegraph_part_time_work_prop,safegraph_part_time_work_prop_7dav,safegraph_restaurants_visit_num,safegraph_restaurants_visit_prop,response
0,1,2020-06-01,1073,1.734587,0.106936,1.600603,0.095467,3.025067,2.757994,0.252375,...,0.266217,0.045082,0.039964,691.227799,684.659404,0.078195,0.071174,3920.904475,668.759615,9.999976
1,2,2020-06-01,4013,1.583965,0.238237,1.610365,0.227908,2.840442,3.964751,0.36676,...,0.333171,0.044,0.036443,717.6152,718.383785,0.078856,0.067482,8070.877188,201.404352,17.099871
2,3,2020-06-01,4019,2.47836,0.129131,2.720644,0.132313,2.547498,3.770899,0.606955,...,0.346089,0.042179,0.038705,629.365231,629.611212,0.074251,0.065566,1895.869083,208.52962,10.776225
3,4,2020-06-01,6001,3.918585,0.085287,3.64271,0.08293,1.840819,2.12703,0.251949,...,0.433487,0.043269,0.041447,865.267498,867.296708,0.061682,0.057156,435.666448,33.728597,3.658338
4,5,2020-06-01,6013,1.427697,0.123045,1.570387,0.111393,2.364599,3.288683,0.140145,...,0.39941,0.040764,0.036613,876.362205,894.988299,0.065636,0.059531,485.369151,49.936914,3.430476


In [109]:
# Get the list of the columns of the dataframe
column_list = df.columns.values.tolist()
column_list.remove('Unnamed: 0')
column_list.remove('date')
column_list.remove('county')
column_list.remove('response')

# Get the shifted features
for column_name in column_list:
#     df['SHIFT2_' + column_name] = df[column_name].shift(1)
    df['SHIFT_' + column_name] = df[column_name] - df[column_name].shift(1) / df[column_name]
#     df = df.drop(column_name, axis = 1)
    max_value = df['SHIFT_' + column_name].max()
    min_value  = df['SHIFT_' + column_name].min()
    df['SHIFT_' + column_name] = (df['SHIFT_' + column_name] - min_value) / (max_value - min_value)

df = df.dropna(axis=1, how='all')
df = df.dropna()


In [110]:
df

Unnamed: 0.1,Unnamed: 0,date,county,chng_smoothed_adj_outpatient_cli,chng_smoothed_adj_outpatient_covid,chng_smoothed_outpatient_cli,chng_smoothed_outpatient_covid,doctor-visits_smoothed_adj_cli,doctor-visits_smoothed_cli,fb-survey_smoothed_cli,...,SHIFT_safegraph_completely_home_prop,SHIFT_safegraph_completely_home_prop_7dav,SHIFT_safegraph_full_time_work_prop,SHIFT_safegraph_full_time_work_prop_7dav,SHIFT_safegraph_median_home_dwell_time,SHIFT_safegraph_median_home_dwell_time_7dav,SHIFT_safegraph_part_time_work_prop,SHIFT_safegraph_part_time_work_prop_7dav,SHIFT_safegraph_restaurants_visit_num,SHIFT_safegraph_restaurants_visit_prop
1,2,2020-06-01,4013,1.583965,0.238237,1.610365,0.227908,2.840442,3.964751,0.366760,...,0.811998,0.767339,0.728730,0.558703,0.503797,0.485590,0.682143,0.616105,0.390669,0.121867
2,3,2020-06-01,4019,2.478360,0.129131,2.720644,0.132313,2.547498,3.770899,0.606955,...,0.649630,0.615082,0.722069,0.694838,0.363336,0.320234,0.645972,0.636008,0.090568,0.127609
3,4,2020-06-01,6001,3.918585,0.085287,3.642710,0.082930,1.840819,2.127030,0.251949,...,0.882095,0.869348,0.744748,0.703868,0.738710,0.762806,0.571554,0.529415,0.019643,0.018580
4,5,2020-06-01,6013,1.427697,0.123045,1.570387,0.111393,2.364599,3.288683,0.140145,...,0.610288,0.545049,0.715641,0.528241,0.755921,0.813826,0.700780,0.689239,0.022224,0.031732
5,6,2020-06-01,6019,3.146324,0.185762,2.982210,0.187356,3.796917,3.571460,0.330864,...,0.406752,0.337094,0.765130,0.714948,0.419112,0.415088,0.762717,0.783576,0.029268,0.052059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18295,18296,2020-11-30,49035,6.745637,2.838909,7.253291,3.092718,15.851526,17.780651,1.753087,...,0.532654,0.617724,0.769511,0.630906,0.442810,0.488176,0.724752,0.593260,0.063544,0.083726
18296,18297,2020-11-30,49049,16.152816,1.283203,16.503462,1.276698,7.405778,6.651997,2.258082,...,0.419257,0.368184,0.731260,0.583035,0.506753,0.512916,0.773212,0.806306,0.042339,0.103933
18297,18298,2020-11-30,53033,8.108174,0.910226,8.533337,0.901584,17.272424,20.872014,0.368948,...,0.938778,0.855033,0.620653,0.550149,0.641996,0.641654,0.318065,0.185779,0.037441,0.025817
18298,18299,2020-11-30,55079,4.657846,1.513101,4.849651,1.546724,8.169531,10.775481,2.193921,...,0.497668,0.407960,0.827413,0.893582,0.377462,0.345599,0.787716,0.911218,0.042983,0.097684


In [111]:
df_train, df_val = train_test_split(df)
df_train = df_train.drop('date', axis=1)
df_val = df_val.drop('date', axis=1)

X_train, y_train = separate_xy(df_train, 'response')
X_val, y_val     = separate_xy(df_val, 'response')


print()
print('#######################################################################')
print('LINEAR REGRESSION')
print('#######################################################################')
print()

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_val)

print("Linear regression loss: ")
print(log_loss(y_pred_linear, y_val))

print()
print('#######################################################################')
print('RIDGE REGRESSION')
print('#######################################################################')
print()

ridge_reg = Ridge(alpha=0.1)
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_val)

print("Ridge regression loss: ")
print(log_loss(y_pred_ridge, y_val))

print()
print('#######################################################################')
print('LASSO REGRESSION')
print('#######################################################################')
print()

# Increasing default tolerance so the solver converges
lasso_reg = Lasso(alpha=0.02, tol=0.1)
lasso_reg.fit(X_train, y_train)
y_pred_lasso = lasso_reg.predict(X_val)
y_pred_lasso[y_pred_lasso<0] = 0
print("Lasso regression loss: ")
print(log_loss(y_pred_lasso, y_val))
print()

print("Predictions:")
print(y_pred_lasso)

print("Actual values:")
print(y_val)



#######################################################################
LINEAR REGRESSION
#######################################################################

Linear regression loss: 
0.3889465352505646

#######################################################################
RIDGE REGRESSION
#######################################################################

Ridge regression loss: 
0.38895310253447896

#######################################################################
LASSO REGRESSION
#######################################################################

Lasso regression loss: 
0.4110697510689757

Predictions:
[34.04842585 14.68479262 15.31121793 ... 26.32499958 54.651632
 49.20254927]
Actual values:
15301    41.800334
15302    29.791803
15303    32.506006
15304     8.727016
15305    15.282336
           ...    
18296    76.707556
18297    93.047380
18298    29.633201
18299    75.739243
18300    78.745000
Name: response, Length: 3000, dtype: float64


# Adding one-hot and interactions

In [112]:
# df2 = add_one_hot_and_interactions(df, interaction_cols=[])
df2 = add_one_hot_and_interactions(df, interaction_cols=COLS)




In [113]:
df2_train, df2_val = train_test_split(df2)
df2_train = df2_train.drop('date', axis=1)
df2_val = df2_val.drop('date', axis=1)

In [114]:
df2

Unnamed: 0.1,Unnamed: 0,date,chng_smoothed_adj_outpatient_cli,chng_smoothed_adj_outpatient_covid,chng_smoothed_outpatient_cli,chng_smoothed_outpatient_covid,doctor-visits_smoothed_adj_cli,doctor-visits_smoothed_cli,fb-survey_smoothed_cli,fb-survey_smoothed_hh_cmnty_cli,...,county_48215_quidel_covid_ag_smoothed_pct_positive,county_48303_quidel_covid_ag_smoothed_pct_positive,county_48439_quidel_covid_ag_smoothed_pct_positive,county_48453_quidel_covid_ag_smoothed_pct_positive,county_49035_quidel_covid_ag_smoothed_pct_positive,county_49049_quidel_covid_ag_smoothed_pct_positive,county_53033_quidel_covid_ag_smoothed_pct_positive,county_55079_quidel_covid_ag_smoothed_pct_positive,county_55133_quidel_covid_ag_smoothed_pct_positive,county_1073_quidel_covid_ag_smoothed_pct_positive
3,2,2020-06-01,1.583965,0.238237,1.610365,0.227908,2.840442,3.964751,0.366760,13.232588,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,3,2020-06-01,2.478360,0.129131,2.720644,0.132313,2.547498,3.770899,0.606955,16.650415,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
5,4,2020-06-01,3.918585,0.085287,3.642710,0.082930,1.840819,2.127030,0.251949,10.139948,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
6,5,2020-06-01,1.427697,0.123045,1.570387,0.111393,2.364599,3.288683,0.140145,8.213925,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
7,6,2020-06-01,3.146324,0.185762,2.982210,0.187356,3.796917,3.571460,0.330864,14.248363,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18297,18296,2020-11-30,6.745637,2.838909,7.253291,3.092718,15.851526,17.780651,1.753087,38.438334,...,0.0,0.0,0.0,0.0,8.064706,0.000000,0.000000,0.000000,0.000000,0.0
18298,18297,2020-11-30,16.152816,1.283203,16.503462,1.276698,7.405778,6.651997,2.258082,45.455855,...,0.0,0.0,0.0,0.0,0.000000,8.064706,0.000000,0.000000,0.000000,0.0
18299,18298,2020-11-30,8.108174,0.910226,8.533337,0.901584,17.272424,20.872014,0.368948,19.012553,...,0.0,0.0,0.0,0.0,0.000000,0.000000,8.064706,0.000000,0.000000,0.0
18300,18299,2020-11-30,4.657846,1.513101,4.849651,1.546724,8.169531,10.775481,2.193921,37.377444,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,8.064706,0.000000,0.0


In [115]:
X_train2, y_train2 = separate_xy(df2_train, 'response')
X_val2, y_val2     = separate_xy(df2_val, 'response')

In [123]:

# print()
# print('#######################################################################')
# print('LINEAR REGRESSION')
# print('#######################################################################')
# print()

# linear_reg = LinearRegression()
# linear_reg.fit(X_train2, y_train2)
# y_pred_linear2 = linear_reg.predict(X_val2)
# y_pred_linear2[y_pred_linear2 < 0] = 0

# print("Linear regression loss: ")
# print(log_loss(y_pred_linear2, y_val2))

# print()
# print('#######################################################################')
# print('RIDGE REGRESSION')
# print('#######################################################################')
# print()

# ridge_reg = Ridge(alpha=0.1)
# ridge_reg.fit(X_train2, y_train2)
# y_pred_ridge2 = ridge_reg.predict(X_val2)
# y_pred_ridge2[y_pred_ridge2 < 0] = 0

# print("Ridge regression loss: ")
# print(log_loss(y_pred_ridge2, y_val2))

print()
print('#######################################################################')
print('LASSO REGRESSION')
print('#######################################################################')
print()

# Increasing default tolerance so the solver converges
lasso_reg = Lasso(alpha=0.004, tol=0.01)
lasso_reg.fit(X_train2, y_train2)
y_pred_lasso2 = lasso_reg.predict(X_val2)
y_pred_lasso2[y_pred_lasso2 < 0] = 0

print("Lasso regression loss: ")
print(log_loss(y_pred_lasso2, y_val2))
print()

print("Predictions:")
print(y_pred_lasso2)

print("Actual values:")
print(y_val2)



#######################################################################
LASSO REGRESSION
#######################################################################

Lasso regression loss: 
0.327810047289948

Predictions:
[ 32.18483903  21.88164839  16.32153206 ...  40.05084076 127.12006986
 185.339653  ]
Actual values:
15302    41.800334
15303    29.791803
15304    32.506006
15305     8.727016
15306    15.282336
           ...    
18297    76.707556
18298    93.047380
18299    29.633201
18300    75.739243
18301    78.745000
Name: response, Length: 3000, dtype: float64


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


# Adding response_log 

In [71]:
df['response_log'] = np.log(1 + df['response'])
df = df.drop('response', axis = 1)

In [72]:
df_train, df_val = train_test_split(df)
df_train = df_train.drop('date', axis=1)
df_val = df_val.drop('date', axis=1)

X_train, y_train = separate_xy(df_train, 'response_log')
X_val, y_val     = separate_xy(df_val, 'response_log')


print()
print('#######################################################################')
print('LINEAR REGRESSION')
print('#######################################################################')
print()

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_val)

print("Linear regression loss: ")
print(l1_loss(y_pred_linear, y_val))

print()
print('#######################################################################')
print('RIDGE REGRESSION')
print('#######################################################################')
print()

ridge_reg = Ridge(alpha=0.1)
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_val)

print("Ridge regression loss: ")
print(l1_loss(y_pred_ridge, y_val))

print()
print('#######################################################################')
print('LASSO REGRESSION')
print('#######################################################################')
print()

# Increasing default tolerance so the solver converges
lasso_reg = Lasso(alpha=0.02, tol=0.1)
lasso_reg.fit(X_train, y_train)
y_pred_lasso = lasso_reg.predict(X_val)

print("Lasso regression loss: ")
print(l1_loss(y_pred_lasso, y_val))
print()

print("Predictions:")
print(y_pred_lasso)

print("Actual values:")
print(y_val)



#######################################################################
LINEAR REGRESSION
#######################################################################

Linear regression loss: 
0.5220286737904394

#######################################################################
RIDGE REGRESSION
#######################################################################

Ridge regression loss: 
0.5236448422686452

#######################################################################
LASSO REGRESSION
#######################################################################

Lasso regression loss: 
0.5564765427659623

Predictions:
[3.79581616 2.94894266 2.57977329 ... 2.84924523 3.69656498 3.74921205]
Actual values:
15302    3.756546
15303    3.427249
15304    3.511725
15305    2.274907
15306    2.790081
           ...   
18297    4.352952
18298    4.543799
18299    3.422084
18300    4.340413
18301    4.378834
Name: response_log, Length: 3000, dtype: float64
