In [1]:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas import read_csv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier



$r = \text{NBR_cas}''(t)$

$AVG\ P(r>0)$ sur 14 J

$AVG\ P(r==0)$ sur 14 J

$AVG\ P(r<0)$ sur 14 J

Output $\rightarrow$ \[-1, 1\]

ECART-type $\rightarrow$ certitude

In [2]:
DATA_PATH = "../../data_processed/usa_top200_final.csv"
df = pd.read_csv(DATA_PATH, index_col="Unnamed: 0")

df.drop(labels=["date", 'GEOID', 'NAME', 'cumulative_confirmed', 'smoothed_cumul', 'smoothed_d1'], axis=1, inplace=True)

df = df.sort_values(['shapeID', 'time'])

In [3]:
df.columns

Index(['time', 'shapeID', 'x0_mean', 'x0_amin', 'x0_amax', 'x1_mean',
       'x1_amin', 'x1_amax', 'x2_mean', 'x2_amin', 'x2_amax', 'x3_mean',
       'x3_amin', 'x3_amax', 'x4_mean', 'x4_amin', 'x4_amax', 'x5_mean',
       'x5_amin', 'x5_amax', 'x6_mean', 'x6_amin', 'x6_amax', 'x7_mean',
       'x7_amin', 'x7_amax', 'x8_mean', 'x8_amin', 'x8_amax', 'x9_mean',
       'x9_amin', 'x9_amax', 'x10_mean', 'x10_amin', 'x10_amax', 'smoothed_d2',
       'mobility_transit_stations', 'mobility_retail_and_recreation',
       'mobility_grocery_and_pharmacy', 'mobility_parks',
       'mobility_residential', 'mobility_workplaces'],
      dtype='object')

In [4]:
ids = df["shapeID"].drop_duplicates().values

N_DAYS = 7
times = df["time"].drop_duplicates().values
times_grouped = [times[n:n+N_DAYS] for n in range(0, len(times), N_DAYS)]
times_grouped.pop() # not len 14

cols = df.drop(['time', 'shapeID', 'smoothed_d2'], axis=1).columns
grouped_cols = []
for i in range(0, N_DAYS):
    for label in cols:
        grouped_cols.append('Day {}: {}'.format(i, label))

X_grouped = pd.DataFrame(columns=grouped_cols)
Y_grouped = pd.DataFrame(columns=['y{}'.format(i) for i in range(0,N_DAYS)])

errors = []
bad_format_error = 0
for id in ids:
    for dates in times_grouped:

        id_query = df.shapeID == id
        date_query = df.time.isin(dates)
        query = df[id_query & date_query].drop(['time', 'shapeID'], axis=1)
        
        x_vals = query.drop('smoothed_d2', axis=1).values.flatten()
        y_vals = query.smoothed_d2.values.flatten()

        if(len(x_vals)) != len(grouped_cols):
            bad_format_error+=1
            errors.append(query)
        else:
            df_x_temp = pd.DataFrame(x_vals.reshape(1,-1), columns=grouped_cols)
            df_y_temp = pd.DataFrame(y_vals.reshape(1,-1), columns=['y{}'.format(i) for i in range(0,N_DAYS)])
            X_grouped = X_grouped.append(df_x_temp, ignore_index=True)
            Y_grouped = Y_grouped.append(df_y_temp, ignore_index=True)
print("SUCCESS with {}/{} FORMAT_ERRORS".format(bad_format_error, len(ids) * len(times_grouped)))
X_grouped

SUCCESS with 1544/6000 FORMAT_ERRORS


Unnamed: 0,Day 0: x0_mean,Day 0: x0_amin,Day 0: x0_amax,Day 0: x1_mean,Day 0: x1_amin,Day 0: x1_amax,Day 0: x2_mean,Day 0: x2_amin,Day 0: x2_amax,Day 0: x3_mean,...,Day 6: x9_amax,Day 6: x10_mean,Day 6: x10_amin,Day 6: x10_amax,Day 6: mobility_transit_stations,Day 6: mobility_retail_and_recreation,Day 6: mobility_grocery_and_pharmacy,Day 6: mobility_parks,Day 6: mobility_residential,Day 6: mobility_workplaces
0,-0.265867,-0.617371,-0.001806,-3.129673,-4.534496,-1.185845,-1.297791,-2.135575,-0.545528,1.382690,...,-0.031516,-0.907673,-1.195230,-0.582090,-33.0,-43.0,-13.0,-26.000000,23.0,-52.0
1,0.233274,-0.412005,0.862699,-2.557625,-3.365029,-1.506327,-0.544322,-2.441180,1.502298,1.223960,...,-0.647954,0.343750,-0.143780,0.856180,-29.0,-39.0,-7.0,44.000000,22.0,-53.0
2,-0.770472,-2.284364,0.466664,-1.003385,-1.959998,-0.141072,-1.935275,-2.905714,-1.183925,-1.299237,...,0.073250,-0.938103,-1.522722,-0.420376,-31.0,-42.0,-15.0,-22.000000,23.0,-52.0
3,2.504439,1.649267,3.440990,-2.251706,-3.396859,-1.145116,-1.919852,-3.037474,-0.705522,0.719270,...,-0.700126,-0.387579,-0.984083,0.760272,-29.0,-34.0,-10.0,0.000000,22.0,-50.0
4,0.210959,-1.003040,2.487906,-1.546592,-2.536916,-0.508163,-1.525725,-2.509403,-0.490812,0.139731,...,-0.909358,0.903034,0.438086,1.423229,-29.0,-33.0,-9.0,20.000000,15.0,-50.0
5,-0.718689,-2.278638,0.597595,-0.611439,-1.969486,0.350838,-1.352978,-2.262016,-0.135466,-0.884870,...,-0.221004,0.192331,-0.242879,0.539384,-31.0,-32.0,-6.0,-30.000000,22.0,-49.0
6,-0.324670,-0.733631,0.138329,-1.886763,-3.198827,-0.422535,1.255972,-0.415679,2.693703,2.410346,...,0.068804,-0.150052,-0.753738,0.426131,-23.0,-27.0,-4.0,68.000000,19.0,-47.0
7,-0.612875,-1.537777,0.451462,-2.275767,-3.504989,-0.460879,-1.044129,-1.718642,-0.210753,1.208377,...,0.006254,-0.301724,-0.757907,0.120698,-32.0,-28.0,-3.0,-1.000000,19.0,-45.0
8,-1.320549,-1.879949,-0.940167,-0.928139,-2.043570,0.121937,0.427096,-0.873863,1.756007,1.161699,...,0.655955,-0.526483,-0.776356,0.102439,-22.0,-22.0,4.0,59.000000,17.0,-44.0
9,-1.135404,-2.275179,0.263950,0.311717,-1.004855,1.238783,-0.356256,-2.003182,0.971203,-0.123988,...,0.885830,0.519334,0.170695,0.820025,-22.0,-15.0,12.0,53.000000,16.0,-42.0


In [10]:
N_PCs = 11
avg_cols = ["x{}_mean".format(i) for i in range(0,N_PCs)]
min_cols = ["x{}_amin".format(i) for i in range(0,N_PCs)]
max_cols = ["x{}_amax".format(i) for i in range(0,N_PCs)]
generic_cols = ['time', 'shapeID', 'mobility_transit_stations', 'mobility_retail_and_recreation',
       'mobility_grocery_and_pharmacy', 'mobility_parks',
       'mobility_residential', 'mobility_workplaces']

def get_df_from_cols(dataframe, cols):
    X_new = pd.DataFrame()
    for column in dataframe.columns:
        for label in cols:
            if label in column:
                X_new[column] = dataframe[column]
    return X_new

X_no_cams = get_df_from_cols(X_grouped, generic_cols)
X_min = get_df_from_cols(X_grouped, min_cols+generic_cols)
X_max = get_df_from_cols(X_grouped, max_cols+generic_cols)
X_avg = get_df_from_cols(X_grouped, avg_cols+generic_cols)

In [41]:
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

X_train = X_no_cams.to_numpy()
y_train = Y_grouped.mean().to_numpy()

num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

models = []
models.append(('GB', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('LR', LinearRegression()))

results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s %f %f " % (name, cv_results.mean(), cv_results.std())
    print(msg)

DT -2937.549588 1352.933767 
KNN -1483.987277 1304.749986 
LR -1290.123501 1250.325647 


Results: no difference

In [65]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

TRESHOLD = 10

X_train = X_min.to_numpy()
y_train = (Y_grouped > TRESHOLD).sum(axis=1)

num_folds = 10
seed = 7
scoring = 'accuracy'

models = []
models.append(('GB', GradientBoostingClassifier()))
models.append(('AB', AdaBoostClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('LR', LogisticRegression()))

results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s %f %f " % (name, cv_results.mean(), cv_results.std())
    print(msg)

GB 0.455341 0.057830 
AB 0.442550 0.058166 
DT 0.346043 0.033837 
KNN 0.412478 0.039655 
LR 0.447716 0.061701 


MEAN OF WEEK W HIGHER 10 CLASSIFICATION

- GB 0.927965 0.019642 
- AB 0.925721 0.020152 
- DT 0.875234 0.026127 
- KNN 0.924821 0.022831 
- LR 0.927741 0.019161 

(w cams)
- GB 0.927293 0.020238 
- AB 0.923704 0.018885 
- DT 0.875671 0.011897 
- KNN 0.923250 0.023082 
- LR 0.922804 0.018741 



N DAYS HIGHER 10 CLASSIFICATION

- GB 0.450636 0.063706 
- AB 0.430657 0.074770 
- DT 0.319354 0.038397 
- KNN 0.410910 0.041697 
- LR 0.460284 0.067121 

(w cams)
- GB 0.455341 0.057830 
- AB 0.442550 0.058166 
- DT 0.346043 0.033837 
- KNN 0.412478 0.039655 
- LR 0.447716 0.061701 