In [1]:
# importamos las librerías necesarias para trabajar.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score, train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

#Otros 
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_raw = pd.read_feather('./data/pivot_data')

df_raw.index = df_raw['index']
df_raw.drop('index', axis=1, inplace=True)

df_raw

Unnamed: 0_level_0,argentino,asistencia_respiratoria,confirmados,cuidado_intensivo,edad,fallecidos,mayor_65,mayor_65_internacion,presentan_sintomas,privado,publico,internacion,sexo_F,sexo_M
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-03-01,0,0,0,0,0.000000,0,0,0,0,0,0,0,0,0
2020-03-02,1,0,1,0,43.000000,0,0,0,1,1,0,1,0,1
2020-03-03,0,0,0,0,0.000000,0,0,0,0,0,0,0,0,0
2020-03-04,0,0,0,0,0.000000,0,0,0,0,0,0,0,0,0
2020-03-05,5,1,5,1,50.600000,1,2,1,5,3,2,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-26,678,7,694,9,44.386167,12,123,30,359,394,300,54,348,346
2020-09-27,556,2,567,6,40.890212,7,76,21,335,307,260,45,274,293
2020-09-28,764,3,785,6,41.391614,9,113,26,431,480,305,53,399,386
2020-09-29,892,4,906,4,42.691685,9,137,25,454,531,375,57,449,457


## Pruebo con las principales features primero

In [3]:
cols = ['fallecidos', 'mayor_65', 'internacion', 'cuidado_intensivo', 'mayor_65_internacion']
df = df_raw[cols]

In [4]:
#df['fallecidos_mañana'] = df.fallecidos.shift(periods=-1, fill_value=0)
df.head()

Unnamed: 0_level_0,fallecidos,mayor_65,internacion,cuidado_intensivo,mayor_65_internacion
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-01,0,0,0,0,0
2020-03-02,0,0,1,0,0
2020-03-03,0,0,0,0,0
2020-03-04,0,0,0,0,0
2020-03-05,1,2,3,1,1


In [5]:
res = pd.DataFrame({'model':[], 'r2':[] , 'mse': []})

In [6]:
df = df.loc[df.index >= pd.Timestamp('2020-04-01')]

In [7]:
df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))]

Unnamed: 0_level_0,fallecidos,mayor_65,internacion,cuidado_intensivo,mayor_65_internacion
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-01,30,179,97,17,54
2020-09-02,46,164,126,24,49
2020-09-03,31,161,119,21,51
2020-09-04,37,162,105,24,50
2020-09-05,32,147,99,14,42
2020-09-06,24,108,66,11,32
2020-09-07,37,210,129,18,58
2020-09-08,37,173,128,20,43
2020-09-09,28,177,105,10,50
2020-09-10,39,197,107,14,44


In [8]:
xtrain = df.loc[df.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df.loc[df.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df.loc[df.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df.loc[df.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

## Escalo los datos

In [9]:
scaler = StandardScaler()
xtrain_scal = scaler.fit_transform(xtrain)
xtest_scal = scaler.transform(xtest)

In [10]:
lr = LinearRegression()
lr.fit(xtrain_scal, ytrain)
r2_score(ytest, lr.predict(xtest_scal))

0.6522837035781356

In [11]:
res = res.append({'model':'Linear Reg Scal',
            'r2':r2_score(ytest, lr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_scal))},
           ignore_index=True)

In [12]:
svr = SVR()
svr.fit(xtrain_scal, ytrain)
r2_score(ytest, svr.predict(xtest_scal))

0.5884569016182941

In [13]:
res = res.append({'model':'SVR Scal',
            'r2':r2_score(ytest, svr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_scal))},
           ignore_index=True)

In [14]:
knr = KNeighborsRegressor()
knr.fit(xtrain_scal, ytrain)
r2_score(ytest, knr.predict(xtest_scal))

0.48037412314886985

In [15]:
res = res.append({'model':'KNR Scal',
            'r2':r2_score(ytest, knr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_scal))},
           ignore_index=True)

## Agrego polinomios

In [16]:
poly = PolynomialFeatures(2)
xtrain_poly = poly.fit_transform(xtrain_scal)
xtest_poly = poly.transform(xtest_scal)

In [17]:
lr = LinearRegression()
lr.fit(xtrain_poly, ytrain)
r2_score(ytest, lr.predict(xtest_poly))

0.6670612415557277

In [18]:
res = res.append({'model':'Linear Reg Poly',
            'r2':r2_score(ytest, lr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_poly))},
           ignore_index=True)

In [19]:
svr = SVR()
svr.fit(xtrain_poly, ytrain)
r2_score(ytest, svr.predict(xtest_poly))

0.4612128065001383

In [20]:
res = res.append({'model':'SVR Poly',
            'r2':r2_score(ytest, svr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_poly))},
           ignore_index=True)

In [21]:
knr = KNeighborsRegressor()
knr.fit(xtrain_poly, ytrain)
r2_score(ytest, knr.predict(xtest_poly))

0.5356248376201611

In [22]:
res = res.append({'model':'KNR Poly',
            'r2':r2_score(ytest, knr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_poly))},
           ignore_index=True)

In [23]:
res.sort_values('r2', ascending=False)

Unnamed: 0,model,r2,mse
3,Linear Reg Poly,0.667061,26.152679
0,Linear Reg Scal,0.652284,27.31347
1,SVR Scal,0.588457,32.32713
5,KNR Poly,0.535625,36.477143
2,KNR Scal,0.480374,40.817143
4,SVR Poly,0.461213,42.322284


# Analiso con el dataset promediado

In [24]:
df_ma = df.rolling(3).mean()
df_ma = df_ma.fillna(0)

In [25]:
xtrain = df_ma.loc[df_ma.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df_ma.loc[df_ma.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df_ma.loc[(df_ma.index >= pd.Timestamp('2020-09-01')) & (df_ma.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df_ma.loc[(df_ma.index >= pd.Timestamp('2020-09-01')) & (df_ma.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df_ma.loc[df_ma.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df_ma.loc[df_ma.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

In [26]:
scaler = StandardScaler()
xtrain_scal = scaler.fit_transform(xtrain)
xtest_scal = scaler.transform(xtest)

In [27]:
lr = LinearRegression()
lr.fit(xtrain_scal, ytrain)
r2_score(ytest, lr.predict(xtest_scal))

0.8371602026720403

In [28]:
res = res.append({'model':'Linear Reg Scal (Prom)',
            'r2':r2_score(ytest, lr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_scal))},
           ignore_index=True)

In [29]:
svr = SVR()
svr.fit(xtrain_scal, ytrain)
r2_score(ytest, svr.predict(xtest_scal))

0.6814525796309414

In [30]:
res = res.append({'model':'SVR Scal (Prom)',
            'r2':r2_score(ytest, svr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_scal))},
           ignore_index=True)

In [31]:
knr = KNeighborsRegressor()
knr.fit(xtrain_scal, ytrain)
r2_score(ytest, knr.predict(xtest_scal))

0.24916641946010076

In [32]:
res = res.append({'model':'KNR Scal (Prom)',
            'r2':r2_score(ytest, knr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_scal))},
           ignore_index=True)

## Agrego polinomios

In [33]:
poly = PolynomialFeatures(2)
xtrain_poly = poly.fit_transform(xtrain_scal)
xtest_poly = poly.transform(xtest_scal)

In [34]:
lr = LinearRegression()
lr.fit(xtrain_poly, ytrain)
r2_score(ytest, lr.predict(xtest_poly))

0.2941076091205822

In [35]:
res = res.append({'model':'Linear Reg Poly (Prom)',
            'r2':r2_score(ytest, lr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_poly))},
           ignore_index=True)

In [36]:
svr = SVR()
svr.fit(xtrain_poly, ytrain)
r2_score(ytest, svr.predict(xtest_poly))

0.4496486538576975

In [37]:
res = res.append({'model':'SVR Poly (Prom)',
            'r2':r2_score(ytest, svr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_poly))},
           ignore_index=True)

In [38]:
knr = KNeighborsRegressor()
knr.fit(xtrain_poly, ytrain)
r2_score(ytest, knr.predict(xtest_poly))

0.28246573716997947

In [39]:
res = res.append({'model':'KNR Poly (Prom)',
            'r2':r2_score(ytest, knr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_poly))},
           ignore_index=True)

In [40]:
res.sort_values('r2', ascending=False)

Unnamed: 0,model,r2,mse
6,Linear Reg Scal (Prom),0.83716,6.22373
7,SVR Scal (Prom),0.681453,12.174868
3,Linear Reg Poly,0.667061,26.152679
0,Linear Reg Scal,0.652284,27.31347
1,SVR Scal,0.588457,32.32713
5,KNR Poly,0.535625,36.477143
2,KNR Scal,0.480374,40.817143
4,SVR Poly,0.461213,42.322284
10,SVR Poly (Prom),0.449649,21.034403
9,Linear Reg Poly (Prom),0.294108,26.979175


# Pruebo usando solo las ultimas N semanas para entrenar

In [41]:
xtrain = df.loc[df.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df.loc[df.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df.loc[df.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df.loc[df.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

In [42]:
df.shape[0]

183

In [43]:
df = df.reset_index()
df.drop('index', axis=1, inplace=True)

In [44]:
def get_r2(xtrain, xtest, ytrain, ytest, m = KNeighborsRegressor()):
    
    m.fit(xtrain, ytrain)
    
    return r2_score(ytest, m.predict(xtest)), mean_squared_error(ytest, m.predict(xtest))

In [45]:
def get_scal(xtrain, xtest):
    scaler = StandardScaler()
    
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    
    return xtrain, xtest

In [46]:
def get_poly(xtrain, xtest, n=2):
    poly = PolynomialFeatures(n)
    
    xtrain = poly.fit_transform(xtrain)
    xtest = poly.transform(xtest)
    
    return xtrain, xtest

In [47]:
res_weeks = pd.DataFrame({'model':[], 'n_weeks':[], 'r2':[], 'mse':[]})

In [48]:
n_weeks = 4
start = 0
middle = 7 * n_weeks - 1
end = 7 * n_weeks + 7 - 1

In [49]:
r2 = np.array([])
mse = np.array([])
models = {'LR': LinearRegression(n_jobs=5), 'KNR': KNeighborsRegressor(n_jobs=5), 'SVR': SVR()}

for name, m in models.items():  # Para cada modelo en la lista de modelos correr lo siguiente
    
    for delta in range(0, df.shape[0] - n_weeks * 7, 1):
        # Armo una ventana de n_weeks de ancho que uso para entrenar los modelos
        # Esa ventana se va corriendo de a un dia a la vez
        
        xtrain = df.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
        ytrain = df.loc[delta:middle + delta, 'fallecidos']

        xtest = df.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
        ytest = df.loc[middle + delta:end + delta, 'fallecidos']

        aux_r2, aux_mse = get_r2(xtrain, xtest, ytrain, ytest, m)
        
        # Una vez que entrene el modelo con esas entradas y calcule sus metricas las guardo en un array
        r2 = np.append(r2, aux_r2)
        mse = np.append(mse, aux_mse)
    
    # Evaluo que tan bien hizo cada modelo a lo largo de las semanas calculando la media de las metricas obtenidas
    res_weeks = res_weeks.append({'model':name, 'r2':r2.mean(), 'mse': mse.mean(), 'n_weeks': n_weeks}, ignore_index=True)

res_weeks.sort_values('r2', ascending=False)

Unnamed: 0,model,n_weeks,r2,mse
0,LR,4.0,-0.022697,26.604618
1,KNR,4.0,-0.245703,35.313387
2,SVR,4.0,-1.03259,54.041862


In [50]:
r2 = np.array([])
mse = np.array([])

for name, m in models.items():
    
    for delta in range(0, df.shape[0] - n_weeks * 7, 1):
        
        xtrain = df.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
        ytrain = df.loc[delta:middle + delta, 'fallecidos']

        xtest = df.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
        ytest = df.loc[middle + delta:end + delta, 'fallecidos']

        xtrain_scal, xtest_scal = get_scal(xtrain, xtest)

        aux_r2, aux_mse = get_r2(xtrain_scal, xtest_scal, ytrain, ytest, m)

        r2 = np.append(r2, aux_r2)
        mse = np.append(mse, aux_mse)


    res_weeks = res_weeks.append({'model':f"{name} Scal",'r2':r2.mean(),'mse':mse.mean(),'n_weeks':n_weeks}, ignore_index=True)

res_weeks.sort_values('r2', ascending=False)

Unnamed: 0,model,n_weeks,r2,mse
3,LR Scal,4.0,-0.022697,26.604618
0,LR,4.0,-0.022697,26.604618
1,KNR,4.0,-0.245703,35.313387
4,KNR Scal,4.0,-0.263664,34.589679
2,SVR,4.0,-1.03259,54.041862
5,SVR Scal,4.0,-1.037749,56.068587


In [51]:
res_weeks.sort_values(['r2', 'n_weeks'], ascending=False)

Unnamed: 0,model,n_weeks,r2,mse
3,LR Scal,4.0,-0.022697,26.604618
0,LR,4.0,-0.022697,26.604618
1,KNR,4.0,-0.245703,35.313387
4,KNR Scal,4.0,-0.263664,34.589679
2,SVR,4.0,-1.03259,54.041862
5,SVR Scal,4.0,-1.037749,56.068587


## Analisis de series temporales

In [52]:
# Pruebas

m = LinearRegression()

tscv = TimeSeriesSplit(n_splits=6)

cv_results = cross_val_score(m, pd.DataFrame(xtrain), ytrain, cv=tscv, scoring='r2')
cv_results

array([-4.53258204,  0.48002394, -7.61476478, -0.16477099, -8.17331661,
       -4.28874041])