In [1]:
# importamos las librerías necesarias para trabajar.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score, train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

#Otros 
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_raw = pd.read_feather('./data/pivot_data')

df_raw.index = df_raw['index']
df_raw.drop('index', axis=1, inplace=True)

df_raw

Unnamed: 0_level_0,argentino,asistencia_respiratoria,confirmados,cuidado_intensivo,edad,fallecidos,mayor_65,mayor_65_internacion,presentan_sintomas,privado,publico,internacion,sexo_F,sexo_M
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-03-01,0,0,0,0,0.000000,0,0,0,0,0,0,0,0,0
2020-03-02,1,0,1,0,43.000000,0,0,0,1,1,0,1,0,1
2020-03-03,0,0,0,0,0.000000,0,0,0,0,0,0,0,0,0
2020-03-04,0,0,0,0,0.000000,0,0,0,0,0,0,0,0,0
2020-03-05,5,1,5,1,50.600000,1,2,1,5,3,2,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-26,676,7,692,9,44.251445,10,121,29,359,393,299,53,348,344
2020-09-27,556,2,567,6,40.890212,7,76,21,335,307,260,45,274,293
2020-09-28,764,3,785,6,41.391614,9,113,26,431,480,305,53,399,386
2020-09-29,891,4,904,4,42.636799,7,135,25,454,531,373,57,449,455


## Pruebo con las principales features primero

In [227]:
cols = ['fallecidos', 'mayor_65', 'internacion', 'cuidado_intensivo', 'mayor_65_internacion']
df = df_raw[cols]

In [4]:
#df['fallecidos_mañana'] = df.fallecidos.shift(periods=-1, fill_value=0)
df.head()

Unnamed: 0_level_0,fallecidos,mayor_65,internacion,cuidado_intensivo,mayor_65_internacion
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-01,0,0,0,0,0
2020-03-02,0,0,1,0,0
2020-03-03,0,0,0,0,0
2020-03-04,0,0,0,0,0
2020-03-05,1,2,3,1,1


In [5]:
res = pd.DataFrame({'model':[], 'r2':[] , 'mse': []})

In [6]:
df = df.loc[df.index >= pd.Timestamp('2020-04-01')]

In [7]:
df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))]

Unnamed: 0_level_0,fallecidos,mayor_65,internacion,cuidado_intensivo,mayor_65_internacion
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-01,28,177,96,17,53
2020-09-02,41,159,124,24,47
2020-09-03,26,157,115,20,48
2020-09-04,34,160,104,24,49
2020-09-05,29,144,97,13,40
2020-09-06,20,107,66,11,32
2020-09-07,35,208,128,18,57
2020-09-08,33,169,127,19,42
2020-09-09,26,175,105,10,50
2020-09-10,32,192,107,14,44


In [8]:
xtrain = df.loc[df.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df.loc[df.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df.loc[df.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df.loc[df.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

## Escalo los datos

In [9]:
scaler = StandardScaler()
xtrain_scal = scaler.fit_transform(xtrain)
xtest_scal = scaler.transform(xtest)

In [10]:
lr = LinearRegression()
lr.fit(xtrain_scal, ytrain)
r2_score(ytest, lr.predict(xtest_scal))

0.7110094228962718

In [11]:
res = res.append({'model':'Linear Reg Scal',
            'r2':r2_score(ytest, lr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_scal))},
           ignore_index=True)

In [12]:
svr = SVR()
svr.fit(xtrain_scal, ytrain)
r2_score(ytest, svr.predict(xtest_scal))

0.6351936330319984

In [13]:
res = res.append({'model':'SVR Scal',
            'r2':r2_score(ytest, svr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_scal))},
           ignore_index=True)

In [14]:
knr = KNeighborsRegressor()
knr.fit(xtrain_scal, ytrain)
r2_score(ytest, knr.predict(xtest_scal))

0.5489780324737346

In [15]:
res = res.append({'model':'KNR Scal',
            'r2':r2_score(ytest, knr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_scal))},
           ignore_index=True)

## Agrego polinomios

In [16]:
poly = PolynomialFeatures(2)
xtrain_poly = poly.fit_transform(xtrain_scal)
xtest_poly = poly.transform(xtest_scal)

In [17]:
lr = LinearRegression()
lr.fit(xtrain_poly, ytrain)
r2_score(ytest, lr.predict(xtest_poly))

0.696850379542169

In [18]:
res = res.append({'model':'Linear Reg Poly',
            'r2':r2_score(ytest, lr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_poly))},
           ignore_index=True)

In [19]:
svr = SVR()
svr.fit(xtrain_poly, ytrain)
r2_score(ytest, svr.predict(xtest_poly))

0.5321657701786032

In [20]:
res = res.append({'model':'SVR Poly',
            'r2':r2_score(ytest, svr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_poly))},
           ignore_index=True)

In [21]:
knr = KNeighborsRegressor()
knr.fit(xtrain_poly, ytrain)
r2_score(ytest, knr.predict(xtest_poly))

0.5275835721107928

In [22]:
res = res.append({'model':'KNR Poly',
            'r2':r2_score(ytest, knr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_poly))},
           ignore_index=True)

In [23]:
res.sort_values('r2', ascending=False)

Unnamed: 0,model,r2,mse
0,Linear Reg Scal,0.711009,18.524886
3,Linear Reg Poly,0.69685,19.432509
1,SVR Scal,0.635194,23.384833
2,KNR Scal,0.548978,28.911429
4,SVR Poly,0.532166,29.989129
5,KNR Poly,0.527584,30.282857


# Analiso con el dataset promediado

In [24]:
df_ma = df.rolling(3).mean()
df_ma = df_ma.fillna(0)

In [25]:
xtrain = df_ma.loc[df_ma.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df_ma.loc[df_ma.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df_ma.loc[(df_ma.index >= pd.Timestamp('2020-09-01')) & (df_ma.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df_ma.loc[(df_ma.index >= pd.Timestamp('2020-09-01')) & (df_ma.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df_ma.loc[df_ma.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df_ma.loc[df_ma.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

In [26]:
scaler = StandardScaler()
xtrain_scal = scaler.fit_transform(xtrain)
xtest_scal = scaler.transform(xtest)

In [27]:
lr = LinearRegression()
lr.fit(xtrain_scal, ytrain)
r2_score(ytest, lr.predict(xtest_scal))

0.8544080330021755

In [28]:
res = res.append({'model':'Linear Reg Scal (Prom)',
            'r2':r2_score(ytest, lr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_scal))},
           ignore_index=True)

In [29]:
svr = SVR()
svr.fit(xtrain_scal, ytrain)
r2_score(ytest, svr.predict(xtest_scal))

0.707463732120601

In [30]:
res = res.append({'model':'SVR Scal (Prom)',
            'r2':r2_score(ytest, svr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_scal))},
           ignore_index=True)

In [31]:
knr = KNeighborsRegressor()
knr.fit(xtrain_scal, ytrain)
r2_score(ytest, knr.predict(xtest_scal))

0.269954775935316

In [32]:
res = res.append({'model':'KNR Scal (Prom)',
            'r2':r2_score(ytest, knr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_scal))},
           ignore_index=True)

## Agrego polinomios

In [33]:
poly = PolynomialFeatures(2)
xtrain_poly = poly.fit_transform(xtrain_scal)
xtest_poly = poly.transform(xtest_scal)

In [34]:
lr = LinearRegression()
lr.fit(xtrain_poly, ytrain)
r2_score(ytest, lr.predict(xtest_poly))

0.3071775409585912

In [35]:
res = res.append({'model':'Linear Reg Poly (Prom)',
            'r2':r2_score(ytest, lr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_poly))},
           ignore_index=True)

In [36]:
svr = SVR()
svr.fit(xtrain_poly, ytrain)
r2_score(ytest, svr.predict(xtest_poly))

0.5478813401346592

In [37]:
res = res.append({'model':'SVR Poly (Prom)',
            'r2':r2_score(ytest, svr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_poly))},
           ignore_index=True)

In [38]:
knr = KNeighborsRegressor()
knr.fit(xtrain_poly, ytrain)
r2_score(ytest, knr.predict(xtest_poly))

0.42767301630807186

In [39]:
res = res.append({'model':'KNR Poly (Prom)',
            'r2':r2_score(ytest, knr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_poly))},
           ignore_index=True)

In [40]:
res.sort_values('r2', ascending=False)

Unnamed: 0,model,r2,mse
6,Linear Reg Scal (Prom),0.854408,4.818071
0,Linear Reg Scal,0.711009,18.524886
7,SVR Scal (Prom),0.707464,9.680894
3,Linear Reg Poly,0.69685,19.432509
1,SVR Scal,0.635194,23.384833
2,KNR Scal,0.548978,28.911429
10,SVR Poly (Prom),0.547881,14.961949
4,SVR Poly,0.532166,29.989129
5,KNR Poly,0.527584,30.282857
11,KNR Poly (Prom),0.427673,18.94


# Pruebo usando solo las ultimas N semanas para entrenar

In [41]:
xtrain = df.loc[df.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df.loc[df.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df.loc[df.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df.loc[df.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

In [42]:
df.shape[0]

183

In [43]:
df = df.reset_index()
df.drop('index', axis=1, inplace=True)

In [44]:
def get_r2(xtrain, xtest, ytrain, ytest, m = KNeighborsRegressor()):
    
    m.fit(xtrain, ytrain)
    
    return r2_score(ytest, m.predict(xtest)), mean_squared_error(ytest, m.predict(xtest))

In [45]:
def get_scal(xtrain, xtest):
    scaler = StandardScaler()
    
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    
    return xtrain, xtest

In [46]:
def get_poly(xtrain, xtest, n=2):
    poly = PolynomialFeatures(n)
    
    xtrain = poly.fit_transform(xtrain)
    xtest = poly.transform(xtest)
    
    return xtrain, xtest

In [47]:
res_weeks = pd.DataFrame({'model':[], 'n_weeks':[], 'shift':[], 'r2':[], 'mse':[]})

In [48]:
n_weeks = 4
start = 0
middle = 7 * n_weeks - 1
end = 7 * n_weeks + 7 - 1

In [49]:
r2 = np.array([])
mse = np.array([])
models = {'LR': LinearRegression(n_jobs=5), 'KNR': KNeighborsRegressor(n_jobs=5), 'SVR': SVR()}

for name, m in models.items():  # Para cada modelo en la lista de modelos correr lo siguiente
    
    for delta in range(0, df.shape[0] - n_weeks * 7, 1):
        # Armo una ventana de n_weeks de ancho que uso para entrenar los modelos
        # Esa ventana se va corriendo de a un dia a la vez
        
        xtrain = df.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
        ytrain = df.loc[delta:middle + delta, 'fallecidos']

        xtest = df.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
        ytest = df.loc[middle + delta:end + delta, 'fallecidos']

        aux_r2, aux_mse = get_r2(xtrain, xtest, ytrain, ytest, m)
        
        # Una vez que entrene el modelo con esas entradas y calcule sus metricas las guardo en un array
        r2 = np.append(r2, aux_r2)
        mse = np.append(mse, aux_mse)
    
    # Evaluo que tan bien hizo cada modelo a lo largo de las semanas calculando la media de las metricas obtenidas
    res_weeks = res_weeks.append({'model':name, 'r2':r2.mean(), 'shift':0, 'mse': mse.mean(), 'n_weeks': n_weeks}, ignore_index=True)

res_weeks.sort_values('r2', ascending=False)

Unnamed: 0,model,n_weeks,shift,r2,mse
0,LR,4.0,0.0,-0.058577,21.677894
1,KNR,4.0,0.0,-0.333691,28.056952
2,SVR,4.0,0.0,-0.985896,41.003349


In [50]:
r2 = np.array([])
mse = np.array([])

for name, m in models.items():
    
    for delta in range(0, df.shape[0] - n_weeks * 7, 1):
        
        xtrain = df.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
        ytrain = df.loc[delta:middle + delta, 'fallecidos']

        xtest = df.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
        ytest = df.loc[middle + delta:end + delta, 'fallecidos']

        xtrain_scal, xtest_scal = get_scal(xtrain, xtest)

        aux_r2, aux_mse = get_r2(xtrain_scal, xtest_scal, ytrain, ytest, m)

        r2 = np.append(r2, aux_r2)
        mse = np.append(mse, aux_mse)


    res_weeks = res_weeks.append({'model':f"{name} Scal",'r2':r2.mean(), 'shift':0, 'mse':mse.mean(),'n_weeks':n_weeks}, ignore_index=True)

res_weeks.sort_values('r2', ascending=False)

Unnamed: 0,model,n_weeks,shift,r2,mse
3,LR Scal,4.0,0.0,-0.058577,21.677894
0,LR,4.0,0.0,-0.058577,21.677894
1,KNR,4.0,0.0,-0.333691,28.056952
4,KNR Scal,4.0,0.0,-0.335873,27.197271
2,SVR,4.0,0.0,-0.985896,41.003349
5,SVR Scal,4.0,0.0,-0.998429,42.612983


## Pruebo usando solo fallecidos desplazada

In [116]:
df_falle = df.copy()
df_falle['shift'] = df.fallecidos.shift(1, fill_value=0)
df_falle = df_falle[['fallecidos', 'shift']]
df_falle.tail(10)

Unnamed: 0,fallecidos,shift
173,13,10
174,11,13
175,16,11
176,11,16
177,15,11
178,10,15
179,7,10
180,9,7
181,7,9
182,11,7


In [206]:
df_aux = df.reset_index()
df_aux.drop('index', axis=1, inplace=True)

In [226]:
cols

['mayor_65', 'internacion', 'cuidado_intensivo', 'mayor_65_internacion']

In [189]:
cols_to_shift = cols
cols_to_shift.remove('fallecidos')

In [242]:
res_falle = pd.DataFrame({'model':[], 'n_weeks':[], 'shift':[], 'r2':[], 'mse':[]})

In [245]:
n_weeks = 6
start = 0
middle = 7 * n_weeks - 1
end = 7 * n_weeks + 7 - 1

In [246]:
r2 = np.array([])
mse = np.array([])
models = {'LR': LinearRegression(n_jobs=5)} #'KNR': KNeighborsRegressor(n_jobs=5), 'SVR': SVR()}

for i in range(1, 32, 1):
    
    df_falle = df_aux.copy()
    df_falle['shift'] = df_aux.fallecidos.shift(i, fill_value=0)
    df_falle = df_falle[['shift', *cols]]
    
    for name, m in models.items():  # Para cada modelo en la lista de modelos correr lo siguiente

        for delta in range(0, df_falle.shape[0] - n_weeks * 7, 1):
            # Armo una ventana de n_weeks de ancho que uso para entrenar los modelos
            # Esa ventana se va corriendo de a un dia a la vez

            xtrain = df_falle.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
            ytrain = df_falle.loc[delta:middle + delta, 'fallecidos']

            xtest = df_falle.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
            ytest = df_falle.loc[middle + delta:end + delta, 'fallecidos']
            

            aux_r2, aux_mse = get_r2(xtrain, xtest, ytrain, ytest, m)

            # Una vez que entrene el modelo con esas entradas y calcule sus metricas las guardo en un array
            r2 = np.append(r2, aux_r2)
            mse = np.append(mse, aux_mse)

        # Evaluo que tan bien hizo cada modelo a lo largo de las semanas calculando la media de las metricas obtenidas
        res_falle = res_falle.append({'model':f'{name} Falle', 'r2':r2.mean(), 'shift':i, 'mse': mse.mean(), 'n_weeks': n_weeks}, ignore_index=True)

res_falle.sort_values('r2', ascending=False).head(10)

Unnamed: 0,model,n_weeks,shift,r2,mse
2,LR Falle,5.0,3.0,0.03935,18.446028
3,LR Falle,5.0,4.0,0.032211,18.373685
5,LR Falle,5.0,6.0,0.029976,18.454501
4,LR Falle,5.0,5.0,0.029124,18.379311
1,LR Falle,5.0,2.0,0.020472,18.539579
10,LR Falle,5.0,11.0,0.011634,18.782823
12,LR Falle,5.0,13.0,0.01014,18.800898
9,LR Falle,5.0,10.0,0.008004,18.851316
11,LR Falle,5.0,12.0,0.007663,18.784512
8,LR Falle,5.0,9.0,0.004774,18.853855


In [54]:
# Pruebas

m = LinearRegression()

tscv = TimeSeriesSplit(n_splits=6)

cv_results = cross_val_score(m, pd.DataFrame(xtrain), ytrain, cv=tscv, scoring='r2')
cv_results

array([ -6.27441321,  -3.43444927,  -0.49326971,  -1.44430392,
        -3.15734709, -38.02827901])