In [1]:
# importamos las librerías necesarias para trabajar.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score, train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

#Otros 
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_raw = pd.read_feather('./data/pivot_data')

df_raw.index = df_raw['index']
df_raw.drop('index', axis=1, inplace=True)

df_raw

Unnamed: 0_level_0,argentino,asistencia_respiratoria,confirmados,cuidado_intensivo,edad,fallecidos,presentan_sintomas,privado,publico,internacion,sexo_F,sexo_M
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-03-01,0,0,0,0,0.000000,0,0,0,0,0,0,0
2020-03-02,1,0,1,0,43.000000,0,1,1,0,1,0,1
2020-03-03,0,0,0,0,0.000000,0,0,0,0,0,0,0
2020-03-04,0,0,0,0,0.000000,0,0,0,0,0,0,0
2020-03-05,5,1,5,1,50.600000,1,5,3,2,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-26,678,7,694,9,44.386167,12,359,394,300,54,348,346
2020-09-27,556,2,567,6,40.890212,7,335,307,260,45,274,293
2020-09-28,764,3,785,6,41.391614,9,431,480,305,53,399,386
2020-09-29,892,4,906,4,42.691685,9,454,531,375,57,449,457


## Pruebo con las principales features primero

In [3]:
cols = ['asistencia_respiratoria', 'fallecidos', 'edad', 'presentan_sintomas', 'internacion', 'cuidado_intensivo']
df = df_raw[cols]

#df['cuidado**2'] = np.power(df['cuidado_intensivo'].astype(int), 2)

In [4]:
#df['fallecidos_mañana'] = df.fallecidos.shift(periods=-1, fill_value=0)
df.head()

Unnamed: 0_level_0,asistencia_respiratoria,fallecidos,edad,presentan_sintomas,internacion,cuidado_intensivo
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-01,0,0,0.0,0,0,0
2020-03-02,0,0,43.0,1,1,0
2020-03-03,0,0,0.0,0,0,0
2020-03-04,0,0,0.0,0,0,0
2020-03-05,1,1,50.6,5,3,1


In [5]:
res = pd.DataFrame({'model':[], 'r2':[] , 'mse': []})

In [6]:
df = df.loc[df.index >= pd.Timestamp('2020-04-01')]

In [7]:
df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))]

Unnamed: 0_level_0,asistencia_respiratoria,fallecidos,edad,presentan_sintomas,internacion,cuidado_intensivo
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-09-01,12,30,43.15416,649,97,17
2020-09-02,14,46,41.835033,665,126,24
2020-09-03,8,31,42.141241,687,119,21
2020-09-04,16,37,42.894996,613,105,24
2020-09-05,8,32,41.385931,622,99,14
2020-09-06,9,24,41.937582,413,66,11
2020-09-07,11,37,44.855575,681,129,18
2020-09-08,16,37,43.215947,645,128,20
2020-09-09,5,28,41.994545,664,105,10
2020-09-10,5,39,44.391092,588,107,14


In [8]:
xtrain = df.loc[df.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df.loc[df.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df.loc[df.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df.loc[df.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

## Escalo los datos

In [9]:
scaler = StandardScaler()
xtrain_scal = scaler.fit_transform(xtrain)
xtest_scal = scaler.transform(xtest)

In [10]:
lr = LinearRegression()
lr.fit(xtrain_scal, ytrain)
r2_score(ytest, lr.predict(xtest_scal))

0.6934265668612046

In [11]:
res = res.append({'model':'Linear Reg Scal',
            'r2':r2_score(ytest, lr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_scal))},
           ignore_index=True)

In [12]:
svr = SVR()
svr.fit(xtrain_scal, ytrain)
r2_score(ytest, svr.predict(xtest_scal))

0.44580851633820995

In [13]:
res = res.append({'model':'SVR Scal',
            'r2':r2_score(ytest, svr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_scal))},
           ignore_index=True)

In [14]:
knr = KNeighborsRegressor()
knr.fit(xtrain_scal, ytrain)
r2_score(ytest, knr.predict(xtest_scal))

0.558430761236685

In [15]:
res = res.append({'model':'KNR Scal',
            'r2':r2_score(ytest, knr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_scal))},
           ignore_index=True)

## Agrego polinomios

In [16]:
poly = PolynomialFeatures(2)
xtrain_poly = poly.fit_transform(xtrain_scal)
xtest_poly = poly.transform(xtest_scal)

In [17]:
lr = LinearRegression()
lr.fit(xtrain_poly, ytrain)
r2_score(ytest, lr.predict(xtest_poly))

0.5041068664261402

In [18]:
res = res.append({'model':'Linear Reg Poly',
            'r2':r2_score(ytest, lr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_poly))},
           ignore_index=True)

In [19]:
svr = SVR()
svr.fit(xtrain_poly, ytrain)
r2_score(ytest, svr.predict(xtest_poly))

0.03183604001220164

In [20]:
res = res.append({'model':'SVR Poly',
            'r2':r2_score(ytest, svr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_poly))},
           ignore_index=True)

In [21]:
knr = KNeighborsRegressor()
knr.fit(xtrain_poly, ytrain)
r2_score(ytest, knr.predict(xtest_poly))

0.5174382956612109

In [22]:
res = res.append({'model':'KNR Poly',
            'r2':r2_score(ytest, knr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_poly))},
           ignore_index=True)

In [23]:
res.sort_values('r2', ascending=False)

Unnamed: 0,model,r2,mse
0,Linear Reg Scal,0.693427,24.081656
2,KNR Scal,0.558431,34.685714
5,KNR Poly,0.517438,37.905714
3,Linear Reg Poly,0.504107,38.952912
1,SVR Scal,0.445809,43.532307
4,SVR Poly,0.031836,76.050267


# Analiso con el dataset promediado

In [24]:
df_ma = df.rolling(3).mean()
df_ma = df_ma.fillna(0)

In [25]:
xtrain = df_ma.loc[df_ma.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df_ma.loc[df_ma.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df_ma.loc[(df_ma.index >= pd.Timestamp('2020-09-01')) & (df_ma.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df_ma.loc[(df_ma.index >= pd.Timestamp('2020-09-01')) & (df_ma.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df_ma.loc[df_ma.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df_ma.loc[df_ma.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

In [26]:
scaler = StandardScaler()
xtrain_scal = scaler.fit_transform(xtrain)
xtest_scal = scaler.transform(xtest)

In [27]:
lr = LinearRegression()
lr.fit(xtrain_scal, ytrain)
r2_score(ytest, lr.predict(xtest_scal))

0.8312482462968932

In [28]:
res = res.append({'model':'Linear Reg Scal (Prom)',
            'r2':r2_score(ytest, lr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_scal))},
           ignore_index=True)

In [29]:
svr = SVR()
svr.fit(xtrain_scal, ytrain)
r2_score(ytest, svr.predict(xtest_scal))

0.4611103153002106

In [30]:
res = res.append({'model':'SVR Scal (Prom)',
            'r2':r2_score(ytest, svr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_scal))},
           ignore_index=True)

In [31]:
knr = KNeighborsRegressor()
knr.fit(xtrain_scal, ytrain)
r2_score(ytest, knr.predict(xtest_scal))

0.28457549688519745

In [32]:
res = res.append({'model':'KNR Scal (Prom)',
            'r2':r2_score(ytest, knr.predict(xtest_scal)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_scal))},
           ignore_index=True)

## Agrego polinomios

In [33]:
poly = PolynomialFeatures(2)
xtrain_poly = poly.fit_transform(xtrain_scal)
xtest_poly = poly.transform(xtest_scal)

In [34]:
lr = LinearRegression()
lr.fit(xtrain_poly, ytrain)
r2_score(ytest, lr.predict(xtest_poly))

-0.002575169284364076

In [35]:
res = res.append({'model':'Linear Reg Poly (Prom)',
            'r2':r2_score(ytest, lr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, lr.predict(xtest_poly))},
           ignore_index=True)

In [36]:
svr = SVR()
svr.fit(xtrain_poly, ytrain)
r2_score(ytest, svr.predict(xtest_poly))

0.25212223810747303

In [37]:
res = res.append({'model':'SVR Poly (Prom)',
            'r2':r2_score(ytest, svr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, svr.predict(xtest_poly))},
           ignore_index=True)

In [38]:
knr = KNeighborsRegressor()
knr.fit(xtrain_poly, ytrain)
r2_score(ytest, knr.predict(xtest_poly))

0.5136837733610207

In [39]:
res = res.append({'model':'KNR Poly (Prom)',
            'r2':r2_score(ytest, knr.predict(xtest_poly)),
            'mse':mean_squared_error(ytest, knr.predict(xtest_poly))},
           ignore_index=True)

In [40]:
res.sort_values('r2', ascending=False)

Unnamed: 0,model,r2,mse
6,Linear Reg Scal (Prom),0.831248,6.449684
0,Linear Reg Scal,0.693427,24.081656
2,KNR Scal,0.558431,34.685714
5,KNR Poly,0.517438,37.905714
11,KNR Poly (Prom),0.513684,18.586984
3,Linear Reg Poly,0.504107,38.952912
7,SVR Scal (Prom),0.46111,20.596339
1,SVR Scal,0.445809,43.532307
8,KNR Scal (Prom),0.284575,27.343492
10,SVR Poly (Prom),0.252122,28.583854


# Pruebo usando solo las ultimas 2 semanas para entrenar

In [41]:
xtrain = df.loc[df.index < pd.Timestamp('2020-09-01')].drop(['fallecidos'], axis = 1)
ytrain = df.loc[df.index < pd.Timestamp('2020-09-01'), 'fallecidos']

xtest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15'))].drop(['fallecidos'], axis = 1)
ytest = df.loc[(df.index >= pd.Timestamp('2020-09-01')) & (df.index < pd.Timestamp('2020-09-15')), 'fallecidos']

xval = df.loc[df.index >= pd.Timestamp('2020-09-15')].drop(['fallecidos'], axis = 1)
yval = df.loc[df.index >= pd.Timestamp('2020-09-15'), 'fallecidos']

In [42]:
df.shape[0]

183

In [43]:
df = df.reset_index()
df.drop('index', axis=1, inplace=True)

In [44]:
def get_r2(xtrain, xtest, ytrain, ytest, m = KNeighborsRegressor()):
    
    m.fit(xtrain, ytrain)
    
    return r2_score(ytest, m.predict(xtest)), mean_squared_error(ytest, m.predict(xtest))

In [45]:
def get_scal(xtrain, xtest):
    scaler = StandardScaler()
    
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    
    return xtrain, xtest

In [46]:
def get_poly(xtrain, xtest, n=2):
    poly = PolynomialFeatures(n)
    
    xtrain = poly.fit_transform(xtrain)
    xtest = poly.transform(xtest)
    
    return xtrain, xtest

In [47]:
n_weeks = 2
start = 0
middle = 7 * n_weeks - 1
end = 7 * n_weeks + 7 - 1

In [48]:
r2 = np.array([])
mse = np.array([])


for delta in range(0, df.shape[0] - n_weeks * 7, 1):
    xtrain = df.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
    ytrain = df.loc[delta:middle + delta, 'fallecidos']
    
    xtest = df.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
    ytest = df.loc[middle + delta:end + delta, 'fallecidos']
    
    aux_r2, aux_mse = get_r2(xtrain, xtest, ytrain, ytest)
    
    r2 = np.append(r2, aux_r2)
    mse = np.append(mse, aux_mse)

    
r2.mean()

-0.7807620705948786

In [49]:
r2 = np.array([])
mse = np.array([])


for delta in range(0, df.shape[0] - n_weeks * 7, 1):
    xtrain = df.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
    ytrain = df.loc[delta:middle + delta, 'fallecidos']
    
    xtest = df.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
    ytest = df.loc[middle + delta:end + delta, 'fallecidos']
    
    xtrain_scal, xtest_scal = get_scal(xtrain, xtest)
    xtrain_poly, xtest_poly = get_poly(xtrain_scal, xtest_scal, 2)
    
    aux_r2, aux_mse = get_r2(xtrain_poly, xtest_poly, ytrain, ytest)
    
    r2 = np.append(r2, aux_r2)
    mse = np.append(mse, aux_mse)

    
r2.mean()

-1.233768130536213

## Pruebo lo mismo con el dataset promediado

In [50]:
df_ma = df_ma.reset_index()
df_ma.drop('index', axis=1, inplace=True)

In [51]:
df_ma.shape[0]

183

In [52]:
r2 = np.array([])
mse = np.array([])


for delta in range(0, df_ma.shape[0] - n_weeks * 7, 1):
    xtrain = df_ma.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
    ytrain = df_ma.loc[delta:middle + delta, 'fallecidos']
    
    xtest = df_ma.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
    ytest = df_ma.loc[middle + delta:end + delta, 'fallecidos']
    
    aux_r2, aux_mse = get_r2(xtrain, xtest, ytrain, ytest)
    
    r2 = np.append(r2, aux_r2)
    mse = np.append(mse, aux_mse)

    
r2.mean()

-2.665822202121558

In [53]:
r2 = np.array([])
mse = np.array([])


for delta in range(0, df.shape[0] - n_weeks * 7, 1):
    xtrain = df_ma.loc[delta:middle + delta].drop(['fallecidos'], axis = 1)
    ytrain = df_ma.loc[delta:middle + delta, 'fallecidos']
    
    xtest = df_ma.loc[middle + delta:end + delta].drop(['fallecidos'], axis = 1)
    ytest = df_ma.loc[middle + delta:end + delta, 'fallecidos']
    
    xtrain_scal, xtest_scal = get_scal(xtrain, xtest)
    xtrain_poly, xtest_poly = get_poly(xtrain_scal, xtest_scal, 2)
    
    aux_r2, aux_mse = get_r2(xtrain_poly, xtest_poly, ytrain, ytest)
    
    r2 = np.append(r2, aux_r2)
    mse = np.append(mse, aux_mse)

    
r2.mean()

-3.95575760227954

## Analisis de series temporales

In [54]:
# Pruebas

m = LinearRegression()

tscv = TimeSeriesSplit(n_splits=6)

cv_results = cross_val_score(m, pd.DataFrame(xtrain), ytrain, cv=tscv, scoring='r2')
cv_results

array([ -0.11936904, -15.68640413,  -8.94299076, -41.98884823,
       -35.24764844,  -9.80084376])