## 6. Modelo_Prediccion_por_Estacion.ipynb

### Objetivo

Notebook desarrollado con el objetivo de generar un único modelo entrenado para cada una de las estaciones BiciMad basado en el algoritmo e hiperparámetros determinados para el cluster al cual pertenece. Cada modelo final se almacena en disco en un objeto de tipo joblib.

### Descripción General de notebook

    1. Preparación de datos en entrenamiento        
    2. Creación de dataframes Train y Test
    3. Entrenamiento de modelos para estaciones por Cluster

In [1]:
from pandas import MultiIndex, Int16Dtype
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
sns.set_style('darkgrid')

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, RobustScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, PowerTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet, SGDRegressor, LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge,LinearRegression, LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import KFold, ShuffleSplit, LeaveOneOut, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import joblib

# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

In [2]:
%run "../7. Prediccion/Funciones_Prepara_Prediccion.ipynb"

## 1. Preparación datos

Carga de datos utilizados en entrenamiento por cluster

In [3]:
# bicimad_def = _dataBaseOriginal("../../Data/DataFrame_Final_Cierre_Cluster.csv")
# datos = pd.read_csv("../../Data/DataFrame_Final_Cierre_Cluster.csv", parse_dates=['FECHA'])

bicimad_def = _dataBaseOriginal("../../Data/DataFrame_Final_Cierre_Cluster_2017_2019.csv")
datos = pd.read_csv("../../Data/DataFrame_Final_Cierre_Cluster_2017_2019.csv", parse_dates=['FECHA'])

In [4]:
bicimad_def

Unnamed: 0,ESTACION,DEMANDA,MES_sen,MES_cos,TEMP_MAX,TEMP_MIN,HUMEDAD,VIENTO,PRESION,PRECIPITACION_1h,...,DIA_SEMANA_5,DIA_SEMANA_6,DIA_SEMANA_7,DESC_TIEMPO_Clouds,DESC_TIEMPO_Drizzle,DESC_TIEMPO_Fog,DESC_TIEMPO_Mist,DESC_TIEMPO_Rain,DESC_TIEMPO_Snow,DESC_TIEMPO_Thunderstorm
0,1,2.079442,0.989821,-0.142315,-0.985324,0.269375,-0.485692,2.157932,0.427625,-0.240464,...,0,1,0,1,0,0,0,0,0,0
1,1,4.127134,0.755750,-0.654861,-0.591062,-0.150069,-0.666216,1.744687,0.525859,-0.240464,...,0,0,1,1,0,0,0,0,0,0
2,1,4.077537,0.755750,-0.654861,-0.390589,-0.819758,-0.407056,-0.618350,0.589009,-0.240464,...,0,0,0,0,0,0,0,0,0,0
3,1,4.356709,0.755750,-0.654861,-0.279216,-1.010285,-0.582084,-1.015737,0.855644,-0.240464,...,0,0,0,0,0,0,0,0,0,0
4,1,4.454347,0.755750,-0.654861,0.067156,-1.152470,-0.810381,-0.656874,0.771443,-0.240464,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176472,175,4.477337,0.909632,0.415415,-0.439594,-0.678995,0.087978,1.228235,0.178262,-0.240464,...,0,0,0,1,0,0,0,0,0,0
176473,175,5.068904,0.909632,0.415415,-0.695753,-0.634918,-0.240653,0.855396,0.501300,-0.240464,...,0,0,0,0,0,0,0,0,0,0
176474,175,5.159055,0.909632,0.415415,-0.336016,-0.616434,-0.214272,1.336527,0.518842,-0.240464,...,1,0,0,0,0,0,0,0,0,0
176475,175,5.117994,0.909632,0.415415,-0.339358,-0.703167,-0.196516,-0.852853,0.364474,-0.240464,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# Identificación de variables categóricas y numéricas
cat_cols= ['MES', 'DIA_SEMANA', 'TEMPORADA', 'Es_Festivo', 'Es_FinSemana','DESC_TIEMPO']
num_cols= ['TEMP_MAX','TEMP_MIN','HUMEDAD','VIENTO','PRESION']

In [7]:
bicimad = bicimad_def.copy()
bicimad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176477 entries, 0 to 176476
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ESTACION                  176477 non-null  int64  
 1   DEMANDA                   176477 non-null  float64
 2   MES_sen                   176477 non-null  float64
 3   MES_cos                   176477 non-null  float64
 4   TEMP_MAX                  176477 non-null  float64
 5   TEMP_MIN                  176477 non-null  float64
 6   HUMEDAD                   176477 non-null  float64
 7   VIENTO                    176477 non-null  float64
 8   PRESION                   176477 non-null  float64
 9   PRECIPITACION_1h          176477 non-null  float64
 10  PRECIPITACION_3h          176477 non-null  float64
 11  Es_Festivo_1              176477 non-null  uint8  
 12  Es_FinSemana_1            176477 non-null  uint8  
 13  TEMPORADA_OTONO           176477 non-null  u

#### Lista de Cluster/Estacion

In [8]:
df_EstacionesCluster = datos[['ESTACION', 'CLUSTER_soloDemanda']]
df_EstacionesCluster

Unnamed: 0,ESTACION,CLUSTER_soloDemanda
0,1,0
1,2,3
2,3,4
3,4,3
4,5,3
...,...,...
2912133,168,2
2912134,169,0
2912135,171,3
2912136,172,3


## 2. Creación de DF de Train y Test

In [9]:
# Semilla
seed = 1234567

# DF's de train y test
train, test = train_test_split(bicimad, test_size = 0.30, random_state = seed)    
X_train_data = train.drop(['DEMANDA'], axis=1)
X_test_data = test.drop(['DEMANDA'], axis=1)
y_train_data = train[['ESTACION','DEMANDA']]
y_test_data = test[['ESTACION','DEMANDA']]

In [10]:
X_train_data

Unnamed: 0,ESTACION,MES_sen,MES_cos,TEMP_MAX,TEMP_MIN,HUMEDAD,VIENTO,PRESION,PRECIPITACION_1h,PRECIPITACION_3h,...,DIA_SEMANA_5,DIA_SEMANA_6,DIA_SEMANA_7,DESC_TIEMPO_Clouds,DESC_TIEMPO_Drizzle,DESC_TIEMPO_Fog,DESC_TIEMPO_Mist,DESC_TIEMPO_Rain,DESC_TIEMPO_Snow,DESC_TIEMPO_Thunderstorm
103444,106,0.281733,-0.959493,-0.145568,-0.014994,0.493132,0.819561,0.114504,-0.128259,-0.197844,...,0,0,1,1,0,0,0,0,0,0
159041,159,0.540641,0.841254,-0.505304,-1.010285,0.560894,-0.865020,1.641102,-0.240464,-0.197844,...,0,0,0,0,0,0,0,0,0,0
151452,151,0.909632,0.415415,-0.627815,-0.829711,1.059119,-0.982730,1.017028,-0.240464,-0.197844,...,0,0,1,0,0,0,0,0,0,0
100271,103,0.755750,-0.654861,0.166278,-0.464297,-1.018385,-0.680970,0.231157,-0.240464,-0.197844,...,0,1,0,0,0,0,0,0,0,0
161872,161,0.909632,0.415415,-0.496394,-0.875210,1.052852,-0.219229,1.277059,-0.240464,-0.197844,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169954,169,0.755750,-0.654861,-0.456300,-0.576623,-0.032903,0.114961,0.822793,-0.240464,-0.197844,...,0,1,0,1,0,0,0,0,0,0
26950,29,-0.755750,-0.654861,1.150820,1.233386,-1.346196,0.721204,-1.215904,-0.240464,-0.197844,...,0,0,0,0,0,0,0,0,0,0
81524,85,-0.281733,-0.959493,0.997125,1.064186,-0.421769,0.234081,-0.662771,-0.240464,-0.197844,...,0,0,0,1,0,0,0,0,0,0
112255,114,-0.989821,-0.142315,1.202052,0.954704,-0.475545,-0.186576,0.532875,-0.240464,-0.197844,...,0,1,0,1,0,0,0,0,0,0


In [11]:
y= y_train_data[y_train_data['ESTACION']==1]['DEMANDA']
y.shape

(665,)

## 3. Entrenamiento de modelos para estaciones por Cluster

### CLUSTER 0

Se recorren las estaciones del Cluster y se entrena el modelo según el algoritmo e hiperparámetros definidos para el cluster

In [14]:
estaciones = df_EstacionesCluster[df_EstacionesCluster['CLUSTER_soloDemanda']==0]['ESTACION'].unique()
estaciones

array([  1,   6,  13,  19,  26,  38,  41,  45,  46,  49,  52,  59,  62,
        74,  84, 108, 114, 128, 132, 161, 164, 166, 169, 170,  31,  56,
        83, 133, 162,  79], dtype=int64)

In [15]:
# Loop sobre estaciones del cluster. Resultado se almacena en disco en objeto joblib

for estacion in estaciones:
    print(estacion)
    X_train = X_train_data[X_train_data['ESTACION'] == estacion].drop('ESTACION', axis=1)
    y_train = y_train_data[y_train_data['ESTACION'] == estacion]['DEMANDA']

    model_C0 = GradientBoostingRegressor(learning_rate=0.01,
      max_depth=80,
      n_estimators=500,
      subsample=0.2)

    resultado = model_C0.fit(X_train, y_train)
    joblib.dump(resultado, '../Modelos/Modelo_'+str(estacion)+'.pkl', compress=9)

1
6
13
19
26
38
41
45
46
49
52
59
62
74
84
108
114
128
132
161
164
166
169
170
31
56
83
133
162
79


### CLUSTER 1

In [16]:
estaciones = df_EstacionesCluster[df_EstacionesCluster['CLUSTER_soloDemanda']==1]['ESTACION'].unique()
estaciones

array([ 15,  23,  24,  29,  35,  37,  40,  97,  98, 100, 107, 109, 111,
       120, 122, 147, 158, 159, 174,  11,  28,  32,  36,  88, 117, 119,
       127, 137, 140, 143, 144, 152, 165,  34,  39,  60, 104, 112, 138,
       141, 151,  85, 173,  61, 150,  72, 101,  87, 105, 146], dtype=int64)

In [17]:
for estacion in estaciones:
    print(estacion)
    X_train = X_train_data[X_train_data['ESTACION'] == estacion].drop('ESTACION', axis=1)
    y_train = y_train_data[y_train_data['ESTACION'] == estacion]['DEMANDA']

    model_C1= RandomForestRegressor(bootstrap=True,
      max_depth=80,
      max_features=7,
      min_samples_leaf=1,
      min_samples_split=8,
      n_estimators=1000)

    resultado = model_C1.fit(X_train, y_train)
    joblib.dump(resultado, '../Modelos/Modelo_'+str(estacion)+'.pkl', compress=9)

15
23
24
29
35
37
40
97
98
100
107
109
111
120
122
147
158
159
174
11
28
32
36
88
117
119
127
137
140
143
144
152
165
34
39
60
104
112
138
141
151
85
173
61
150
72
101
87
105
146


### CLUSTER 2

In [18]:
estaciones = df_EstacionesCluster[df_EstacionesCluster['CLUSTER_soloDemanda']==2]['ESTACION'].unique()
estaciones

array([  9,  43,  57,  58,  64,  90, 129, 135, 160, 163, 168, 149, 175],
      dtype=int64)

In [19]:
for estacion in estaciones:
    print(estacion)
    X_train = X_train_data[X_train_data['ESTACION'] == estacion].drop('ESTACION', axis=1)
    y_train = y_train_data[y_train_data['ESTACION'] == estacion]['DEMANDA']

    model_C2 = XGBRegressor(gamma=0,
      learning_rate=0.1,
      max_depth=8,
      min_child_weight=3,
      n_estimators=1000,
      reg_alpha=0.1,
      reg_lambda=200)

    resultado = model_C2.fit(X_train, y_train)
    joblib.dump(resultado, '../Modelos/Modelo_'+str(estacion)+'.pkl', compress=9)

9
43
57
58
64
90
129
135
160
163
168
149
175


### CLUSTER 3

In [20]:
estaciones = df_EstacionesCluster[df_EstacionesCluster['CLUSTER_soloDemanda']==3]['ESTACION'].unique()
estaciones

array([  2,   4,   5,   7,  10,  12,  16,  18,  21,  25,  27,  33,  44,
        47,  50,  51,  54,  63,  66,  67,  77,  81,  93, 102, 110, 116,
       121, 123, 124, 125, 126, 134, 148, 153, 171,  20,  73,  80,  92,
       130, 142, 154, 167,  96, 172, 106,  82,  94,  14,  69,  89],
      dtype=int64)

In [21]:
for estacion in estaciones:
    print(estacion)
    X_train = X_train_data[X_train_data['ESTACION'] == estacion].drop('ESTACION', axis=1)
    y_train = y_train_data[y_train_data['ESTACION'] == estacion]['DEMANDA']

    model_C3 = RandomForestRegressor(bootstrap=True,
      max_depth=80,
      max_features=7,
      min_samples_leaf=1,
      min_samples_split=8,
      n_estimators=1000)

    resultado = model_C3.fit(X_train, y_train)
    joblib.dump(resultado, '../Modelos/Modelo_'+str(estacion)+'.pkl', compress=9)

2
4
5
7
10
12
16
18
21
25
27
33
44
47
50
51
54
63
66
67
77
81
93
102
110
116
121
123
124
125
126
134
148
153
171
20
73
80
92
130
142
154
167
96
172
106
82
94
14
69
89


### CLUSTER 4

In [22]:
estaciones = df_EstacionesCluster[df_EstacionesCluster['CLUSTER_soloDemanda']==4]['ESTACION'].unique()
estaciones

array([  3,   8,  17,  30,  42,  48,  53,  55,  71,  75,  76,  86,  95,
       103, 113, 115, 118, 131, 136, 145, 157,  65,  78, 155,  91, 156,
        99, 139], dtype=int64)

In [23]:
for estacion in estaciones:
    print(estacion)
    X_train = X_train_data[X_train_data['ESTACION'] == estacion].drop('ESTACION', axis=1)
    y_train = y_train_data[y_train_data['ESTACION'] == estacion]['DEMANDA']

    model_C4 = RandomForestRegressor(bootstrap=True,
      max_depth=80,
      max_features=7,
      min_samples_leaf=1,
      min_samples_split=8,
      n_estimators=1000)


    resultado = model_C4.fit(X_train, y_train)
    joblib.dump(resultado, '../Modelos/Modelo_'+str(estacion)+'.pkl', compress=9)

3
8
17
30
42
48
53
55
71
75
76
86
95
103
113
115
118
131
136
145
157
65
78
155
91
156
99
139
