# Trabajo 4: Competencia de modelos de Predicción para el número de vehículos registrados

Esteban Rojas Henao, Sergio Andrés Zambrano, Miguel Angel Rojas

In [1]:
import datetime as dt
import glob
import io

import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Activation, Flatten
from keras.models import Sequential
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Lectura de datos

In [2]:
# Read the training data.
# A copy of the workbook already present in the working directory is reused,
# so re-running the notebook does not require a new manual upload.
DATA_FILE = 'registros_autos_entrenamiento.xlsx'

try:
    df = pd.read_excel(DATA_FILE)
except FileNotFoundError:
    # Colab-only fallback: ask the user to upload the workbook.
    from google.colab import files
    uploaded = files.upload()

    if DATA_FILE in uploaded:
        contenido = uploaded[DATA_FILE]
    elif len(uploaded) == 1:
        # The upload may be stored under a different key (e.g. a renamed
        # duplicate); with a single uploaded file there is no ambiguity.
        contenido = next(iter(uploaded.values()))
    else:
        raise FileNotFoundError(
            "Expected an upload named '%s', got: %s" % (DATA_FILE, sorted(uploaded))
        )

    df = pd.read_excel(io.BytesIO(contenido))

Saving registros_autos_entrenamiento.xlsx to registros_autos_entrenamiento.xlsx


In [3]:
# Display the frame exactly as it was read from the workbook.
df

Unnamed: 0,Fecha,Unidades
0,2012-01-01,0.0
1,2012-01-02,188.0
2,2012-01-03,482.0
3,2012-01-04,927.0
4,2012-01-05,1159.0
...,...,...
2187,2017-12-27,2102.0
2188,2017-12-28,3449.0
2189,2017-12-29,1084.0
2190,2017-12-30,18.0


# Características

Los datos leídos están compuestos por dos columnas: Fecha y Unidades.

Fecha: fecha en la que se registró cierta cantidad de autos en el RUNT.

Unidades: cantidad de vehículos registrados en el Registro Único Nacional de Tránsito (RUNT).


Como se necesita filtrar por años, se crea otra columna con la función get_year, que devuelve el año; también se crean columnas para el día (get_day) y el mes (get_month).


In [4]:
def get_day(dt):
    """Return the ``day`` attribute (day of the month) of a date-like value."""
    day_of_month = dt.day
    return day_of_month

def get_month(dt):
    """Return the ``month`` attribute (1-12) of a date-like value."""
    month_number = dt.month
    return month_number
 
def get_year(dt):
    """Return the ``year`` attribute of a date-like value."""
    calendar_year = dt.year
    return calendar_year
 
def database(df):
    """Add calendar feature columns derived from the ``Fecha`` column.

    The frame is modified in place: ``Fecha`` is converted to datetime and
    the columns ``Dia``, ``Mes`` and ``Year`` are added (or overwritten).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Fecha`` column holding dates, or values that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the extra columns.
    """
    # Vectorised conversion and attribute access: one pandas call per column
    # instead of one Python call per row.
    df['Fecha'] = pd.to_datetime(df['Fecha'])
    fechas = df['Fecha'].dt
    df['Dia'] = fechas.day
    df['Mes'] = fechas.month
    df['Year'] = fechas.year
    return df

# database() adds Dia/Mes/Year to df in place; the returned frame is only
# displayed here. Later cells filter on df['Year'], so this cell must run first.
database(df)

Unnamed: 0,Fecha,Unidades,Dia,Mes,Year
0,2012-01-01,0.0,1,1,2012
1,2012-01-02,188.0,2,1,2012
2,2012-01-03,482.0,3,1,2012
3,2012-01-04,927.0,4,1,2012
4,2012-01-05,1159.0,5,1,2012
...,...,...,...,...,...
2187,2017-12-27,2102.0,27,12,2017
2188,2017-12-28,3449.0,28,12,2017
2189,2017-12-29,1084.0,29,12,2017
2190,2017-12-30,18.0,30,12,2017


In [5]:
# Preview the first rows, now including the Dia/Mes/Year feature columns.
df.head()

Unnamed: 0,Fecha,Unidades,Dia,Mes,Year
0,2012-01-01,0.0,1,1,2012
1,2012-01-02,188.0,2,1,2012
2,2012-01-03,482.0,3,1,2012
3,2012-01-04,927.0,4,1,2012
4,2012-01-05,1159.0,5,1,2012


Se construye el conjunto de fechas para las predicciones de los días comprendidos entre el 01/01/2012 y el 31/12/2017.

In [6]:
# Daily calendar from 2012-01-01 to 2017-12-31 (both inclusive), with the
# same Dia/Mes/Year features that database() adds to the training data.
inicio_2012_2017 = dt.date(2012, 1, 1)
fin_2012_2017 = dt.date(2017, 12, 31)
# 'D' is the canonical calendar-day offset alias (the lowercase form is not).
fechas_2012_2017 = pd.date_range(inicio_2012_2017, fin_2012_2017, freq='D')
df_2012_2017 = pd.DataFrame({'Fecha': fechas_2012_2017})
df_2012_2017 = database(df_2012_2017)
df_2012_2017

Unnamed: 0,Fecha,Dia,Mes,Year
0,2012-01-01,1,1,2012
1,2012-01-02,2,1,2012
2,2012-01-03,3,1,2012
3,2012-01-04,4,1,2012
4,2012-01-05,5,1,2012
...,...,...,...,...
2187,2017-12-27,27,12,2017
2188,2017-12-28,28,12,2017
2189,2017-12-29,29,12,2017
2190,2017-12-30,30,12,2017


Se construye el conjunto de fechas para las predicciones de los días comprendidos entre el 01/01/2018 y el 30/06/2018.

In [7]:
# Daily calendar from 2018-01-01 to 2018-06-30 (both inclusive), with the
# same Dia/Mes/Year features that database() adds to the training data.
inicio_2018 = dt.date(2018, 1, 1)
fin_2018 = dt.date(2018, 6, 30)
# 'D' is the canonical calendar-day offset alias (the lowercase form is not).
fechas_2018 = pd.date_range(inicio_2018, fin_2018, freq='D')
df_2018 = pd.DataFrame({'Fecha': fechas_2018})
df_2018 = database(df_2018)
df_2018

Unnamed: 0,Fecha,Dia,Mes,Year
0,2018-01-01,1,1,2018
1,2018-01-02,2,1,2018
2,2018-01-03,3,1,2018
3,2018-01-04,4,1,2018
4,2018-01-05,5,1,2018
...,...,...,...,...
176,2018-06-26,26,6,2018
177,2018-06-27,27,6,2018
178,2018-06-28,28,6,2018
179,2018-06-29,29,6,2018


In [8]:
# Model input for 2018: the calendar frame without the raw date column.
predict_2018 = df_2018.drop(columns=['Fecha'])
predict_2018

Unnamed: 0,Dia,Mes,Year
0,1,1,2018
1,2,1,2018
2,3,1,2018
3,4,1,2018
4,5,1,2018
...,...,...,...
176,26,6,2018
177,27,6,2018
178,28,6,2018
179,29,6,2018


In [9]:
# Model input for 2012-2017: the calendar frame without the raw date column.
predict_2012_2017 = df_2012_2017.drop(columns=['Fecha'])
predict_2012_2017

Unnamed: 0,Dia,Mes,Year
0,1,1,2012
1,2,1,2012
2,3,1,2012
3,4,1,2012
4,5,1,2012
...,...,...,...
2187,27,12,2017
2188,28,12,2017
2189,29,12,2017
2190,30,12,2017


# Entrenamiento y validación

Para entrenamiento se toma toda la información hasta el 31 de diciembre de 2016 y para validación todo el año 2017

Entrenamiento: entre 01/01/2012 y 31/12/2016.

Validación: entre 01/01/2017 y 31/12/2017.


In [10]:
# Training split: every row whose Year is 2016 or earlier.
es_entrenamiento = df['Year'] <= 2016
df_training = df.loc[es_entrenamiento].drop(columns=['Fecha'])
# Features are everything except the target column.
training = df_training.drop(columns=['Unidades'])
training_target = df_training['Unidades']
print(training)
training_target

      Dia  Mes  Year
0       1    1  2012
1       2    1  2012
2       3    1  2012
3       4    1  2012
4       5    1  2012
...   ...  ...   ...
1822   27   12  2016
1823   28   12  2016
1824   29   12  2016
1825   30   12  2016
1826   31   12  2016

[1827 rows x 3 columns]


0          0.0
1        188.0
2        482.0
3        927.0
4       1159.0
         ...  
1822    1922.0
1823    2409.0
1824    3603.0
1825     965.0
1826      46.0
Name: Unidades, Length: 1827, dtype: float64

In [11]:
# Validation split: the rows whose Year is 2017.
es_validacion = df['Year'] == 2017
df_validation = df.loc[es_validacion].drop(columns=['Fecha'])
# Features are everything except the target column.
validation = df_validation.drop(columns=['Unidades'])
validation_target = df_validation['Unidades']
print(validation)
validation_target

      Dia  Mes  Year
1827    1    1  2017
1828    2    1  2017
1829    3    1  2017
1830    4    1  2017
1831    5    1  2017
...   ...  ...   ...
2187   27   12  2017
2188   28   12  2017
2189   29   12  2017
2190   30   12  2017
2191   31   12  2017

[365 rows x 3 columns]


1827       0.0
1828      45.0
1829     116.0
1830     232.0
1831     455.0
         ...  
2187    2102.0
2188    3449.0
2189    1084.0
2190      18.0
2191       0.0
Name: Unidades, Length: 365, dtype: float64

In [12]:
# Rebuilds the 2017 validation split from df; it repeats the statements of
# the preceding cell and produces the same three variables.
mascara_2017 = df['Year'].eq(2017)
df_validation = df[mascara_2017].drop(labels=['Fecha'], axis='columns')
validation = df_validation.drop(labels=['Unidades'], axis='columns')
validation_target = df_validation.loc[:, 'Unidades']
print(validation)
validation_target

      Dia  Mes  Year
1827    1    1  2017
1828    2    1  2017
1829    3    1  2017
1830    4    1  2017
1831    5    1  2017
...   ...  ...   ...
2187   27   12  2017
2188   28   12  2017
2189   29   12  2017
2190   30   12  2017
2191   31   12  2017

[365 rows x 3 columns]


1827       0.0
1828      45.0
1829     116.0
1830     232.0
1831     455.0
         ...  
2187    2102.0
2188    3449.0
2189    1084.0
2190      18.0
2191       0.0
Name: Unidades, Length: 365, dtype: float64

# Modelos
Se prueban varios modelos para escoger el mejor en predecir el número de vehículos registrados.

# Modelo usando Regresión Lineal

Se entrena y se valida el modelo.

In [13]:
# Ordinary least squares on the Dia/Mes/Year features; fit() returns the
# estimator itself, so construction and fitting are chained.
regr_model = linear_model.LinearRegression().fit(training, training_target)
# R² measured on the same rows the model was fitted on.
y_predicted_r = regr_model.predict(training)
r2_training_r = r2_score(y_true=training_target, y_pred=y_predicted_r)
r2_training_r

0.060974189310455396

In [14]:
# R² of the linear model on the held-out 2017 rows.
y_validation_r = regr_model.predict(validation)
r2_validation_r = r2_score(y_true=validation_target, y_pred=y_validation_r)
r2_validation_r

0.10129579125537447

In [15]:
# Difference between training and validation R² for the linear model.
r2_training_r - r2_validation_r

-0.04032160194491907

# Modelo usando Bosques Aleatorios

Se entrena y se valida el modelo.

In [16]:
# Random forest regressor with default hyper-parameters.
# A fixed random_state makes the fitted forest, and therefore the R² values
# reported below, reproducible from one run of the notebook to the next.
RF_model = RandomForestRegressor(random_state=42)

RF_model.fit(training, training_target)
# R² measured on the same rows the model was fitted on.
y_predicted_rf = RF_model.predict(training)
r2_training_rf = r2_score(training_target, y_predicted_rf)
r2_training_rf

0.8870616164709797

In [17]:
# R² of the random forest on the held-out 2017 rows.
y_validation_rf = RF_model.predict(validation)
r2_validation_rf = r2_score(y_true=validation_target, y_pred=y_validation_rf)
r2_validation_rf

0.1805650483208996

In [18]:
# Difference between training and validation R² for the random forest.
r2_training_rf - r2_validation_rf

0.7064965681500801

# Modelo usando Regresión Lasso

Se entrena y se valida el modelo.

In [19]:
# Lasso regression with the library's default settings; fit() returns the
# estimator itself, so construction and fitting are chained.
lasso = linear_model.Lasso().fit(training, training_target)
# R² measured on the same rows the model was fitted on.
y_predicted_lasso = lasso.predict(training)
r2_training_lasso = r2_score(y_true=training_target, y_pred=y_predicted_lasso)
r2_training_lasso

0.060972234147651516

In [20]:
# R² of the lasso model on the held-out 2017 rows.
y_validation_lasso = lasso.predict(validation)
r2_validation_lasso = r2_score(y_true=validation_target, y_pred=y_validation_lasso)
r2_validation_lasso

0.10075119269293853

In [21]:
# Difference between training and validation R² for the lasso model.
r2_training_lasso - r2_validation_lasso

-0.03977895854528701

# Modelo usando Redes Neuronales con Regresión

In [22]:
# (rows, features); the second value is used as the network's input dimension.
training.shape

(1827, 3)

In [23]:
# Fully connected regression network, declared as an ordered list of layers.
capas = [
    # Input layer: one input per feature column of `training`.
    Dense(128, kernel_initializer='normal', input_dim=training.shape[1], activation='relu'),
    # Three hidden layers of equal width.
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    # Output layer: a single linear unit for the predicted number of units.
    Dense(1, kernel_initializer='normal', activation='linear'),
]
NN_model = Sequential(capas)

# Mean absolute error is both the training loss and the reported metric.
NN_model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)
NN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               512       
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 256)               65792     
                                                                 
 dense_4 (Dense)             (None, 1)                 257       
                                                                 
Total params: 165,377
Trainable params: 165,377
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Write a weights file whenever the monitored val_loss improves; the file
# name records the epoch number and the val_loss reached at that epoch.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

In [25]:
# Train the network; 20% of the training rows are held out through
# validation_split, and the checkpoint callback monitors their loss.
opciones_ajuste = dict(
    epochs=45,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks_list,
)
NN_model.fit(training, training_target, **opciones_ajuste)

Epoch 1/45
Epoch 1: val_loss improved from inf to 440.54956, saving model to Weights-001--440.54956.hdf5
Epoch 2/45
Epoch 2: val_loss did not improve from 440.54956
Epoch 3/45
Epoch 3: val_loss did not improve from 440.54956
Epoch 4/45
Epoch 4: val_loss improved from 440.54956 to 428.36765, saving model to Weights-004--428.36765.hdf5
Epoch 5/45
Epoch 5: val_loss did not improve from 428.36765
Epoch 6/45
Epoch 6: val_loss did not improve from 428.36765
Epoch 7/45
Epoch 7: val_loss improved from 428.36765 to 408.86786, saving model to Weights-007--408.86786.hdf5
Epoch 8/45
Epoch 8: val_loss did not improve from 408.86786
Epoch 9/45
Epoch 9: val_loss did not improve from 408.86786
Epoch 10/45
Epoch 10: val_loss did not improve from 408.86786
Epoch 11/45
Epoch 11: val_loss did not improve from 408.86786
Epoch 12/45
Epoch 12: val_loss did not improve from 408.86786
Epoch 13/45
Epoch 13: val_loss did not improve from 408.86786
Epoch 14/45
Epoch 14: val_loss did not improve from 408.86786
Epo

<keras.callbacks.History at 0x7f8a313ff790>

In [27]:
# Load the weights file of the best model.
# Checkpoint names embed the epoch and val_loss of one particular training
# run, so a hard-coded name stops existing as soon as training is re-run.
# Instead, pick the checkpoint on disk whose file name carries the lowest
# val_loss.
# NOTE(review): checkpoints left over from earlier runs in the same directory
# also match the pattern - clear them first if they should not compete.
archivos_checkpoint = glob.glob('Weights-*--*.hdf5')
if not archivos_checkpoint:
    raise FileNotFoundError(
        "No 'Weights-*--*.hdf5' checkpoint found; run the training cell first."
    )


def _val_loss_de_checkpoint(ruta):
    """Return the val_loss encoded in a 'Weights-EPOCH--LOSS.hdf5' file name."""
    return float(ruta[:-len('.hdf5')].rsplit('--', 1)[1])


wights_file = min(archivos_checkpoint, key=_val_loss_de_checkpoint)  # choose the best checkpoint
NN_model.load_weights(wights_file)  # load it
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

In [28]:
# R² of the network (with the loaded weights) on the training rows.
y_predicted_nn = NN_model.predict(training)
r2_training_nn = r2_score(y_true=training_target, y_pred=y_predicted_nn)
r2_training_nn

0.02933669984218623

In [29]:
# R² of the network (with the loaded weights) on the held-out 2017 rows.
y_validation_nn = NN_model.predict(validation)
r2_validation_nn = r2_score(y_true=validation_target, y_pred=y_validation_nn)
r2_validation_nn

-0.07003240114827403

In [30]:
# Difference between training and validation R² for the neural network.
r2_training_nn-r2_validation_nn

0.09936910099046026

# Modelo Ganador

Se escogió el modelo que usa Redes Neuronales con Regresión.

Se hacen las predicciones para el periodo 2012-2017 y para el primer semestre de 2018.

In [44]:
# Predict every day of 2012-2017 with the network and store the result as a
# new column next to the dates. The network's last layer has a single unit,
# so predict() yields one value per row.
# NOTE(review): the saved output of this cell lists only Fecha and
# Prediccion_NN, while df_2012_2017 as built earlier also carries
# Dia/Mes/Year - confirm which columns are meant to be exported.
y_predicted_2012_2017_nn = NN_model.predict(predict_2012_2017)
df_2012_2017['Prediccion_NN'] = y_predicted_2012_2017_nn
df_2012_2017

Unnamed: 0,Fecha,Prediccion_NN
0,2012-01-01,732.432495
1,2012-01-02,737.939087
2,2012-01-03,743.673706
3,2012-01-04,749.709473
4,2012-01-05,755.876709
...,...,...
2187,2017-12-27,958.589844
2188,2017-12-28,965.412048
2189,2017-12-29,972.234253
2190,2017-12-30,979.056335


Los valores para el 2018 con este modelo:

In [46]:
# Predict every day of the first half of 2018 with the network and store the
# result as a new column next to the dates. The network's last layer has a
# single unit, so predict() yields one value per row.
# NOTE(review): the saved output of this cell lists only Fecha and
# Prediccion_NN, while df_2018 as built earlier also carries Dia/Mes/Year -
# confirm which columns are meant to be exported.
y_predicted_2018_nn = NN_model.predict(predict_2018)
df_2018['Prediccion_NN'] = y_predicted_2018_nn
df_2018

Unnamed: 0,Fecha,Prediccion_NN
0,2018-01-01,734.587585
1,2018-01-02,740.092346
2,2018-01-03,745.827026
3,2018-01-04,751.857666
4,2018-01-05,758.024963
...,...,...
176,2018-06-26,923.908752
177,2018-06-27,930.730896
178,2018-06-28,937.553040
179,2018-06-29,944.375183


Se exportan los datos a los archivos predict_2018.csv y predict_2012_2017.csv

In [47]:
# Write both prediction frames to CSV without the row index.
exportaciones = (
    (df_2018, 'predict_2018.csv'),
    (df_2012_2017, 'predict_2012_2017.csv'),
)
for tabla, destino in exportaciones:
    tabla.to_csv(destino, index=False)
