# SOLUCIÓN IMPLEMENTADA
Opto por LightGBM, ya que el embebimiento de entidades no dio los resultados esperados. Esta herramienta está basada en el método de impulso de gradiente (gradient boosting) utilizando árboles de decisión. La idea es ir combinando modelos "débiles", corrigiendo en cada iteración los errores más importantes que perjudican el puntaje, hasta conformar un modelo lo suficientemente robusto.

Vale la pena mencionar que, a diferencia del trabajo de clasificación de imágenes de CIFAR, una de las ventajas de poder utilizar estos métodos es que el entrenamiento es relativamente simple de configurar y, sobre todo, rápido; ni siquiera fue necesario recurrir a GPU o TPU para que el script completo pueda correr en menos de tres minutos.

In [114]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Build a TF1-compat session config with `gpu_options.allow_growth` switched on
# (presumably so TensorFlow grows GPU memory on demand instead of reserving it
# all up front — confirm), then open an interactive session with it.
# NOTE(review): no cell visible in this section reads `config` or `session`, and
# the model trained below is LightGBM — this may be a leftover; verify before removing.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)



https://www.youtube.com/watch?v=1-NYPQw5THU&feature=youtu.be

In [115]:
import pandas as pd
import numpy as np
import datetime
from pandas_summary import DataFrameSummary

In [116]:
# Folder holding the pre-processed feather files. The trailing slash matters:
# file names are appended to this string directly.
PATH = "/kaggle/input/rossmann-normalize-and-encode/"

# Load the training frame first, then the test frame.
df, df_test = (
    pd.read_feather(f"{PATH}{file_name}")
    for file_name in ('train_normalized_data.fth', 'test_normalized_data.fth')
)

In [117]:
# Columns handed to LightGBM as categorical features.
# The order is significant: it fixes the column order of the feature matrices.
cat_vars = [
    "Store",
    "DayOfWeek",
    "Year",
    "Month",
    "Day",
    "StateHoliday",
    "CompetitionMonthsOpen",
    "Promo2Weeks",
    "StoreType",
    "Assortment",
    "PromoInterval",
    "CompetitionOpenSinceYear",
    "Promo2SinceYear",
    "State",
    "Week",
    "Events",
    "Promo_fw",
    "Promo_bw",
    "SchoolHoliday_fw",
    "SchoolHoliday_bw",
]

# Reduced alternative, kept for experiments:
# cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'State']

In [118]:
# Columns treated as continuous (numeric) features.
# The order is significant: these follow the categorical columns in the feature matrices.
contin_vars = [
    "CompetitionDistance",
    "Max_TemperatureC",
    "Mean_TemperatureC",
    "Min_TemperatureC",
    "Max_Humidity",
    "Mean_Humidity",
    "Min_Humidity",
    "Max_Wind_SpeedKm_h",
    "Mean_Wind_SpeedKm_h",
    "CloudCover",
    "trend",
    "trend_DE",
    "Promo",
    "SchoolHoliday",
]
# Alternative with no continuous features, kept for experiments:
# contin_vars = []

In [119]:
from lightgbm import LGBMRegressor, early_stopping

In [120]:
# Target column(s). Kept as a list, so selecting with it yields a one-column DataFrame.
y_out_columns = ["Sales"]

In [121]:
# Time-based hold-out: rows dated on or after 2015-07-01 form the validation set,
# everything earlier is used for training.
val_start = datetime.datetime(2015, 7, 1)
df_train = df[df.Date < val_start]
df_val = df[df.Date >= val_start]

n_train, n_val = len(df_train), len(df_val)
# The ratio printed after "porcentaje" is the share of rows kept for *training*.
print(f'Cantidad en val: {n_val}, porcentaje: {n_train/(n_train + n_val)}')

Cantidad en val: 30188, porcentaje: 0.9642465458145908


In [122]:
# Feature matrices: categorical columns first, continuous columns after,
# with the same column order for every split.
feature_columns = cat_vars + contin_vars
X_train, X_val, X_test = (
    frame[feature_columns] for frame in (df_train, df_val, df_test)
)

In [123]:
# Sanity check: (rows, columns) of the training and validation feature matrices.
X_train.shape, X_val.shape

((814150, 34), (30188, 34))

In [124]:
# Target transform: True -> scaled log of the target, False -> z-score standardisation.
log_output = True

if log_output:
    # Log scale: log(Sales) divided by the largest log(Sales) found in `df`.
    # The maximum is reduced over the raw NumPy values and cast to float so that
    # `max_log_y` is always a plain scalar. `np.max` on a DataFrame returns a
    # per-column Series on pandas < 2.0 and a scalar only from 2.0 onwards, and a
    # Series here cannot be multiplied with the 1-D prediction arrays later on.
    max_log_y = float(np.log(df[y_out_columns].to_numpy()).max())
    y_train = np.log(df_train[y_out_columns])/max_log_y
    y_val = np.log(df_val[y_out_columns])/max_log_y
else:
    # Standardisation with training-set statistics.
    # `y_mean` / `y_std` are one-element Series (one entry per target column).
    y_mean = df_train[y_out_columns].mean()
    y_std = df_train[y_out_columns].std()
    y_train = (df_train[y_out_columns] - y_mean)/y_std
    y_val = (df_val[y_out_columns] - y_mean)/y_std

In [125]:
# TRAINING PARAMETERS
min_child_samples = 5   # minimum number of samples required for a new tree leaf
n_estimators = 4000     # number of estimators
learning_rate = 0.05    # learning rate

model = LGBMRegressor(
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    min_child_samples=min_child_samples,
)

In [126]:
# Keyword arguments forwarded to `model.fit`: evaluate L2 on the validation split
# (reported under the name "valid") and declare which columns are categorical.
fit_params = dict(
    eval_metric='l2',
    eval_set=[(X_val, y_val)],
    eval_names=['valid'],
    feature_name='auto',  # that's actually the default
    categorical_feature=cat_vars,
)

In [127]:
# Stop training once the validation score has not improved for 100 rounds.
stop_when_stalled = early_stopping(stopping_rounds=100)

model.fit(X_train, y_train, callbacks=[stop_when_stalled], **fit_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2166
[LightGBM] [Info] Number of data points in the train set: 814150, number of used features: 34
[LightGBM] [Info] Start training from score 0.823418
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2307]	valid's l2: 0.000117237


# Métrica

$$
\textrm{RMSPE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2}
$$

In [128]:
# Built-in estimator score on the validation split, computed on the *scaled* target.
# NOTE(review): presumably the coefficient of determination (R^2), the scikit-learn
# regressor default — not the relative-error metric defined above; confirm.
model.score(X_val, y_val)

0.9178732228198393

In [129]:
def _predict_sales(features):
    """Predict with `model` and map the scaled output back to sales units.

    Inverts whichever target transform `log_output` selects:
    exp(pred * max_log_y) for the log scale, pred * y_std + y_mean for the
    standardisation.

    Parameters
    ----------
    features : feature matrix with the columns the model was fitted on.

    Returns
    -------
    1-D array of predictions in the original target units.
    """
    scaled = model.predict(features, verbose=1)

    def _scalar(stat):
        # The scaling statistics are derived from a one-column frame, so they can be
        # one-element Series rather than plain numbers (always for y_mean / y_std,
        # and for max_log_y on pandas < 2.0). Multiplying the 1-D prediction array
        # by such a Series fails with a length mismatch, so reduce to a float first.
        return float(np.asarray(stat).ravel()[0])

    if log_output:
        return np.exp(scaled * _scalar(max_log_y))
    return scaled * _scalar(y_std) + _scalar(y_mean)


y_pred_train = _predict_sales(X_train)
y_pred = _predict_sales(X_val)
y_pred_test = _predict_sales(X_test)

In [130]:
# Training set: root mean squared *relative* error between actual and predicted sales.
actual_train = df_train['Sales'].values
rel_err_train = (actual_train - y_pred_train) / actual_train
np.sqrt((rel_err_train ** 2).sum() / len(y_pred_train))

0.08420594024192386

In [131]:
# Validation set: root mean squared *relative* error between actual and predicted sales.
actual_val = df_val['Sales'].values
rel_err_val = (actual_val - y_pred) / actual_val
np.sqrt((rel_err_val ** 2).sum() / len(y_pred))

0.11959078040913741

# Baseline

In [134]:
# Baseline: predict, for every test row, the store's mean of strictly positive sales.
# Non-positive rows are masked to NaN so the mean ignores them, while every store
# present in `df` still gets an entry (NaN when it has no positive sales at all).
positive_sales = df['Sales'].where(df['Sales'] > 0)
stores_mean = positive_sales.groupby(df['Store']).mean().to_dict()

# NOTE: this adds a 'Sales' column to `df_test` in place.
df_test['Sales'] = df_test['Store'].apply(stores_mean.get)
# Rows flagged as closed (Open == 0) get a baseline of zero.
df_test.loc[df_test['Open'] == 0, 'Sales'] = 0

# Display a few closed-store rows to confirm the zero override took effect.
df_test[df_test['Open'] == 0][['Store', 'Sales']].head()

Unnamed: 0,Store,Sales
543,702,0.0
676,878,0.0
840,1096,0.0
1399,702,0.0
1532,878,0.0


# SUBMIT

In [139]:
import pandas as pd
# Submission file: one "Sales" prediction per test row, indexed by a 1-based "Id".
# NOTE(review): assumes `y_pred_test` is ordered like the expected Ids — confirm.
df_out = pd.DataFrame({"Sales": y_pred_test})
df_out.index = pd.RangeIndex(start=1, stop=len(df_out) + 1, name="Id")
df_out.to_csv('submision_lightgbm.csv')

In [140]:
# Preview the first rows of the submission frame.
df_out.head()

Unnamed: 0_level_0,Sales
Id,Unnamed: 1_level_1
1,4060.598578
2,7182.134591
3,8778.916188
4,7339.658251
5,7295.585816
