In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import plotly.express as px
from sklearn import linear_model
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.pipeline import Pipeline
import warnings
# Silence every warning for the rest of the session; this keeps cell output
# short but also hides deprecation and convergence notices.
warnings.filterwarnings("ignore")
# Seed handed to the LightGBM regressors built later so their results are repeatable.
random_state = 1997

In [None]:
# Training rows are put in chronological order ('fecha') with a fresh 0..n-1 index;
# the test file is loaded as-is.
train = (
    pd.read_csv('/kaggle/input/forecasting-problem-unt/train_pandas.csv')
    .sort_values('fecha')
    .reset_index(drop=True)
)
test = pd.read_csv('/kaggle/input/forecasting-problem-unt/test_pandas.csv')

In [None]:
# Separate the forecasting target from the remaining (explanatory) columns.
x_train = train.drop(columns=['total_calls'])
y_train = train['total_calls']

In [None]:
def train_get_score_cv(x_train: pd.DataFrame, y_train: pd.Series, models,
                       pipeline_engine: Pipeline, features_selected: list=None,
                       k=5, random_state=199) -> pd.DataFrame:
    """
    Score every model with the mean MAPE obtained from time-series cross validation.

    The folds come from ``TimeSeriesSplit``: each fold trains on an initial
    segment of the rows and validates on the rows that follow it, so the input
    must already be in chronological order. Cross validation is used to
    simulate how each model reacts to unseen data.

    Input:
    x_train[pd.DataFrame]: explanatory variables fed to the models.
    y_train[pd.Series]: response variable, aligned row by row with ``x_train``.
    models[dict]: maps a display name to an estimator exposing ``fit``/``predict``.
        Each estimator is re-fitted in place on every fold.
    pipeline_engine[Pipeline]: optional preprocessing pipeline (``None`` to skip).
        It is re-fitted on the training part of every fold and then applied to
        both parts, so nothing leaks from validation rows. Its ``transform``
        must return a DataFrame.
    features_selected[list]: columns handed to the models. When a pipeline is
        used, the names refer to the columns *after* the transform. ``None``
        (default) means "every available column".
    k[int]: number of folds in the cross validation.
    random_state[int]: unused (``TimeSeriesSplit`` is deterministic); kept so
        existing callers that pass it keep working.

    Return:
    A dataframe indexed by model name with a single ``MAPE`` column holding the
    mean of the per-fold scores.
    """
    model_names = list(models.keys())

    # Remember the caller's explicit choice so a pipeline can no longer
    # silently replace it with "all transformed columns".
    requested_features = None if features_selected is None else list(features_selected)

    # The folds depend only on the data, not on the model: build them once.
    folds = list(TimeSeriesSplit(n_splits=k).split(x_train, y_train))

    mean_scores = []
    for name in model_names:
        learner = models[name]
        fold_scores = []
        print(f'Model: {name}')

        for fold, (id_train, id_test) in enumerate(folds):
            Xt, yt = x_train.iloc[id_train], y_train.iloc[id_train]
            Xv, yv = x_train.iloc[id_test], y_train.iloc[id_test]

            if pipeline_engine is not None:
                # Fit preprocessing on the training part only, then apply to both.
                fitted_pipeline = pipeline_engine.fit(Xt, yt)
                Xt = fitted_pipeline.transform(Xt)
                Xv = fitted_pipeline.transform(Xv)

            # Without an explicit selection use whatever columns exist at this
            # point (raw columns, or the pipeline's output columns).
            if requested_features is None:
                fold_features = list(Xt.columns)
            else:
                fold_features = requested_features

            learner.fit(Xt[fold_features], yt.values)
            prediction = pd.Series(learner.predict(Xv[fold_features]), index=Xv.index)

            mape_fold = mean_absolute_percentage_error(yv, prediction)
            fold_scores.append(mape_fold)
            print(f'Fold {fold}: Best mape score: {mape_fold}')

        mean_scores.append(np.mean(fold_scores))

    # One row per model, float dtype even when `models` is empty.
    result = pd.DataFrame(np.array(mean_scores, dtype=float).reshape(-1, 1),
                          columns=["MAPE"], index=model_names)

    return result

In [None]:
# Display (n_rows, n_columns) of the feature frame as the cell output.
x_train.shape

In [None]:
# Candidate estimators compared in cross validation: four LightGBM objectives
# plus two linear baselines. Insertion order is the order they are evaluated in.
models = dict(
    LGBM=LGBMRegressor(random_state=random_state),
    LGBM_Poisson=LGBMRegressor(objective='poisson', random_state=random_state),
    LGBM_Mape=LGBMRegressor(objective='mape', random_state=random_state),
    LGBM_quantile=LGBMRegressor(objective='quantile', random_state=random_state),
    LinearRegression=LinearRegression(),
    PoissonRegression=linear_model.PoissonRegressor(),
)

# EDA

In [None]:
# Take up to four feature columns (positions 1 through 4) to chart below.
# NOTE(review): presumably position 0 is 'fecha' and is skipped on purpose — confirm against the CSV layout.
columns_to_plot = x_train.columns[1:5]

## Target time series

In [None]:
# Line chart of the target ('total_calls') against the date column.
fig = px.line(data_frame=train, x='fecha', y='total_calls')
fig.show()

In [None]:
# One line chart per selected feature, drawn in column order.
for feature_name in columns_to_plot:
    fig = px.line(data_frame=train, x='fecha', y=feature_name)
    fig.show()

In [None]:
# Baseline: score all candidate models on every column except 'fecha',
# with 8 time-ordered folds and no preprocessing pipeline.
train_get_score_cv(
    x_train.drop(columns='fecha'),
    y_train,
    models,
    pipeline_engine=None,
    k=8,
)

In [None]:
# LightGBM regressor with the quantile objective, seeded for repeatability.
# NOTE(review): no `alpha` is passed, so the library's default quantile level applies
# (0.9 according to the LightGBM parameter docs, i.e. not the median) — confirm this is intended.
model = LGBMRegressor(random_state=random_state, objective='quantile')

In [None]:
# Fit on the full training set, leaving out the raw date column.
model.fit(x_train.drop(columns='fecha'), y_train)

In [None]:
# Forecast for the test rows as a one-column frame indexed by date.
prediction = pd.DataFrame(
    model.predict(test.drop(columns='fecha')),
    index=test['fecha'],
    columns=['total_calls'],
)

In [None]:
# Write the forecast to CSV; the 'fecha' index is saved as the first column.
prediction.to_csv('prediction_quantile.csv')

In [None]:
from dateutil.relativedelta import relativedelta
from sklearn.base import BaseEstimator, TransformerMixin

class AggDayofWeek(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds a ``dayofweek`` column derived from a date column.

    The date column is parsed with ``pd.to_datetime``, moved forward by one
    calendar month, and the weekday of the shifted date (Monday=0 ... Sunday=6)
    is stored in ``dayofweek``.

    NOTE(review): the weekday is taken from the date one month ahead, not from
    the date itself — presumably to line up with a one-month forecast horizon;
    confirm this is intended.

    Parameters
    ----------
    fecha_column : str
        Name of the column holding the dates.
    """

    def __init__(self, fecha_column: str):
        self.fecha_column = fecha_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for pipeline use."""
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``X`` with the extra ``dayofweek`` column.

        The input frame is left untouched, so frames passed through a pipeline
        (training data, test data) do not silently gain a column between cells.
        """
        X = X.copy()
        # Vectorised month shift; end-of-month dates are clamped to the last
        # valid day of the following month.
        shifted_dates = pd.to_datetime(X[self.fecha_column]) + pd.DateOffset(months=1)
        X['dayofweek'] = shifted_dates.dt.dayofweek
        return X
    
    

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropFeatures
from feature_engine.encoding import OneHotEncoder
# LagFeatures was used below without ever being imported (NameError on a fresh kernel).
from feature_engine.timeseries.forecasting import LagFeatures
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper

# Individual preprocessing steps.
to_drop = ['fecha']
drop_features = DropFeatures(features_to_drop = to_drop)
agg_dates = AggDayofWeek(fecha_column = 'fecha')
# Built for experimentation but NOT part of the pipeline below.
one_encoder = OneHotEncoder(variables = ['dayofweek'], ignore_format=True)
lag_fet = LagFeatures()

# Order matters: the weekday must be derived from 'fecha' before that column
# is dropped; scaling is applied last, through the feature_engine wrapper.
data_pipeliene = Pipeline([ ('agg_dates', agg_dates),
                            ('drop_features', drop_features),
                            ('StandardScaler', SklearnTransformerWrapper(StandardScaler()))
                            ])


In [None]:
# Same comparison as the baseline, but with the preprocessing pipeline
# re-fitted inside every one of the 8 folds.
train_get_score_cv(
    x_train,
    y_train,
    models,
    pipeline_engine=data_pipeliene,
    k=8,
)

In [None]:
# Final estimator: preprocessing followed by the quantile-objective LightGBM regressor.
model = Pipeline(steps=[
    ('data_pipeline', data_pipeliene),
    ('modelo', LGBMRegressor(objective='quantile', random_state=random_state)),
])

In [None]:
# Fit the full pipeline on all training rows, forecast the test rows and
# save the result with the date as the CSV's first column.
model.fit(x_train, y_train)
prediction = pd.DataFrame(
    model.predict(test),
    index=test['fecha'],
    columns=['total_calls'],
)
prediction.to_csv('prediction_quantile_v2.csv')