In [None]:
import warnings
warnings.filterwarnings('ignore')
from category_encoders import LeaveOneOutEncoder,WOEEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's theme to all subsequent plots, using the "Dark2" colour palette.
sns.set_theme(palette = "Dark2")
# Two hard-coded RGB triples (a green and an orange) for manual use in plots.
# NOTE(review): presumably the first two colours of the "Dark2" palette — confirm.
my_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
  (0.8509803921568627, 0.37254901960784315, 0.00784313725490196)]
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
from itertools import chain, combinations
import math

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostClassifier, Pool
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import tensorflow as tensorflow
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler

In [1]:
# Create a class for all the models
class Model:
    """
    Data split, target encoding and classifiers used to predict the ``return`` column.

    Attributes
    ----------
    df : pd.DataFrame
        The data set handed to the constructor. It is never modified by the methods.
    k : int
        Largest ``order_item_id`` minus one. ``split_data`` uses it as the last
        index label (inclusive) of the rows that are split into train and test.
    """

    # Name of the target column used by every method.
    _TARGET = "return"
    # Columns removed from both splits before modelling.
    _COLUMNS_TO_DROP = ["order_date", "delivery_date", "user_dob", "user_reg_date", "order_id", "order_item_id"]

    def __init__(self, df:pd.DataFrame):
        self.df = df
        # Series.max() instead of the builtin max(): same result, vectorised.
        self.k = df["order_item_id"].max()-1

    @staticmethod
    def _split_target(df:pd.DataFrame)->tuple:
        """Return ``(features, target)`` where target is the ``return`` column of ``df``."""
        return df.drop([Model._TARGET], axis=1), df[Model._TARGET]

    def _fit_and_score(self, model:object, df_train:pd.DataFrame, df_test:pd.DataFrame, verbose:int)->tuple:
        """Fit ``model`` on the training frame, predict the test frame and compute the MAE."""
        X_train, Y_train = self._split_target(df_train)
        X_test, Y_test = self._split_target(df_test)
        model.fit(X_train, Y_train, verbose=verbose)
        Y_pred = model.predict(X_test)
        mae = mean_absolute_error(Y_test, Y_pred)
        return model, Y_pred, mae

    def split_data(self, historic:bool)->tuple:
        """
        Split the data into train and test sets by order month.

        Only the rows up to index label ``self.k`` (inclusive) are considered;
        later rows end up in neither split.

        Parameters
        ----------
        historic : bool
            0 (False): train on the months April to February, test on March.
            1 (True): train on the months April to December, test on January to March.

        Returns
        -------
        tuple
            df_train : pd.DataFrame
                Dataframe with training data.
            df_test : pd.DataFrame
                Dataframe with testing data.
        """
        # NOTE(review): the previous code tried `remove(12,1,2,3)`, which raises a
        # TypeError (list.remove takes one argument) and would also have excluded
        # December. The documented contract (test = January to March) is implemented
        # here — confirm that December belongs to the training months.
        months_to_test = {1, 2, 3} if historic else {3}
        months_to_train = [month for month in range(1, 13) if month not in months_to_test]

        # Label-based slice: keeps every row whose index label is <= self.k.
        df_window = self.df.loc[:self.k]
        in_train = df_window["order_month"].isin(months_to_train)

        # drop() without inplace returns new frames, so self.df stays untouched and
        # no chained-assignment warning is triggered.
        df_train = df_window[in_train].drop(columns=self._COLUMNS_TO_DROP)
        df_test = df_window[~in_train].drop(columns=self._COLUMNS_TO_DROP)
        return df_train, df_test

    def LOE_Encoder(self, df_train:pd.DataFrame, df_test:pd.DataFrame, columns:list ,sig:float)->tuple:
        """
        Leave One Out Encoder to calculate the response variable for each category.

        The encoder is fitted on the training data only and then applied to the
        test data.

        Parameters
        ----------
        df_train : pd.DataFrame
            Dataframe with training data.
        df_test : pd.DataFrame
            Dataframe with testing data.
        columns : list
            Categorical columns to encode.
        sig : float
            Passed to the encoder as ``sigma`` (random noise added to the response variable).

        Returns
        -------
        tuple
            df_encode_train : pd.DataFrame
                Encoded training data with the ``return`` column joined back on.
            df_encode_test : pd.DataFrame
                Encoded testing data with the ``return`` column joined back on.
            encoder : LeaveOneOutEncoder
                The fitted encoder.
        """
        encoder = LeaveOneOutEncoder(cols=columns, return_df=True,sigma=sig)
        df_encode_train = encoder.fit_transform(df_train.drop([self._TARGET],axis=1),df_train[[self._TARGET]])
        df_encode_test = encoder.transform(df_test.drop([self._TARGET],axis=1))
        # The target was dropped before encoding; join it back on the index.
        df_encode_train = df_encode_train.join(df_train[[self._TARGET]])
        df_encode_test = df_encode_test.join(df_test[[self._TARGET]])
        return df_encode_train, df_encode_test, encoder

    def neural_network(self, df_train:pd.DataFrame, df_test:pd.DataFrame, n_layers:int, n_nodes:int, dropout:list, activation:str, optimizer:str, loss:str, metrics:list, epochs:int, batch_size:int,verbose:int)->tuple:
        """
        Neural network model to predict whether an item will be returned or not.

        Parameters
        ----------
        df_train : pd.DataFrame
            Dataframe with training data.
        df_test : pd.DataFrame
            Dataframe with testing data.
        n_layers : int
            Number of hidden layers in the neural network
        n_nodes : int
            Number of nodes in each hidden layer
        dropout : list
            List of dropout rates, one per hidden layer (needs at least ``n_layers`` entries)
        activation : str
            Activation function for each layer except the last one
        optimizer : str
            Optimizer for the neural network
        loss : str
            Loss function for the neural network
        metrics : list
            List of metrics for the neural network
        epochs : int
            Number of epochs for the neural network
        batch_size : int
            Size of the batch
        verbose : int
            Whether to print the progress of the neural network

        Returns
        -------
        model: object
            The Neural network model
        Y_pred: array
            Output of ``model.predict`` on the scaled test features
        mae: float
            Mean absolute error on the testing set
        """
        X_train, Y_train = self._split_target(df_train)
        X_test, Y_test = self._split_target(df_test)

        # Fit the scaler on the training features only and reuse it for the test
        # features, so both sets are scaled with the same statistics.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = Sequential()
        for i in range(n_layers):
            if i == 0:
                # The input width is the number of feature columns, not the width
                # of the raw frame stored on the instance.
                model.add(Dense(n_nodes, input_dim=X_train.shape[1], activation=activation))
            else:
                model.add(Dense(n_nodes, activation=activation))
            model.add(Dropout(dropout[i]))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        # Train on the scaled training features and their targets.
        model.fit(X_train, Y_train.to_numpy(), epochs=epochs, batch_size=batch_size,verbose=verbose)
        Y_pred = model.predict(X_test)
        mae = mean_absolute_error(Y_test, Y_pred)
        return model, Y_pred, mae

    def xgboost(self, df_train:pd.DataFrame, df_test:pd.DataFrame, params:dict, verbose:int)->tuple:
        """
        Fit an ``XGBClassifier`` and evaluate it on the test data.

        Parameters
        ----------
        df_train : pd.DataFrame
            Dataframe with training data (including the ``return`` column).
        df_test : pd.DataFrame
            Dataframe with testing data (including the ``return`` column).
        params : dict
            Keyword arguments for the ``XGBClassifier`` constructor.
        verbose : int
            Forwarded to ``fit``.

        Returns
        -------
        tuple
            The fitted model, the output of ``model.predict`` on the test features
            and the mean absolute error between test targets and predictions.
        """
        # NOTE(review): `predict` presumably yields class labels here, which would
        # make the MAE a misclassification rate — confirm this is intended.
        return self._fit_and_score(XGBClassifier(**params), df_train, df_test, verbose)

    def catboost(self, df_train:pd.DataFrame, df_test:pd.DataFrame, params:dict, verbose:int)->tuple:
        """
        Fit a ``CatBoostClassifier`` and evaluate it on the test data.

        Parameters
        ----------
        df_train : pd.DataFrame
            Dataframe with training data (including the ``return`` column).
        df_test : pd.DataFrame
            Dataframe with testing data (including the ``return`` column).
        params : dict
            Keyword arguments for the ``CatBoostClassifier`` constructor.
        verbose : int
            Forwarded to ``fit``.

        Returns
        -------
        tuple
            The fitted model, the output of ``model.predict`` on the test features
            and the mean absolute error between test targets and predictions.
        """
        return self._fit_and_score(CatBoostClassifier(**params), df_train, df_test, verbose)

    def lightgmb(self, df_train:pd.DataFrame, df_test:pd.DataFrame, params:dict, verbose:int)->tuple:
        """
        Fit an ``LGBMClassifier`` and evaluate it on the test data.

        Parameters
        ----------
        df_train : pd.DataFrame
            Dataframe with training data (including the ``return`` column).
        df_test : pd.DataFrame
            Dataframe with testing data (including the ``return`` column).
        params : dict
            Keyword arguments for the ``LGBMClassifier`` constructor.
        verbose : int
            Forwarded to ``fit``.

        Returns
        -------
        tuple
            The fitted model, the output of ``model.predict`` on the test features
            and the mean absolute error between test targets and predictions.
        """
        # NOTE(review): newer LightGBM releases appear to have dropped the `verbose`
        # argument of `fit` — confirm against the installed version.
        return self._fit_and_score(LGBMClassifier(**params), df_train, df_test, verbose)
    