In [2]:
import warnings
warnings.filterwarnings('ignore')
from category_encoders import LeaveOneOutEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(palette = "Dark2")
my_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
  (0.8509803921568627, 0.37254901960784315, 0.00784313725490196)]
pd.set_option('display.max_columns', None)
from itertools import chain, combinations

In [3]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import tensorflow as tensorflow
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

### 5 **Leave-one-out Encoding**
in order not to split the data into three different sets, we will use a leave-one-out encoding. This type of encoding calculates the target response mean for each variable **excluding** the current observation to reduce the effect of outliers. It further adds random noise to the target response to avoid overfitting.

In [10]:
df = pd.read_csv("df_processed.csv")
df

Unnamed: 0,order_item_id,order_date,delivery_date,item_id,size,item_color,brand_id,item_price,user_id,user_title,user_dob,user_state,user_reg_date,return,delivery_time,order_id,user_age,user_reg_age,order_weekday,delivery_weekday,order_month,delivery_month,order_day,delivery_day,order_week,delivery_week,average_item_price_order,order_item_count,order_sum,order_number_same_item_id,order_number_different_item_id,order_number_same_size,order_number_different_size,order_number_same_item_color,order_number_different_item_color,order_number_same_brand_id,order_number_different_brand_id,order_number_same_item_id_size,order_number_different_item_id_size,order_number_same_item_id_item_color,order_number_different_item_id_item_color,order_number_same_size_item_color,order_number_different_size_item_color,order_number_same_size_brand_id,order_number_different_size_brand_id,order_number_same_item_color_brand_id,order_number_different_item_color_brand_id,order_number_same_item_id_size_item_color,order_number_different_item_id_size_item_color,order_number_same_size_item_color_brand_id,order_number_different_size_item_color_brand_id
0,1,2012-04-01,2012-04-03,186,s,denim,25,69.90,794,Mrs,1965-01-06,Bad-Wue,2011-04-25,0.0,2,2012-04-01_794,47,342,6,1,4,4,1,3,13,14,69.93,3,209.8,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3
1,2,2012-04-01,2012-04-03,71,unsized,ocher,21,69.95,794,Mrs,1965-01-06,Bad-Wue,2011-04-25,1.0,2,2012-04-01_794,47,342,6,1,4,4,1,3,13,14,69.93,3,209.8,1,2,1,2,0,3,1,2,1,2,0,3,0,3,1,2,0,3,0,3,0,3
2,3,2012-04-01,2012-04-03,71,unsized,curry,21,69.95,794,Mrs,1965-01-06,Bad-Wue,2011-04-25,1.0,2,2012-04-01_794,47,342,6,1,4,4,1,3,13,14,69.93,3,209.8,1,2,1,2,0,3,1,2,1,2,0,3,0,3,1,2,0,3,0,3,0,3
3,4,2012-04-02,2012-04-06,22,s,green,14,39.90,808,Mrs,1959-11-09,Saxony,2012-01-04,0.0,4,2012-04-02_808,52,89,0,4,4,4,2,6,14,14,39.90,1,39.9,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
4,5,2012-04-02,2012-04-06,151,s,black,53,29.90,825,Mrs,1964-07-11,S-Holstein,2011-02-16,0.0,4,2012-04-02_825,48,411,0,4,4,4,2,6,14,14,83.23,3,249.7,0,3,1,2,1,2,0,3,0,3,0,3,1,2,0,3,0,3,0,3,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531165,50074,2013-04-29,2013-05-03,2342,s,terracotta,5,69.90,91920,Mrs,1962-03-08,Bayern,2013-04-29,,4,2013-04-29_91920,51,0,0,4,4,5,29,3,18,18,83.13,9,748.2,0,9,4,5,2,7,3,6,0,9,0,9,2,7,3,6,2,7,0,9,2,7
531166,50075,2013-04-29,2013-05-03,2505,s,terracotta,5,64.90,91920,Mrs,1962-03-08,Bayern,2013-04-29,,4,2013-04-29_91920,51,0,0,4,4,5,29,3,18,18,83.13,9,748.2,1,8,4,5,2,7,3,6,1,8,1,8,2,7,3,6,2,7,1,8,2,7
531167,50076,2013-04-28,2013-05-02,2470,l,white,5,79.90,85095,Mrs,1950-02-14,Berlin,2013-03-24,,4,2013-04-28_85095,63,35,6,3,4,5,28,2,17,18,79.90,1,79.9,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
531168,50077,2013-04-28,2013-05-02,2452,m,white,5,59.90,91922,Mrs,1969-11-27,Bburg,2013-04-28,,4,2013-04-28_91922,43,0,6,3,4,5,28,2,17,18,59.90,2,119.8,1,1,1,1,0,2,1,1,1,1,0,2,0,2,1,1,0,2,0,2,0,2


In [31]:
low_cardinality_cat_cols = ["size","item_color","user_title","user_state"]
high_cardinality_cat_cols = ["item_id","brand_id","user_id"]
cat_cols = low_cardinality_cat_cols + high_cardinality_cat_cols

In [11]:
# size of the known dataset
k = max(df["order_item_id"])-1
# Drop unnecessary columns
columns_to_drop = ["order_date", "delivery_date", "user_dob", "user_reg_date", "order_id","order_item_id"]
df.drop(columns_to_drop, axis=1, inplace=True)
# Get the validation set
df_valid = df.iloc[k:, :]
# df_train,df_test = df.loc[:k][df.loc[:k,"order_month"] != 3],df.loc[:k][df.loc[:k,"order_month"] == 3]

Since we are no longer using one-hot encoding, we will rework our models not to include the one-hot encoding.

In [12]:
# Xgboost classifier to train the model and use ROC as the metric
def xgboost_classifier(df_train, df_test):
    X_train, Y_train = df_train.drop(["return"],axis=1), df_train["return"]
    X_test, Y_test = df_test.drop(["return"],axis=1), df_test["return"]
    
    model = XGBClassifier(max_depth=7, n_estimators=50, learning_rate=0.1,
                            n_jobs=-1, random_state=42)
    model.fit(X_train, Y_train, eval_metric='auc', eval_set=[(X_test, Y_test)],early_stopping_rounds = 20,verbose = 0)
    Y_pred = model.predict(X_test)
    auc = roc_auc_score(Y_test, Y_pred)
    print("Xgboost AUC on test set:", auc)
    return model,auc

In [13]:
# Catboost classifier to train the model and use ROC as the metric
def catboost_classifier(df_train, df_test,cat_cols):
    X_train, Y_train = df_train.drop(["return"],axis=1), df_train["return"]
    X_test, Y_test = df_test.drop(["return"],axis=1), df_test["return"]
    model = CatBoostClassifier(iterations=50, learning_rate=0.1, depth=10,
                            loss_function='Logloss', eval_metric='AUC',
                            random_seed=42)
    pool_train = Pool(X_train, Y_train,cat_features = cat_cols)

    pool_test = Pool(X_test, Y_test,cat_features = cat_cols)
    

    
    model.fit(pool_train, eval_set=pool_test, use_best_model=True, verbose=False)
    Y_pred = model.predict(pool_test)
    auc = roc_auc_score(Y_test, Y_pred)
    print("Catboost AUC on test set:", auc)
    return model,auc

In [39]:
# LightGBM
def lightgbm(df_train,df_test):
    X_train, Y_train = df_train.drop(["return"],axis=1), df_train["return"]
    X_test, Y_test = df_test.drop(["return"],axis=1), df_test["return"]
    
    model = LGBMClassifier(boosting_type='gbdt')
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    auc = roc_auc_score(Y_test, Y_pred)
    print("LGBM AUC on test set: ", auc)
    return model,auc

In [38]:
def neural_network(df_train,df_test,epochs):
    X_train, Y_train = df_train.drop(["return"],axis=1), df_train["return"]
    X_test, Y_test = df_test.drop(["return"],axis=1), df_test["return"]
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)
    
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dense(units=64, activation='relu'))
    model.add(Dense(units=1, activation='sigmoid'))
    auc = tensorflow.keras.metrics.AUC()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])
    
    results = model.fit(X_train, Y_train, epochs=epochs, batch_size=256, verbose=1,validation_data=(X_test, Y_test))
    Y_pred = model.predict(X_test)
    auc_1 = roc_auc_score(Y_test, Y_pred)
    print("ROC on test set:", auc_1)
    return model,auc_1, results.history[auc.name], results.history['val_'+auc.name]

In [30]:
# Get list of the months to train, remove months on which to test
months_to_train = list(range(1,13))
months_to_train.remove(3)
# Split into train and test. "~" in front of a variable means "not"
df_train = df.loc[:k][df.loc[:k,"order_month"].isin(months_to_train)]
df_test = df.loc[:k][~df.loc[:k,"order_month"].isin(months_to_train)]

In [32]:
def encode_categorical_columns(df_train,df_test,columns,sig):
    encoder = LeaveOneOutEncoder(cols=columns, return_df=True,sigma=sig, verbose=True)
    df_encode_train = encoder.fit_transform(df_train.drop(["return"],axis=1),df_train[["return"]])
    df_encode_test = encoder.transform(df_test.drop(["return"],axis=1))
    df_encode_train , df_encode_test = df_encode_train.join(df_train[["return"]]), df_encode_test.join(df_test[["return"]])
    return df_encode_train, df_encode_test

In [33]:
df_train, df_test = encode_categorical_columns(df_train,df_test,cat_cols,0.1)

In [35]:
df_train

Unnamed: 0,item_id,size,item_color,brand_id,item_price,user_id,user_title,user_state,delivery_time,user_age,user_reg_age,order_weekday,delivery_weekday,order_month,delivery_month,order_day,delivery_day,order_week,delivery_week,average_item_price_order,order_item_count,order_sum,order_number_same_item_id,order_number_different_item_id,order_number_same_size,order_number_different_size,order_number_same_item_color,order_number_different_item_color,order_number_same_brand_id,order_number_different_brand_id,order_number_same_item_id_size,order_number_different_item_id_size,order_number_same_item_id_item_color,order_number_different_item_id_item_color,order_number_same_size_item_color,order_number_different_size_item_color,order_number_same_size_brand_id,order_number_different_size_brand_id,order_number_same_item_color_brand_id,order_number_different_item_color_brand_id,order_number_same_item_id_size_item_color,order_number_different_item_id_size_item_color,order_number_same_size_item_color_brand_id,order_number_different_size_item_color_brand_id,return
0,0.607386,0.409204,0.553287,0.612820,69.90,0.582585,0.492080,0.423056,2,47,342,6,1,4,4,1,3,13,14,69.93,3,209.80,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0.0
1,0.585970,0.452830,0.446604,0.557096,69.95,0.294910,0.442463,0.452489,2,47,342,6,1,4,4,1,3,13,14,69.93,3,209.80,1,2,1,2,0,3,1,2,1,2,0,3,0,3,1,2,0,3,0,3,0,3,1.0
2,0.660359,0.479879,0.485496,0.496581,69.95,0.353396,0.461844,0.506787,2,47,342,6,1,4,4,1,3,13,14,69.93,3,209.80,1,2,1,2,0,3,1,2,1,2,0,3,0,3,1,2,0,3,0,3,0,3,1.0
3,0.420043,0.549688,0.529974,0.485950,39.90,0.531543,0.483000,0.483295,4,52,89,0,4,4,4,2,6,14,14,39.90,1,39.90,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0.0
4,0.561362,0.569386,0.480052,0.502539,29.90,0.000000,0.524143,0.469232,4,48,411,0,4,4,4,2,6,14,14,83.23,3,249.70,0,3,1,2,1,2,0,3,0,3,0,3,1,2,0,3,0,3,0,3,0,3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412004,0.493675,0.476695,0.454651,0.480198,69.95,0.837352,0.428631,0.479597,5,32,0,3,1,2,3,28,5,9,10,41.91,5,209.55,0,5,0,5,2,3,0,5,0,5,0,5,0,5,0,5,0,5,0,5,0,5,1.0
412005,0.722676,0.558552,0.503153,0.401486,34.90,0.993669,0.498055,0.532704,5,32,0,3,1,2,3,28,5,9,10,41.91,5,209.55,3,2,3,2,2,3,3,2,3,2,1,4,1,4,3,2,1,4,1,4,1,4,1.0
412006,0.584514,0.526187,0.527930,0.485159,34.90,1.067276,0.405497,0.505760,5,32,0,3,1,2,3,28,5,9,10,41.91,5,209.55,3,2,3,2,2,3,3,2,3,2,1,4,1,4,3,2,1,4,1,4,1,4,1.0
412007,0.582378,0.542497,0.581684,0.539183,34.90,0.908470,0.428467,0.540555,5,32,0,3,1,2,3,28,5,9,10,41.91,5,209.55,3,2,3,2,1,4,3,2,3,2,1,4,1,4,3,2,1,4,1,4,1,4,1.0


In [36]:
catboost = catboost_classifier(df_train, df_test,[])

Catboost AUC on test set: 0.6637482553777858


(<catboost.core.CatBoostClassifier at 0x2b891bf9970>, 0.6637482553777858)

In [40]:
neural_network = neural_network(df_train,df_test,20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
ROC on test set: 0.7141659392463714
