In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from itertools import chain, combinations
from sklearn.preprocessing import LabelEncoder

In [33]:
import importlib
import Model_functions
importlib.reload(Model_functions)
from Model_functions import Model_class 

In [3]:
# Load the input table; the path is relative, so the CSV must be in the working directory.
df = pd.read_csv("df_127_columns.csv")

In [3]:
# Categorical feature groups, split by how many distinct values they hold.
low_cardinality_cat_cols = ["size", "item_color", "user_title", "user_state"]
high_cardinality_cat_cols = ["item_id", "brand_id", "user_id"]

# All categorical columns: both groups above plus the four "mode_*" columns.
cat_cols = [
    *low_cardinality_cat_cols,
    *high_cardinality_cat_cols,
    "mode_item_id",
    "mode_size",
    "mode_brand_id",
    "mode_item_color",
]

# Columns that are label-encoded later on to shrink the frame's memory footprint.
reduce_memory = [*low_cardinality_cat_cols, "mode_size", "mode_item_color"]

In [6]:
# Summarise dtypes and memory; "deep" also measures the contents of object columns.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531170 entries, 0 to 531169
Columns: 127 entries, order_item_id to delivery_time_user_id_mad
dtypes: float64(54), int64(62), object(11)
memory usage: 825.7 MB


Quite a few columns are stored as `object` dtype, which uses more memory than necessary. We convert them to more appropriate dtypes to save memory, because the extensive feature engineering below substantially increases the amount of RAM needed to hold the dataset.


In [6]:
# order_id becomes a pandas categorical; the four date columns are parsed from
# YYYY-MM-DD strings into datetime values.
df["order_id"] = df["order_id"].astype("category")
for date_col in ("order_date", "delivery_date", "user_reg_date", "user_dob"):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")

# Label-encode the columns listed in `reduce_memory`: integer codes replace the
# original values, which changes the column type to int and reduces memory.
# NOTE(review): the encoder is refit for every column, so e.g. "size" and
# "mode_size" do not share one code mapping - confirm this is acceptable downstream.
LE = LabelEncoder()
for cat_col in reduce_memory:
    df[cat_col] = LE.fit_transform(df[cat_col])

In [8]:
# Same dtype/memory report as before, to compare against the pre-conversion figures.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531170 entries, 0 to 531169
Columns: 127 entries, order_item_id to delivery_time_user_id_mad
dtypes: category(1), datetime64[ns](4), float64(54), int32(6), int64(62)
memory usage: 515.1 MB


As the output above shows, the memory usage of the dataframe dropped from 825.7 MB to 515.1 MB, a reduction of about **37.6%**.

In [9]:
# How to shift the date
# df_group[['A', 'B', 'C']] = df_gr.groupby('Name')['A', 'B', 'C'].shift(1)

In the next steps we derive new time-based features from the order and delivery dates, in the hope that they are useful for predicting the customer's return rate. We start with the number of days since each customer's previous order. Our hypothesis is that a customer who places two orders within a short period of time is more likely to return an item.

In [10]:
# Days between each of a user's distinct order dates and that user's previous order date.
# The first order of every user has no predecessor, so it stays NaT / NaN here.
df_group = (
    df[["order_date", "user_id"]]
    .drop_duplicates()
    .sort_values(by=["user_id", "order_date"])
    .assign(
        order_date_shifted=lambda d: d.groupby("user_id")["order_date"].shift(1),
        days_since_last_order=lambda d: (d["order_date"] - d["order_date_shifted"]).dt.days,
    )
)
df_group

Unnamed: 0,order_date,user_id,order_date_shifted,days_since_last_order
24723,2012-04-19,6,NaT,
31011,2012-04-25,6,2012-04-19,6.0
226242,2012-09-23,6,2012-04-25,151.0
240276,2012-10-06,6,2012-09-23,13.0
331324,2013-01-11,6,2012-10-06,97.0
...,...,...,...,...
531141,2013-04-29,91911,NaT,
531149,2013-04-27,91912,NaT,
531154,2013-04-29,91915,NaT,
531158,2013-04-29,91920,NaT,


In [11]:
# Get statistics for days since last order
def mad(values):
    """Mean absolute deviation around the mean, ignoring NaN.

    Stand-in for the "mad" aggregation string, which pandas removed in 2.0.
    Because the function is called `mad`, the aggregated column keeps the
    "..._mad" name it had before.
    """
    return (values - values.mean()).abs().mean()

cols_to_agg = ["user_id"]
stats_to_agg = ["mean","median","max","min","std",mad,"skew"]
# Loop through the columns to aggregate
for i in cols_to_agg:
    df_group_group = df_group[[i,"days_since_last_order"]].groupby([i]).agg(stats_to_agg).reset_index().round(2)
    # Flatten the (column, statistic) labels to "<column>_<key>_<statistic>";
    # the key column itself comes out as "<key>_<key>_" and is renamed back.
    df_group_group.columns = [str("_" + i +"_").join(col) for col in df_group_group.columns]
    df_group_group.rename(columns= {i+"_"+i+"_":i},inplace=True)
    df_group = df_group.merge(df_group_group,on=i,how="left")
# Drop the helper column first, then fill only the feature columns so that the
# merge keys (user_id, order_date) are never altered by the fill.
df_group = df_group.drop(columns=["order_date_shifted"])
feature_cols = df_group.columns.difference(["user_id","order_date"])
df_group[feature_cols] = df_group[feature_cols].fillna(0)
df = df.merge(df_group,on=["user_id","order_date"],how="left")

In [20]:
# Item-describing columns whose combinations drive the interaction features
item_descriptions = ["item_id","size","item_color","brand_id"]
def powerset(iterable, r1=1, r2=None):
    """Chain together all combinations of `iterable` with sizes r1 .. r2-1.

    With the defaults the empty combination is skipped:
    powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    Parameters
    ----------
    iterable : any iterable; it is materialised into a list first.
    r1 : smallest combination size (inclusive), default 1.
    r2 : largest combination size (exclusive); None means len(iterable) + 1,
         i.e. up to and including the full set.

    Returns
    -------
    A lazy iterator of tuples, ordered by size and then in
    itertools.combinations order.
    """
    s = list(iterable)
    if r2 is None:
        r2 = len(s) + 1
    return chain.from_iterable(combinations(s, r) for r in range(r1, r2))

Furthermore, we derive the days since each customer's last order of the same product, of the same product variation (such as size and color), and of the same brand, as well as of their combinations.

In [13]:
categorical_powerset = powerset(item_descriptions)
# For each combination of item descriptions, add a column with the days since
# the user's previous order date that shares the same values for that combination.
# Combinations containing both item_id and brand_id are skipped as redundant.
exclude = ["item_id","brand_id"]
for column in categorical_powerset:
    current = list(column)
    if all(x in current for x in exclude):
        continue
    print(current)
    keys = ["user_id","order_date"] + current
    feature = "days_since_last_order_same" + "_" + "_".join(current)
    df_group = df[["order_date","user_id"]+current].drop_duplicates().sort_values(by=["user_id","order_date"])
    previous_order_date = df_group.groupby(["user_id"] + current)["order_date"].shift(1)
    # Fill only the new feature (first occurrence -> 0); the key columns and
    # the datetime values are left untouched.
    df_group[feature] = (df_group["order_date"] - previous_order_date).dt.days.fillna(0)
    df = df.merge(df_group,on=keys,how="left")

['item_id']
['size']
['item_color']
['brand_id']
['item_id', 'size']
['item_id', 'item_color']
['size', 'item_color']
['size', 'brand_id']
['item_color', 'brand_id']
['item_id', 'size', 'item_color']
['size', 'item_color', 'brand_id']


In [14]:
# Days between each of a user's distinct delivery dates and that user's previous delivery date.
# Rows without an earlier delivery keep NaT / NaN; nothing is filled in this cell.
df_group = (
    df[["delivery_date", "user_id"]]
    .drop_duplicates()
    .sort_values(by=["user_id", "delivery_date"])
    .assign(
        delivery_date_shifted=lambda d: d.groupby("user_id")["delivery_date"].shift(1),
        days_since_last_delivery=lambda d: (d["delivery_date"] - d["delivery_date_shifted"]).dt.days,
    )
)
df_group

Unnamed: 0,delivery_date,user_id,delivery_date_shifted,days_since_last_delivery
24723,2012-04-24,6,NaT,
31011,2012-04-26,6,2012-04-24,2.0
31013,2012-05-07,6,2012-04-26,11.0
226242,2012-09-27,6,2012-05-07,143.0
240281,2012-10-10,6,2012-09-27,13.0
...,...,...,...,...
531146,2013-05-20,91911,2013-05-03,17.0
531149,2013-06-10,91912,NaT,
531154,2013-05-03,91915,NaT,
531158,2013-05-03,91920,NaT,


Repeat the same procedure for the delivery dates.

In [15]:
# Get statistics for days since last delivery
def mad(values):
    """Mean absolute deviation around the mean, ignoring NaN.

    Stand-in for the "mad" aggregation string, which pandas removed in 2.0.
    Because the function is called `mad`, the aggregated column keeps the
    "..._mad" name it had before.
    """
    return (values - values.mean()).abs().mean()

cols_to_agg = ["user_id"]
stats_to_agg = ["mean","median","max","min","std",mad,"skew"]
# Loop through the columns to aggregate
for i in cols_to_agg:
    df_group_group = df_group[[i,"days_since_last_delivery"]].groupby([i]).agg(stats_to_agg).reset_index().round(2)
    # Flatten the (column, statistic) labels to "<column>_<key>_<statistic>";
    # the key column itself comes out as "<key>_<key>_" and is renamed back.
    df_group_group.columns = [str("_" + i +"_").join(col) for col in df_group_group.columns]
    df_group_group.rename(columns= {i+"_"+i+"_":i},inplace=True)
    df_group = df_group.merge(df_group_group,on=i,how="left")
# Drop the helper column first, then fill only the feature columns so that the
# merge keys (user_id, delivery_date) are never altered by the fill.
df_group = df_group.drop(columns=["delivery_date_shifted"])
feature_cols = df_group.columns.difference(["user_id","delivery_date"])
df_group[feature_cols] = df_group[feature_cols].fillna(0)
df = df.merge(df_group,on=["user_id","delivery_date"],how="left")

In [17]:
categorical_powerset = powerset(item_descriptions)
# For each combination of item descriptions, add a column with the days since
# the user's previous delivery date that shares the same values for that combination.
# Combinations containing both item_id and brand_id are skipped as redundant.
exclude = ["item_id","brand_id"]
for column in categorical_powerset:
    current = list(column)
    if all(x in current for x in exclude):
        continue
    print(current)
    keys = ["user_id","delivery_date"] + current
    feature = "days_since_last_delivery_same" + "_" + "_".join(current)
    df_group = df[["delivery_date","user_id"]+current].drop_duplicates().sort_values(by=["user_id","delivery_date"])
    previous_delivery_date = df_group.groupby(["user_id"] + current)["delivery_date"].shift(1)
    # Fill only the new feature (first occurrence -> 0); the key columns and
    # the datetime values are left untouched.
    df_group[feature] = (df_group["delivery_date"] - previous_delivery_date).dt.days.fillna(0)
    df = df.merge(df_group,on=keys,how="left")

['item_id']
['size']
['item_color']
['brand_id']
['item_id', 'size']
['item_id', 'item_color']
['size', 'item_color']
['size', 'brand_id']
['item_color', 'brand_id']
['item_id', 'size', 'item_color']
['size', 'item_color', 'brand_id']


To get the summary statistics (mean, max, min, ...) for the interactions between the numeric variable **delivery time** and other numeric variables such as **order_sum** and **item_price**, we first bin these variables and then calculate the summary statistics of the delivery time within each bin.

In [19]:
# Turn order_sum and item_price into ordinal bin codes. The bin edges are the
# column's min, 10th, 25th, 50th and 75th percentiles, an upper percentile
# (90th for order_sum, 95th for item_price) and the max.
for source_col, upper_pct, upper_label in (
    ("order_sum", 0.9, "90%"),
    ("item_price", 0.95, "95%"),
):
    bins_range = df[source_col].describe(percentiles=[0.1, 0.25, 0.5, 0.75, upper_pct])
    bins = bins_range[["min", "10%", "25%", "50%", "75%", upper_label, "max"]].values
    binned = pd.cut(df[source_col], bins=bins, include_lowest=True)
    df[source_col + "_bins"] = binned.cat.codes

In [20]:
# Age brackets based on the life stages of the customer; each row stores the
# ordinal code of the bracket its user_age falls into.
bins = [20, 24, 29, 34, 44, 55, 65, 85]
age_intervals = pd.cut(df["user_age"], bins=bins, include_lowest=True)
df["age_bins"] = age_intervals.cat.codes

In [21]:
# Hand-picked bin edges for user_reg_age; each row stores the ordinal code of its bin.
# (The earlier describe() call was dropped: its result was neither displayed nor used.)
# NOTE(review): values outside [-1, 805] fall in no bin - confirm 805 covers the data.
bins = [-1,0,0.1,7,14,30,60,90,180,365,730,805]
df['reg_age_bins'] = pd.cut(df['user_reg_age'], bins=bins, include_lowest=True).cat.codes

In [22]:
# Names of the bin-code columns; used as grouping keys for the statistics and dropped afterwards.
bins_column_list = ["order_sum_bins","item_price_bins","age_bins","reg_age_bins"]

In [23]:
# Get the interaction between the bins, the order/delivery weekday and the delivery time
def mad(values):
    """Mean absolute deviation around the mean, ignoring NaN.

    Stand-in for the "mad" aggregation string, which pandas removed in 2.0.
    Because the function is called `mad`, the aggregated column keeps the
    "..._mad" name it had before.
    """
    return (values - values.mean()).abs().mean()

cols_to_agg = bins_column_list + ["order_weekday", "delivery_weekday"]
stats_to_agg = ["mean","std","min","max","median",mad,"skew"]
# Loop through the columns to aggregate
for i in cols_to_agg:
    df_group = df[[i,"delivery_time"]].groupby([i]).agg(stats_to_agg).reset_index().round(2)
    # Flatten the (column, statistic) labels to "delivery_time_<key>_<statistic>";
    # the key column itself comes out as "<key>_<key>_" and is renamed back.
    df_group.columns = [str("_" + i +"_").join(col) for col in df_group.columns]
    df_group.rename(columns= {i+"_"+i+"_":i},inplace=True)
    df_group.fillna(0,inplace=True)
    df = df.merge(df_group,on=i,how="left")

In [24]:
# Remove the helper bin-code columns in place; only the statistics merged on them are kept.
df.drop(bins_column_list,axis=1,inplace=True)

In [7]:
def powerset(iterable, r1=1, r2=None):
    """Chain together all combinations of `iterable` with sizes r1 .. r2-1.

    powerset([1,2,3], 2, 3) --> (1,2) (1,3) (2,3)
    powerset([1,2,3])       --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    The defaults keep one-argument calls working, so cells written against
    the earlier single-parameter definition still run after this one.

    Parameters
    ----------
    iterable : any iterable; it is materialised into a list first.
    r1 : smallest combination size (inclusive), default 1.
    r2 : largest combination size (exclusive); None means len(iterable) + 1,
         i.e. up to and including the full set.

    Returns
    -------
    A lazy iterator of tuples, ordered by size and then in
    itertools.combinations order.
    """
    s = list(iterable)
    if r2 is None:
        r2 = len(s) + 1
    return chain.from_iterable(combinations(s, r) for r in range(r1, r2))

Also add the interaction between the **delivery time** and the categorical variables (including the user) by capturing the summary statistics of the delivery time for each pairwise combination of categories.

In [None]:
# Interactions between delivery time and pairs of categorical variables
def mad(values):
    """Mean absolute deviation around the mean, ignoring NaN.

    Stand-in for the "mad" aggregation string, which pandas removed in 2.0.
    Because the function is called `mad`, the aggregated column keeps the
    "..._mad" name it had before.
    """
    return (values - values.mean()).abs().mean()

interactions_list = ["item_id","brand_id","item_color","size","delivery_weekday","user_id"]
categorical_powerset = powerset(interactions_list,2,3)
# Summary statistics of delivery_time computed within every key combination
stats_to_agg = ["mean","std","min","max","median",mad,"skew"]
# exclude powerset elements containing both item_id and brand_id as they are redundant
exclude = ["item_id","brand_id"]
for column in categorical_powerset:
    i = list(column)
    if all(x in i for x in exclude):
        continue
    print(i)
    df_group = df[i+["delivery_time"]].groupby(i).agg(stats_to_agg).reset_index().round(2)
    # Flatten the (column, statistic) labels to "delivery_time_<keys>_<statistic>";
    # the leading key columns pick up the infix too and are renamed back.
    infix = "_" + "_".join(i) + "_"
    df_group.columns = [infix.join(col) for col in df_group.columns]
    df_group.rename(columns=dict(zip(df_group.columns[:len(i)], i)),inplace=True)
    df_group.fillna(0,inplace=True)
    df = df.merge(df_group,on=i,how="left")

Do the same for the three-way interactions between the categorical variables and the **delivery time**.

In [78]:
# Interactions between delivery time and triples of categorical variables
def mad(values):
    """Mean absolute deviation around the mean, ignoring NaN.

    Stand-in for the "mad" aggregation string, which pandas removed in 2.0.
    Because the function is called `mad`, the aggregated column keeps the
    "..._mad" name it had before.
    """
    return (values - values.mean()).abs().mean()

interactions_list = ["item_id","brand_id","item_color","size","delivery_weekday","user_id"]
categorical_powerset = powerset(interactions_list,3,4)
# Summary statistics of delivery_time computed within every key combination
stats_to_agg = ["mean","std","min","max","median",mad,"skew"]
# exclude powerset elements containing both item_id and brand_id as they are redundant
exclude = ["item_id","brand_id"]
for column in categorical_powerset:
    i = list(column)
    if all(x in i for x in exclude):
        continue
    print(i)
    df_group = df[i+["delivery_time"]].groupby(i).agg(stats_to_agg).reset_index().round(2)
    # Flatten the (column, statistic) labels to "delivery_time_<keys>_<statistic>";
    # the leading key columns pick up the infix too and are renamed back.
    infix = "_" + "_".join(i) + "_"
    df_group.columns = [infix.join(col) for col in df_group.columns]
    df_group.rename(columns=dict(zip(df_group.columns[:len(i)], i)),inplace=True)
    df_group.fillna(0,inplace=True)
    df = df.merge(df_group,on=i,how="left")

['item_id', 'item_color', 'size']
['item_id', 'item_color', 'delivery_weekday']
['item_id', 'item_color', 'user_id']
['item_id', 'size', 'delivery_weekday']
['item_id', 'size', 'user_id']
['item_id', 'delivery_weekday', 'user_id']
['brand_id', 'item_color', 'size']
['brand_id', 'item_color', 'delivery_weekday']
['brand_id', 'item_color', 'user_id']
['brand_id', 'size', 'delivery_weekday']
['brand_id', 'size', 'user_id']
['brand_id', 'delivery_weekday', 'user_id']
['item_color', 'size', 'delivery_weekday']
['item_color', 'size', 'user_id']
['item_color', 'delivery_weekday', 'user_id']
['size', 'delivery_weekday', 'user_id']


In [12]:
# Replace the in-memory frame with a table read from disk.
# NOTE(review): presumably a saved checkpoint of the features built above; no visible cell
# writes this file, and the datetime/category dtypes set earlier are not restored by read_csv - confirm.
df = pd.read_csv("df_last_X3.csv")

Calculate the total item count and the cumulative sum of items ordered by each customer.

In [13]:
# Number of order rows per user, attached to every row of that user.
# count() tallies non-null user_state values, so user_state only serves as a row counter here.
df_group = (
    df.groupby("user_id", as_index=False)["user_state"]
    .count()
    .rename(columns={"user_state": "user_id_total_items_ordered"})
)
# An index-based join is used instead of merge because it is faster here.
df = df.join(df_group.set_index("user_id"), on="user_id", how="left")

In [14]:
# Running total of items per user, accumulated over that user's order days
# (one row per user and order_date, in sorted key order).
df_group = df.groupby(["user_id", "order_date"], as_index=False)["user_state"].count()
df_group["user_id_total_items_ordered_cumsum"] = df_group.groupby("user_id")["user_state"].cumsum()
# The per-day count was only needed to build the running total.
df_group = df_group.drop(columns=["user_state"])
df = df.join(
    df_group.set_index(["user_id", "order_date"]),
    on=["user_id", "order_date"],
    how="left",
)

In [15]:
# Number of order rows per user for every combination of item attributes
interactions_list = ["item_id","brand_id","item_color","size"]
categorical_powerset = powerset(interactions_list,1,len(interactions_list)+1)
# Combinations holding both item_id and brand_id are skipped.
exclude = ["item_id","brand_id"]
for column in categorical_powerset:
    i = list(column)
    if all(x in i for x in exclude):
        continue
    group_keys = ["user_id"] + i
    feature_name = "user_id_" + "_".join(i) + "_total_items_ordered"
    df_group = (
        df.groupby(group_keys, as_index=False)["user_state"]
        .count()
        .rename(columns={"user_state": feature_name})
    )
    # An index-based join is used instead of merge because it is faster here.
    df = df.join(df_group.set_index(group_keys), on=group_keys, how="left")
        

In [16]:
# Running total of items per user and attribute combination, accumulated over order days
interactions_list = ["item_id","brand_id","item_color","size"]
categorical_powerset = powerset(interactions_list,1,len(interactions_list)+1)
# Combinations holding both item_id and brand_id are skipped.
exclude = ["item_id","brand_id"]
for column in categorical_powerset:
    i = list(column)
    if all(x in i for x in exclude):
        continue
    print(i)
    group_keys = ["user_id"] + i
    join_keys = ["user_id", "order_date"] + i
    feature_name = "user_id_" + "_".join(i) + "_cumsum_items_ordered"
    # Per-day counts first, then accumulate them within each user/attribute group.
    df_group = df.groupby(group_keys + ["order_date"], as_index=False)["user_state"].count()
    df_group["user_state"] = df_group.groupby(group_keys)["user_state"].cumsum()
    df_group = df_group.rename(columns={"user_state": feature_name})
    # An index-based join is used instead of merge because it is faster here.
    df = df.join(df_group.set_index(join_keys), on=join_keys, how="left")

['item_id']
['brand_id']
['item_color']
['size']
['item_id', 'item_color']
['item_id', 'size']
['brand_id', 'item_color']
['brand_id', 'size']
['item_color', 'size']
['item_id', 'item_color', 'size']
['brand_id', 'item_color', 'size']


Calculate the number of unique products ordered by each customer.

In [17]:
# Distinct-value counts of the item attributes per user, overall and within
# every attribute combination of size 0, 1 or 2.
interactions_list = ["item_id","brand_id","item_color","size"]
categorical_powerset = powerset(interactions_list,0,len(interactions_list)-1)
exclude = ["item_id","brand_id"]
for column in categorical_powerset:
    current = list(column)
    # Skip the item_id + brand_id pair. The three-column check is kept from the
    # original but cannot trigger, since combinations here have at most two members.
    if all(x in current for x in exclude) or current == ['item_id', 'item_color', 'size']:
        continue
    print(current)
    group_keys = ["user_id"] + current
    suffix = "_nunique_" + "_".join(current)
    df_group = df[["user_id","item_id","size","item_color","brand_id"]].groupby(group_keys).nunique()
    df_group = df_group.rename(columns=lambda name: "user_id_" + name + suffix)
    # An index-based join is used instead of merge because it is faster here.
    df = df.join(df_group, on=group_keys, how="left")

[]
['item_id']
['brand_id']
['item_color']
['size']
['item_id', 'item_color']
['item_id', 'size']
['brand_id', 'item_color']
['brand_id', 'size']
['item_color', 'size']


In [19]:
# df.to_csv("df_last_467.csv",index=False)

In [4]:
# df = pd.read_csv("df_last_467.csv")

In [72]:
# Display the resulting frame.
# NOTE(review): display.max_columns is set to None in the first cell, so every column is rendered;
# consider df.head() or df.shape to keep the output small.
df

Unnamed: 0,order_item_id,order_date,delivery_date,item_id,size,item_color,brand_id,item_price,user_id,user_title,user_dob,user_state,user_reg_date,return,delivery_time,order_id,user_age,user_reg_age,order_weekday,delivery_weekday,order_month,delivery_month,order_day,delivery_day,order_week,delivery_week,order_item_count,order_sum,average_item_price_order,order_number_same_item_id,order_number_different_item_id,order_number_same_size,order_number_different_size,order_number_same_item_color,order_number_different_item_color,order_number_same_brand_id,order_number_different_brand_id,order_number_same_item_id_size,order_number_different_item_id_size,order_number_same_item_id_item_color,order_number_different_item_id_item_color,order_number_same_size_item_color,order_number_different_size_item_color,order_number_same_size_brand_id,order_number_different_size_brand_id,order_number_same_item_color_brand_id,order_number_different_item_color_brand_id,order_number_same_item_id_size_item_color,order_number_different_item_id_size_item_color,order_number_same_size_item_color_brand_id,order_number_different_size_item_color_brand_id,order_item_id_nunique,order_size_nunique,order_brand_id_nunique,order_item_color_nunique,order_item_id_color_nunique,order_item_id_size_nunique,order_brand_id_color_nunique,order_brand_id_size_nunique,order_brand_id_item_id_nunique,item_price_item_id_mean,item_price_item_id_std,item_price_item_id_min,item_price_item_id_max,item_price_item_id_sum,item_price_item_id_count,item_price_item_id_median,item_price_item_id_mad,item_price_user_id_mean,item_price_user_id_std,item_price_user_id_min,item_price_user_id_max,item_price_user_id_sum,item_price_user_id_count,item_price_user_id_median,item_price_user_id_mad,item_price_brand_id_mean,item_price_brand_id_std,item_price_brand_id_min,item_price_brand_id_max,item_price_brand_id_sum,item_price_brand_id_count,item_price_brand_id_median,item_price_brand_id_mad,price-item_price_item_id_mean,price-item_price_item_id
_min,price-item_price_item_id_max,price-item_price_user_id_mean,price-item_price_user_id_min,price-item_price_user_id_max,price-item_price_brand_id_mean,price-item_price_brand_id_min,price-item_price_brand_id_max,mode_item_id,mode_size,mode_brand_id,mode_item_color,delivery_time_item_id_mean,delivery_time_item_id_std,delivery_time_item_id_min,delivery_time_item_id_max,delivery_time_item_id_median,delivery_time_item_id_mad,delivery_time_size_mean,delivery_time_size_std,delivery_time_size_min,delivery_time_size_max,delivery_time_size_median,delivery_time_size_mad,delivery_time_brand_id_mean,delivery_time_brand_id_std,delivery_time_brand_id_min,delivery_time_brand_id_max,delivery_time_brand_id_median,delivery_time_brand_id_mad,delivery_time_item_color_mean,delivery_time_item_color_std,delivery_time_item_color_min,delivery_time_item_color_max,delivery_time_item_color_median,delivery_time_item_color_mad,delivery_time_user_id_mean,delivery_time_user_id_std,delivery_time_user_id_min,delivery_time_user_id_max,delivery_time_user_id_median,delivery_time_user_id_mad,days_since_last_order,days_since_last_order_user_id_mean,days_since_last_order_user_id_median,days_since_last_order_user_id_max,days_since_last_order_user_id_min,days_since_last_order_user_id_std,days_since_last_order_user_id_mad,days_since_last_order_user_id_skew,days_since_last_order_same_item_id,days_since_last_order_same_size,days_since_last_order_same_item_color,days_since_last_order_same_brand_id,days_since_last_order_same_item_id_size,days_since_last_order_same_item_id_item_color,days_since_last_order_same_size_item_color,days_since_last_order_same_size_brand_id,days_since_last_order_same_item_color_brand_id,days_since_last_order_same_item_id_size_item_color,days_since_last_order_same_size_item_color_brand_id,days_since_last_delivery,days_since_last_delivery_user_id_mean,days_since_last_delivery_user_id_median,days_since_last_delivery_user_id_max,days_since_last_delivery_user_id_min,days_since_last_delivery_
user_id_std,days_since_last_delivery_user_id_mad,days_since_last_delivery_user_id_skew,days_since_last_delivery_same_item_id,days_since_last_delivery_same_size,days_since_last_delivery_same_item_color,days_since_last_delivery_same_brand_id,days_since_last_delivery_same_item_id_size,days_since_last_delivery_same_item_id_item_color,days_since_last_delivery_same_size_item_color,days_since_last_delivery_same_size_brand_id,days_since_last_delivery_same_item_color_brand_id,days_since_last_delivery_same_item_id_size_item_color,days_since_last_delivery_same_size_item_color_brand_id,delivery_time_order_sum_bins_mean,delivery_time_order_sum_bins_std,delivery_time_order_sum_bins_min,delivery_time_order_sum_bins_max,delivery_time_order_sum_bins_median,delivery_time_order_sum_bins_mad,delivery_time_order_sum_bins_skew,delivery_time_item_price_bins_mean,delivery_time_item_price_bins_std,delivery_time_item_price_bins_min,delivery_time_item_price_bins_max,delivery_time_item_price_bins_median,delivery_time_item_price_bins_mad,delivery_time_item_price_bins_skew,delivery_time_age_bins_mean,delivery_time_age_bins_std,delivery_time_age_bins_min,delivery_time_age_bins_max,delivery_time_age_bins_median,delivery_time_age_bins_mad,delivery_time_age_bins_skew,delivery_time_reg_age_bins_mean,delivery_time_reg_age_bins_std,delivery_time_reg_age_bins_min,delivery_time_reg_age_bins_max,delivery_time_reg_age_bins_median,delivery_time_reg_age_bins_mad,delivery_time_reg_age_bins_skew,delivery_time_order_weekday_mean,delivery_time_order_weekday_std,delivery_time_order_weekday_min,delivery_time_order_weekday_max,delivery_time_order_weekday_median,delivery_time_order_weekday_mad,delivery_time_order_weekday_skew,delivery_time_delivery_weekday_mean,delivery_time_delivery_weekday_std,delivery_time_delivery_weekday_min,delivery_time_delivery_weekday_max,delivery_time_delivery_weekday_median,delivery_time_delivery_weekday_mad,delivery_time_delivery_weekday_skew,delivery_time_item_id_item_color_mean,deliver
y_time_item_id_item_color_std,delivery_time_item_id_item_color_min,delivery_time_item_id_item_color_max,delivery_time_item_id_item_color_median,delivery_time_item_id_item_color_mad,delivery_time_item_id_item_color_skew,delivery_time_item_id_size_mean,delivery_time_item_id_size_std,delivery_time_item_id_size_min,delivery_time_item_id_size_max,delivery_time_item_id_size_median,delivery_time_item_id_size_mad,delivery_time_item_id_size_skew,delivery_time_item_id_delivery_weekday_mean,delivery_time_item_id_delivery_weekday_std,delivery_time_item_id_delivery_weekday_min,delivery_time_item_id_delivery_weekday_max,delivery_time_item_id_delivery_weekday_median,delivery_time_item_id_delivery_weekday_mad,delivery_time_item_id_delivery_weekday_skew,delivery_time_item_id_user_id_mean,delivery_time_item_id_user_id_std,delivery_time_item_id_user_id_min,delivery_time_item_id_user_id_max,delivery_time_item_id_user_id_median,delivery_time_item_id_user_id_mad,delivery_time_item_id_user_id_skew,delivery_time_brand_id_item_color_mean,delivery_time_brand_id_item_color_std,delivery_time_brand_id_item_color_min,delivery_time_brand_id_item_color_max,delivery_time_brand_id_item_color_median,delivery_time_brand_id_item_color_mad,delivery_time_brand_id_item_color_skew,delivery_time_brand_id_size_mean,delivery_time_brand_id_size_std,delivery_time_brand_id_size_min,delivery_time_brand_id_size_max,delivery_time_brand_id_size_median,delivery_time_brand_id_size_mad,delivery_time_brand_id_size_skew,delivery_time_brand_id_delivery_weekday_mean,delivery_time_brand_id_delivery_weekday_std,delivery_time_brand_id_delivery_weekday_min,delivery_time_brand_id_delivery_weekday_max,delivery_time_brand_id_delivery_weekday_median,delivery_time_brand_id_delivery_weekday_mad,delivery_time_brand_id_delivery_weekday_skew,delivery_time_brand_id_user_id_mean,delivery_time_brand_id_user_id_std,delivery_time_brand_id_user_id_min,delivery_time_brand_id_user_id_max,delivery_time_brand_id_user_id_median,delivery_time_bran
d_id_user_id_mad,delivery_time_brand_id_user_id_skew,delivery_time_item_color_size_mean,delivery_time_item_color_size_std,delivery_time_item_color_size_min,delivery_time_item_color_size_max,delivery_time_item_color_size_median,delivery_time_item_color_size_mad,delivery_time_item_color_size_skew,delivery_time_item_color_delivery_weekday_mean,delivery_time_item_color_delivery_weekday_std,delivery_time_item_color_delivery_weekday_min,delivery_time_item_color_delivery_weekday_max,delivery_time_item_color_delivery_weekday_median,delivery_time_item_color_delivery_weekday_mad,delivery_time_item_color_delivery_weekday_skew,delivery_time_item_color_user_id_mean,delivery_time_item_color_user_id_std,delivery_time_item_color_user_id_min,delivery_time_item_color_user_id_max,delivery_time_item_color_user_id_median,delivery_time_item_color_user_id_mad,delivery_time_item_color_user_id_skew,delivery_time_size_delivery_weekday_mean,delivery_time_size_delivery_weekday_std,delivery_time_size_delivery_weekday_min,delivery_time_size_delivery_weekday_max,delivery_time_size_delivery_weekday_median,delivery_time_size_delivery_weekday_mad,delivery_time_size_delivery_weekday_skew,delivery_time_size_user_id_mean,delivery_time_size_user_id_std,delivery_time_size_user_id_min,delivery_time_size_user_id_max,delivery_time_size_user_id_median,delivery_time_size_user_id_mad,delivery_time_size_user_id_skew,delivery_time_delivery_weekday_user_id_mean,delivery_time_delivery_weekday_user_id_std,delivery_time_delivery_weekday_user_id_min,delivery_time_delivery_weekday_user_id_max,delivery_time_delivery_weekday_user_id_median,delivery_time_delivery_weekday_user_id_mad,delivery_time_delivery_weekday_user_id_skew,delivery_time_item_id_item_color_size_mean,delivery_time_item_id_item_color_size_std,delivery_time_item_id_item_color_size_min,delivery_time_item_id_item_color_size_max,delivery_time_item_id_item_color_size_median,delivery_time_item_id_item_color_size_mad,delivery_time_item_id_item_color_size_skew,d
elivery_time_item_id_item_color_delivery_weekday_mean,delivery_time_item_id_item_color_delivery_weekday_std,delivery_time_item_id_item_color_delivery_weekday_min,delivery_time_item_id_item_color_delivery_weekday_max,delivery_time_item_id_item_color_delivery_weekday_median,delivery_time_item_id_item_color_delivery_weekday_mad,delivery_time_item_id_item_color_delivery_weekday_skew,delivery_time_item_id_item_color_user_id_mean,delivery_time_item_id_item_color_user_id_std,delivery_time_item_id_item_color_user_id_min,delivery_time_item_id_item_color_user_id_max,delivery_time_item_id_item_color_user_id_median,delivery_time_item_id_item_color_user_id_mad,delivery_time_item_id_item_color_user_id_skew,delivery_time_item_id_size_delivery_weekday_mean,delivery_time_item_id_size_delivery_weekday_std,delivery_time_item_id_size_delivery_weekday_min,delivery_time_item_id_size_delivery_weekday_max,delivery_time_item_id_size_delivery_weekday_median,delivery_time_item_id_size_delivery_weekday_mad,delivery_time_item_id_size_delivery_weekday_skew,delivery_time_item_id_size_user_id_mean,delivery_time_item_id_size_user_id_std,delivery_time_item_id_size_user_id_min,delivery_time_item_id_size_user_id_max,delivery_time_item_id_size_user_id_median,delivery_time_item_id_size_user_id_mad,delivery_time_item_id_size_user_id_skew,delivery_time_item_id_delivery_weekday_user_id_mean,delivery_time_item_id_delivery_weekday_user_id_std,delivery_time_item_id_delivery_weekday_user_id_min,delivery_time_item_id_delivery_weekday_user_id_max,delivery_time_item_id_delivery_weekday_user_id_median,delivery_time_item_id_delivery_weekday_user_id_mad,delivery_time_item_id_delivery_weekday_user_id_skew,delivery_time_brand_id_item_color_size_mean,delivery_time_brand_id_item_color_size_std,delivery_time_brand_id_item_color_size_min,delivery_time_brand_id_item_color_size_max,delivery_time_brand_id_item_color_size_median,delivery_time_brand_id_item_color_size_mad,delivery_time_brand_id_item_color_size_skew,delivery_ti
me_brand_id_item_color_delivery_weekday_mean,delivery_time_brand_id_item_color_delivery_weekday_std,delivery_time_brand_id_item_color_delivery_weekday_min,delivery_time_brand_id_item_color_delivery_weekday_max,delivery_time_brand_id_item_color_delivery_weekday_median,delivery_time_brand_id_item_color_delivery_weekday_mad,delivery_time_brand_id_item_color_delivery_weekday_skew,delivery_time_brand_id_item_color_user_id_mean,delivery_time_brand_id_item_color_user_id_std,delivery_time_brand_id_item_color_user_id_min,delivery_time_brand_id_item_color_user_id_max,delivery_time_brand_id_item_color_user_id_median,delivery_time_brand_id_item_color_user_id_mad,delivery_time_brand_id_item_color_user_id_skew,delivery_time_brand_id_size_delivery_weekday_mean,delivery_time_brand_id_size_delivery_weekday_std,delivery_time_brand_id_size_delivery_weekday_min,delivery_time_brand_id_size_delivery_weekday_max,delivery_time_brand_id_size_delivery_weekday_median,delivery_time_brand_id_size_delivery_weekday_mad,delivery_time_brand_id_size_delivery_weekday_skew,delivery_time_brand_id_size_user_id_mean,delivery_time_brand_id_size_user_id_std,delivery_time_brand_id_size_user_id_min,delivery_time_brand_id_size_user_id_max,delivery_time_brand_id_size_user_id_median,delivery_time_brand_id_size_user_id_mad,delivery_time_brand_id_size_user_id_skew,delivery_time_brand_id_delivery_weekday_user_id_mean,delivery_time_brand_id_delivery_weekday_user_id_std,delivery_time_brand_id_delivery_weekday_user_id_min,delivery_time_brand_id_delivery_weekday_user_id_max,delivery_time_brand_id_delivery_weekday_user_id_median,delivery_time_brand_id_delivery_weekday_user_id_mad,delivery_time_brand_id_delivery_weekday_user_id_skew,delivery_time_item_color_size_delivery_weekday_mean,delivery_time_item_color_size_delivery_weekday_std,delivery_time_item_color_size_delivery_weekday_min,delivery_time_item_color_size_delivery_weekday_max,delivery_time_item_color_size_delivery_weekday_median,delivery_time_item_color_size_del
ivery_weekday_mad,delivery_time_item_color_size_delivery_weekday_skew,delivery_time_item_color_size_user_id_mean,delivery_time_item_color_size_user_id_std,delivery_time_item_color_size_user_id_min,delivery_time_item_color_size_user_id_max,delivery_time_item_color_size_user_id_median,delivery_time_item_color_size_user_id_mad,delivery_time_item_color_size_user_id_skew,delivery_time_item_color_delivery_weekday_user_id_mean,delivery_time_item_color_delivery_weekday_user_id_std,delivery_time_item_color_delivery_weekday_user_id_min,delivery_time_item_color_delivery_weekday_user_id_max,delivery_time_item_color_delivery_weekday_user_id_median,delivery_time_item_color_delivery_weekday_user_id_mad,delivery_time_item_color_delivery_weekday_user_id_skew,delivery_time_size_delivery_weekday_user_id_mean,delivery_time_size_delivery_weekday_user_id_std,delivery_time_size_delivery_weekday_user_id_min,delivery_time_size_delivery_weekday_user_id_max,delivery_time_size_delivery_weekday_user_id_median,delivery_time_size_delivery_weekday_user_id_mad,delivery_time_size_delivery_weekday_user_id_skew,user_id_total_items_ordered,user_id_total_items_ordered_cumsum,user_id_item_id_total_items_ordered,user_id_brand_id_total_items_ordered,user_id_item_color_total_items_ordered,user_id_size_total_items_ordered,user_id_item_id_item_color_total_items_ordered,user_id_item_id_size_total_items_ordered,user_id_brand_id_item_color_total_items_ordered,user_id_brand_id_size_total_items_ordered,user_id_item_color_size_total_items_ordered,user_id_item_id_item_color_size_total_items_ordered,user_id_brand_id_item_color_size_total_items_ordered,user_id_item_id_cumsum_items_ordered,user_id_brand_id_cumsum_items_ordered,user_id_item_color_cumsum_items_ordered,user_id_size_cumsum_items_ordered,user_id_item_id_item_color_cumsum_items_ordered,user_id_item_id_size_cumsum_items_ordered,user_id_brand_id_item_color_cumsum_items_ordered,user_id_brand_id_size_cumsum_items_ordered,user_id_item_color_size_cumsum_items_orde
red,user_id_item_id_item_color_size_cumsum_items_ordered,user_id_brand_id_item_color_size_cumsum_items_ordered,user_id_item_id_nunique_,user_id_size_nunique_,user_id_item_color_nunique_,user_id_brand_id_nunique_,user_id_size_nunique_item_id,user_id_item_color_nunique_item_id,user_id_brand_id_nunique_item_id,user_id_item_id_nunique_brand_id,user_id_size_nunique_brand_id,user_id_item_color_nunique_brand_id,user_id_item_id_nunique_item_color,user_id_size_nunique_item_color,user_id_brand_id_nunique_item_color,user_id_item_id_nunique_size,user_id_item_color_nunique_size,user_id_brand_id_nunique_size,user_id_size_nunique_item_id_item_color,user_id_brand_id_nunique_item_id_item_color,user_id_item_color_nunique_item_id_size,user_id_brand_id_nunique_item_id_size,user_id_item_id_nunique_brand_id_item_color,user_id_size_nunique_brand_id_item_color,user_id_item_id_nunique_brand_id_size,user_id_item_color_nunique_brand_id_size,user_id_item_id_nunique_item_color_size,user_id_brand_id_nunique_item_color_size
0,1,2012-04-01,2012-04-03,186,2,43,25,69.90,794,3,1965-01-06,0,2011-04-25,0.0,2,2012-04-01_794,47,342,6,1,4,4,1,3,13,14,3,209.8,69.93,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,2,2,2,3,1,1,1,1,1,69.47,3.41,39.90,69.90,21606.68,311,69.90,0.83,62.65,24.51,0.0,89.9,689.1,11,69.9,16.52,55.23,13.65,24.9,79.90,282536.19,5116,59.90,11.66,-0.43,-30.0,0.0,-7.25,-69.90,20.00,-14.67,-45.00,10.0,71,2,20,80,5.04,5.80,1,30,3.0,3.61,10.38,16.45,0,71,4.0,11.27,4.21,5.15,0,71,3.0,2.65,9.80,15.10,0,71,4.0,10.38,3.64,3.38,1,13,3.0,2.20,0.0,66.6,46.0,137.0,15.0,58.22,49.52,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.4,57.0,135.0,5.0,60.22,49.68,0.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52,15.45,0,71,4.0,10.29,2.54,10.41,16.49,0,71,4.0,11.30,2.36,10.50,16.49,0,71,4.0,11.43,2.31,10.44,16.67,0,71,3.0,11.55,2.31,10.74,16.72,1,71,3.0,11.73,2.26,7.33,12.38,0,71,4.0,7.17,3.27,3.25,2.06,1,12,3.0,1.49,1.77,4.90,5.73,1,26,3.0,3.35,2.82,4.05,3.48,1,20,4.0,1.97,2.95,2.00,0.00,2,2,2.0,0.00,0.00,3.89,4.54,0,71,3.0,2.22,6.22,3.97,4.80,1,71,3.0,2.41,5.87,3.89,3.96,0,68,3.0,1.89,7.23,2.00,0.00,2,2,2.0,0.00,0.00,11.32,17.32,0,71,4.0,12.43,2.14,9.22,14.77,0,71,4.0,9.75,2.60,2.00,0.00,2,2,2.0,0.00,0.0,7.31,12.40,0,71,4.0,7.12,3.31,2.50,1.52,1,5,2.5,1.17,0.77,2.0,0.0,2,2,2.0,0.0,0.0,2.95,1.43,1,6,3.0,1.14,0.30,3.29,2.13,1,12,3.0,1.48,2.28,2.0,0.0,2,2,2.0,0.0,0.0,3.07,1.82,1,8,2.5,1.37,1.49,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,4.09,5.39,1,71,3.0,2.59,6.52,3.63,2.87,0,31,3.0,1.69,3.99,2.0,0.0,2,2,2.0,0.0,0.0,3.45,2.39,1,22,3.0,1.51,3.50,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,11.37,17.82,0,71,4.0,12.67,2.15,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,11,3,1,1,1,6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,7,3,7,6,1,1,1,1,1,1,1,1,1,4,4,3,1,1,1,1,1,1,1,1,1,1
1,2,2012-04-01,2012-04-03,71,3,69,21,69.95,794,3,1965-01-06,0,2011-04-25,1.0,2,2012-04-01_794,47,342,6,1,4,4,1,3,13,14,3,209.8,69.93,2,1,2,1,1,2,2,1,2,1,1,2,1,2,2,1,1,2,1,2,1,2,2,2,2,3,2,1,2,1,1,57.95,13.28,34.95,69.95,59108.70,1020,69.95,12.35,62.65,24.51,0.0,89.9,689.1,11,69.9,16.52,74.54,30.32,24.9,179.95,478991.63,6426,69.95,22.66,-12.00,-35.0,0.0,-7.30,-69.95,19.95,4.59,-45.05,110.0,71,2,20,80,5.39,7.38,0,71,3.0,4.20,9.84,15.38,0,71,4.0,10.49,6.96,11.49,0,71,3.0,6.70,10.01,15.26,0,71,4.0,10.63,3.64,3.38,1,13,3.0,2.20,0.0,66.6,46.0,137.0,15.0,58.22,49.52,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.4,57.0,135.0,5.0,60.22,49.68,0.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52,15.45,0,71,4.0,10.29,2.54,10.41,16.49,0,71,4.0,11.30,2.36,10.50,16.49,0,71,4.0,11.43,2.31,10.44,16.67,0,71,3.0,11.55,2.31,10.74,16.72,1,71,3.0,11.73,2.26,7.33,12.38,0,71,4.0,7.17,3.27,6.50,7.49,0,41,4.0,5.25,2.21,5.39,7.38,0,71,3.0,4.20,4.10,4.96,6.53,1,68,3.0,3.35,4.95,2.00,0.00,2,2,2.0,0.00,0.00,8.15,12.05,0,71,4.0,7.84,2.81,6.96,11.49,0,71,3.0,6.70,3.46,4.79,7.30,0,71,3.0,3.45,5.00,2.00,0.00,2,2,2.0,0.00,0.00,9.56,14.40,0,71,4.0,9.84,2.44,7.90,12.57,0,71,4.0,7.82,2.91,2.50,0.71,2,3,2.5,0.50,0.0,6.93,11.98,0,71,3.0,6.64,3.55,5.67,6.35,2,13,2.0,4.89,1.73,2.0,0.0,2,2,2.0,0.0,0.0,6.50,7.49,0,41,4.0,5.25,2.21,5.72,6.43,1,33,4.0,4.06,2.49,2.0,0.0,2,2,2.0,0.0,0.0,4.96,6.53,1,68,3.0,3.35,4.95,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,8.15,12.05,0,71,4.0,7.84,2.81,5.38,6.67,1,36,4.0,3.89,3.05,2.0,0.0,2,2,2.0,0.0,0.0,4.79,7.30,0,71,3.0,3.45,5.00,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,7.22,10.88,0,69,4.0,6.67,2.97,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,11,3,2,2,2,3,1,2,1,2,1,1,1,2,2,1,2,1,2,1,2,1,1,1,7,3,7,6,1,2,1,1,1,2,2,2,2,2,3,2,1,1,2,1,1,1,1,2,1,1
2,3,2012-04-01,2012-04-03,71,3,36,21,69.95,794,3,1965-01-06,0,2011-04-25,1.0,2,2012-04-01_794,47,342,6,1,4,4,1,3,13,14,3,209.8,69.93,2,1,2,1,1,2,2,1,2,1,1,2,1,2,2,1,1,2,1,2,1,2,2,2,2,3,2,1,2,1,1,57.95,13.28,34.95,69.95,59108.70,1020,69.95,12.35,62.65,24.51,0.0,89.9,689.1,11,69.9,16.52,74.54,30.32,24.9,179.95,478991.63,6426,69.95,22.66,-12.00,-35.0,0.0,-7.30,-69.95,19.95,4.59,-45.05,110.0,71,2,20,80,5.39,7.38,0,71,3.0,4.20,9.84,15.38,0,71,4.0,10.49,6.96,11.49,0,71,3.0,6.70,4.83,7.25,0,65,3.0,3.71,3.64,3.38,1,13,3.0,2.20,0.0,66.6,46.0,137.0,15.0,58.22,49.52,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.4,57.0,135.0,5.0,60.22,49.68,0.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52,15.45,0,71,4.0,10.29,2.54,10.41,16.49,0,71,4.0,11.30,2.36,10.50,16.49,0,71,4.0,11.43,2.31,10.44,16.67,0,71,3.0,11.55,2.31,10.74,16.72,1,71,3.0,11.73,2.26,7.33,12.38,0,71,4.0,7.17,3.27,5.21,5.73,1,27,3.0,3.76,2.40,5.39,7.38,0,71,3.0,4.20,4.10,4.96,6.53,1,68,3.0,3.35,4.95,2.00,0.00,2,2,2.0,0.00,0.00,5.21,5.73,1,27,3.0,3.76,2.40,6.96,11.49,0,71,3.0,6.70,3.46,4.79,7.30,0,71,3.0,3.45,5.00,2.00,0.00,2,2,2.0,0.00,0.00,4.38,4.93,1,28,3.0,2.93,2.99,4.41,6.15,0,55,3.0,2.80,5.38,2.00,0.00,2,2,2.0,0.00,0.0,6.93,11.98,0,71,3.0,6.64,3.55,5.67,6.35,2,13,2.0,4.89,1.73,2.0,0.0,2,2,2.0,0.0,0.0,5.21,5.73,1,27,3.0,3.76,2.40,5.60,6.25,1,26,3.5,4.44,2.05,2.0,0.0,2,2,2.0,0.0,0.0,4.96,6.53,1,68,3.0,3.35,4.95,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,5.21,5.73,1,27,3.0,3.76,2.40,5.60,6.25,1,26,3.5,4.44,2.05,2.0,0.0,2,2,2.0,0.0,0.0,4.79,7.30,0,71,3.0,3.45,5.00,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,4.78,5.10,1,26,3.0,3.26,2.61,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,2.0,0.0,2,2,2.0,0.0,0.0,11,3,2,2,1,3,1,2,1,2,1,1,1,2,2,1,2,1,2,1,2,1,1,1,7,3,7,6,1,2,1,1,1,2,1,1,1,2,3,2,1,1,2,1,1,1,1,2,1,1
3,4,2012-04-02,2012-04-06,22,2,50,14,39.90,808,3,1959-11-09,13,2012-01-04,0.0,4,2012-04-02_808,52,89,0,4,4,4,2,6,14,14,1,39.9,39.90,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,31.10,8.68,19.90,39.90,68669.20,2208,29.90,8.41,39.90,0.00,39.9,39.9,39.9,1,39.9,0.00,55.16,19.34,0.0,89.90,629751.00,11417,49.90,16.21,-8.80,-20.0,0.0,0.00,0.00,0.00,15.26,-39.90,50.0,22,2,14,46,5.26,7.63,0,71,3.0,4.20,10.38,16.45,0,71,4.0,11.27,11.40,18.02,0,71,4.0,12.90,14.68,19.65,0,71,4.0,15.55,4.00,0.00,4,4,4.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.66,13.15,0,71,3.0,8.03,3.11,10.26,16.43,0,71,4.0,11.23,2.37,10.50,16.49,0,71,4.0,11.43,2.31,8.95,14.95,0,71,3.0,9.64,2.70,10.04,16.55,0,71,3.0,11.52,2.30,11.10,17.61,0,71,3.0,12.65,2.12,7.59,10.18,0,71,4.0,6.95,2.56,4.87,7.25,0,57,3.0,3.85,3.77,6.36,10.59,0,71,2.0,6.68,2.93,4.00,0.00,4,4,4.0,0.00,0.00,20.84,25.59,0,71,4.0,22.45,1.00,12.46,19.86,0,71,4.0,14.57,1.94,6.83,12.35,0,71,2.0,7.40,3.05,4.00,0.00,4,4,4.0,0.00,0.00,15.48,20.34,0,71,4.0,16.26,1.49,15.97,20.69,0,71,4.0,16.89,1.36,4.00,0.00,4,4,4.0,0.00,0.0,11.09,17.68,0,71,3.0,12.69,2.12,4.00,0.00,4,4,4.0,0.00,0.00,4.0,0.0,4,4,4.0,0.0,0.0,9.71,11.95,1,47,4.0,9.48,1.64,10.68,13.76,1,49,3.0,11.20,1.42,4.0,0.0,4,4,4.0,0.0,0.0,5.88,10.98,1,57,2.0,6.45,3.08,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,26.32,28.20,0,71,5.0,26.19,0.58,10.99,16.06,1,61,3.0,12.28,1.73,4.0,0.0,4,4,4.0,0.0,0.0,6.87,12.77,1,65,2.0,7.70,2.93,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,16.23,21.18,0,71,4.0,17.33,1.36,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,5,2012-04-02,2012-04-06,151,2,18,53,29.90,825,3,1964-07-11,11,2011-02-16,0.0,4,2012-04-02_825,48,411,0,4,4,4,2,6,14,14,3,249.7,83.23,1,2,2,1,2,1,1,2,1,2,1,2,2,1,1,2,1,2,1,2,1,2,3,2,3,2,1,1,1,1,1,27.59,4.02,24.90,49.90,1434.80,52,24.90,2.90,79.34,45.03,29.9,139.9,714.1,9,69.9,38.27,41.11,13.08,19.9,79.90,99614.71,2423,39.90,10.10,-2.31,-5.0,20.0,49.44,0.00,110.00,11.21,-10.00,50.0,15,4,37,18,4.85,5.43,1,28,3.0,3.08,10.38,16.45,0,71,4.0,11.27,4.59,6.28,0,66,3.0,3.28,8.92,14.80,0,71,3.0,9.44,14.00,21.04,4,63,4.0,15.56,0.0,337.0,337.0,337.0,337.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132.0,31.0,337.0,28.0,177.54,136.67,1.73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52,15.45,0,71,4.0,10.29,2.54,9.70,15.18,0,71,3.0,10.45,2.38,10.50,16.49,0,71,4.0,11.43,2.31,9.27,15.43,0,71,3.0,10.18,2.58,10.04,16.55,0,71,3.0,11.52,2.30,11.10,17.61,0,71,3.0,12.65,2.12,9.00,8.91,3,22,4.0,7.43,1.20,4.75,5.19,1,28,3.5,2.88,3.40,4.67,3.64,1,13,5.0,2.37,1.54,4.00,0.00,4,4,4.0,0.00,0.00,4.36,5.53,0,42,3.0,2.99,3.75,4.32,5.54,0,54,3.0,2.92,4.09,3.89,6.71,0,57,2.0,3.01,5.08,4.00,0.00,4,4,4.0,0.00,0.00,8.22,13.72,0,71,3.0,8.45,2.99,8.78,14.97,0,71,3.0,9.79,2.67,18.75,29.50,4,63,4.0,22.12,2.0,11.09,17.68,0,71,3.0,12.69,2.12,4.00,0.00,4,4,4.0,0.00,0.00,4.0,0.0,4,4,4.0,0.0,0.0,4.00,0.00,4,4,4.0,0.00,0.00,4.50,0.71,4,5,4.5,0.50,0.00,4.0,0.0,4,4,4.0,0.0,0.0,5.00,4.43,1,13,4.5,3.00,1.33,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,4.17,5.13,0,41,3.0,2.69,4.12,3.04,4.01,1,32,2.0,1.82,5.29,4.0,0.0,4,4,4.0,0.0,0.0,3.02,3.89,0,42,2.0,1.71,7.21,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,7.97,13.63,0,71,3.0,8.72,2.84,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,9,3,1,1,4,4,1,1,1,1,3,1,1,1,1,2,2,1,1,1,1,2,1,1,9,2,3,7,1,1,1,1,1,1,4,2,4,4,2,3,1,1,1,1,1,1,1,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531165,50074,2013-04-29,2013-05-03,2342,2,83,5,69.90,91920,3,1962-03-08,1,2013-04-29,,4,2013-04-29_91920,51,0,0,4,4,5,29,3,18,18,9,748.2,83.13,1,8,5,4,3,6,4,5,1,8,1,8,3,6,4,5,3,6,1,8,3,6,8,2,6,5,1,1,2,1,3,62.24,4.26,49.90,69.90,106988.10,1719,59.90,3.60,83.13,20.82,64.9,129.0,748.2,9,79.9,15.42,64.66,20.40,0.0,169.90,2941212.79,45487,69.90,15.02,-7.66,-20.0,0.0,13.23,-5.00,59.10,-5.24,-69.90,100.0,2505,2,5,74,12.53,16.18,0,71,4.0,12.96,10.38,16.45,0,71,4.0,11.27,12.33,17.43,0,71,4.0,13.20,13.16,17.54,0,71,4.0,13.93,4.00,0.00,4,4,4.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.26,17.98,0,71,4.0,13.22,1.99,10.41,16.49,0,71,4.0,11.30,2.36,10.50,16.49,0,71,4.0,11.43,2.31,11.16,16.40,1,71,4.0,11.49,2.27,10.04,16.55,0,71,3.0,11.52,2.30,11.10,17.61,0,71,3.0,12.65,2.12,8.12,12.37,0,62,4.0,8.06,2.70,15.23,19.37,0,71,4.0,15.92,1.38,18.25,18.25,1,61,9.0,16.41,0.66,4.00,0.00,4,4,4.0,0.00,0.00,14.57,17.67,0,71,4.0,14.95,1.25,12.29,17.57,0,71,4.0,13.25,1.88,15.31,19.17,0,71,4.0,15.92,1.30,4.00,0.00,4,4,4.0,0.00,0.00,15.52,19.42,0,71,4.0,15.83,1.39,15.06,19.01,1,71,4.0,15.73,1.34,4.00,0.00,4,4,4.0,0.00,0.0,11.09,17.68,0,71,3.0,12.69,2.12,4.00,0.00,4,4,4.0,0.00,0.00,4.0,0.0,4,4,4.0,0.0,0.0,13.59,13.77,1,42,5.0,12.22,0.79,9.27,11.41,1,48,4.0,8.41,1.97,4.0,0.0,4,4,4.0,0.0,0.0,23.20,17.95,1,61,29.0,15.61,0.19,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,16.54,17.47,1,71,6.0,15.32,0.94,18.71,18.31,1,71,13.0,16.15,0.73,4.0,0.0,4,4,4.0,0.0,0.0,14.72,19.44,0,71,4.0,15.72,1.47,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,15.66,20.15,1,71,4.0,16.30,1.44,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,9,9,1,4,3,5,1,1,3,4,3,1,3,1,4,3,5,1,1,3,4,3,1,3,8,2,5,6,1,1,1,3,1,2,2,1,1,4,3,2,1,1,1,1,2,1,3,2,2,1
531166,50075,2013-04-29,2013-05-03,2505,2,83,5,64.90,91920,3,1962-03-08,1,2013-04-29,,4,2013-04-29_91920,51,0,0,4,4,5,29,3,18,18,9,748.2,83.13,2,7,5,4,3,6,4,5,2,7,2,7,3,6,4,5,3,6,2,7,3,6,8,2,6,5,1,1,2,1,3,66.79,2.67,59.90,69.90,26650.10,399,64.90,2.51,83.13,20.82,64.9,129.0,748.2,9,79.9,15.42,64.66,20.40,0.0,169.90,2941212.79,45487,69.90,15.02,1.89,-5.0,5.0,18.23,0.00,64.10,-0.24,-64.90,105.0,2505,2,5,74,9.83,14.50,1,71,4.0,10.13,10.38,16.45,0,71,4.0,11.27,12.33,17.43,0,71,4.0,13.20,13.16,17.54,0,71,4.0,13.93,4.00,0.00,4,4,4.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.26,17.98,0,71,4.0,13.22,1.99,10.41,16.49,0,71,4.0,11.30,2.36,10.50,16.49,0,71,4.0,11.43,2.31,11.16,16.40,1,71,4.0,11.49,2.27,10.04,16.55,0,71,3.0,11.52,2.30,11.10,17.61,0,71,3.0,12.65,2.12,9.93,13.60,1,61,4.0,9.77,2.34,7.25,10.52,1,47,3.0,6.82,2.61,11.16,12.91,1,63,5.0,9.61,1.98,4.00,0.00,4,4,4.0,0.00,0.00,14.57,17.67,0,71,4.0,14.95,1.25,12.29,17.57,0,71,4.0,13.25,1.88,15.31,19.17,0,71,4.0,15.92,1.30,4.00,0.00,4,4,4.0,0.00,0.00,15.52,19.42,0,71,4.0,15.83,1.39,15.06,19.01,1,71,4.0,15.73,1.34,4.00,0.00,4,4,4.0,0.00,0.0,11.09,17.68,0,71,3.0,12.69,2.12,4.00,0.00,4,4,4.0,0.00,0.00,4.0,0.0,4,4,4.0,0.0,0.0,7.73,9.02,1,32,4.0,6.33,2.29,4.67,5.20,1,15,3.0,3.44,2.16,4.0,0.0,4,4,4.0,0.0,0.0,13.67,16.13,2,47,5.0,12.50,1.40,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,16.54,17.47,1,71,6.0,15.32,0.94,18.71,18.31,1,71,13.0,16.15,0.73,4.0,0.0,4,4,4.0,0.0,0.0,14.72,19.44,0,71,4.0,15.72,1.47,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,15.66,20.15,1,71,4.0,16.30,1.44,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,9,9,2,4,3,5,2,2,3,4,3,2,3,2,4,3,5,2,2,3,4,3,2,3,8,2,5,6,1,1,1,3,1,2,2,1,1,4,3,2,1,1,1,1,2,1,3,2,2,1
531167,50076,2013-04-28,2013-05-02,2470,0,86,5,79.90,85095,3,1950-02-14,3,2013-03-24,,4,2013-04-28_85095,63,35,6,3,4,5,28,2,17,18,1,79.9,79.90,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,79.90,0.00,79.90,79.90,112259.50,1405,79.90,0.00,79.90,0.00,79.9,79.9,239.7,3,79.9,0.00,64.66,20.40,0.0,169.90,2941212.79,45487,69.90,15.02,0.00,0.0,0.0,0.00,0.00,0.00,-15.24,-79.90,90.0,2470,0,5,64,18.14,20.52,0,71,4.0,17.72,10.64,16.69,0,71,4.0,11.64,12.33,17.43,0,71,4.0,13.20,9.71,15.53,0,71,4.0,10.49,13.33,16.17,4,32,4.0,12.44,5.0,17.5,17.5,30.0,5.0,17.68,12.50,0.00,5.0,5.0,0.0,5.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,3.5,3.5,5.0,2.0,2.12,1.50,0.00,5.0,5.0,0.0,5.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,7.66,13.15,0,71,3.0,8.03,3.11,10.41,16.49,0,71,4.0,11.30,2.36,10.28,16.39,0,71,3.0,11.31,2.34,8.74,14.48,0,71,3.0,9.40,2.70,10.74,16.72,1,71,3.0,11.73,2.26,10.96,16.93,0,71,3.0,12.28,2.09,28.24,24.02,1,71,30.0,21.32,0.30,19.00,20.79,0,71,5.0,18.36,0.91,15.22,17.79,1,71,6.0,14.45,1.43,13.33,16.17,4,32,4.0,12.44,1.73,11.71,16.27,0,71,4.0,12.30,1.90,11.92,16.57,0,71,4.0,12.55,1.86,11.06,16.15,0,71,4.0,11.93,2.04,13.33,16.17,4,32,4.0,12.44,1.73,9.71,15.69,0,71,4.0,10.52,2.47,11.97,18.67,0,71,3.0,13.63,2.02,4.00,0.00,4,4,4.0,0.00,0.0,10.96,16.80,0,71,3.0,12.27,2.06,4.00,0.00,4,4,4.0,0.00,0.00,18.0,19.8,4,32,18.0,14.0,0.0,29.93,24.24,1,71,33.5,21.33,0.19,23.33,22.36,1,71,23.0,18.61,0.77,4.0,0.0,4,4,4.0,0.0,0.0,16.07,18.82,1,69,7.0,14.20,1.58,4.0,0.0,4,4,4.0,0.0,0.0,18.0,19.8,4,32,18.0,14.0,0.0,12.76,16.95,0,71,4.0,13.09,1.78,10.68,15.26,0,71,4.0,11.46,1.99,4.0,0.0,4,4,4.0,0.0,0.0,10.69,15.15,0,71,4.0,11.18,2.08,4.0,0.0,4,4,4.0,0.0,0.0,18.0,19.8,4,32,18.0,14.0,0.0,13.21,20.13,0,71,3.0,15.12,1.82,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,3,3,3,3,1,2,1,2,1,2,1,1,1,3,3,1,2,1,2,1,2,1,1,1,1,2,2,1,2,2,1,1,2,2,1,1,1,1,2,1,1,1,2,1,1,1,1,2,1,1
531168,50077,2013-04-28,2013-05-02,2452,1,86,5,59.90,91922,3,1969-11-27,2,2013-04-28,,4,2013-04-28_91922,43,0,6,3,4,5,28,2,17,18,2,119.8,59.90,2,0,2,0,1,1,2,0,2,0,1,1,1,1,2,0,1,1,1,1,1,1,1,1,1,2,2,1,2,1,1,59.90,0.00,59.90,59.90,68166.20,1138,59.90,0.00,59.90,0.00,59.9,59.9,119.8,2,59.9,0.00,64.66,20.40,0.0,169.90,2941212.79,45487,69.90,15.02,0.00,0.0,0.0,0.00,0.00,0.00,4.76,-59.90,110.0,2452,1,5,18,23.27,21.49,0,71,18.0,19.07,9.93,15.81,0,71,4.0,10.74,12.33,17.43,0,71,4.0,13.20,9.71,15.53,0,71,4.0,10.49,4.00,0.00,4,4,4.0,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.16,14.02,0,71,3.0,8.72,2.91,10.26,16.43,0,71,4.0,11.23,2.37,9.99,15.91,0,71,4.0,10.75,2.44,11.16,16.40,1,71,4.0,11.49,2.27,10.74,16.72,1,71,3.0,11.73,2.26,10.96,16.93,0,71,3.0,12.28,2.09,24.65,20.29,1,71,26.0,17.92,0.41,26.76,22.19,0,71,27.0,19.57,0.32,24.67,23.02,0,71,18.0,20.57,0.54,4.00,0.00,4,4,4.0,0.00,0.00,11.71,16.27,0,71,4.0,12.30,1.90,11.16,16.20,0,71,4.0,11.90,2.02,11.06,16.15,0,71,4.0,11.93,2.04,4.00,0.00,4,4,4.0,0.00,0.00,9.93,16.05,0,71,4.0,10.93,2.39,11.97,18.67,0,71,3.0,13.63,2.02,4.00,0.00,4,4,4.0,0.00,0.0,10.60,16.57,0,71,3.0,11.96,2.12,4.00,0.00,4,4,4.0,0.00,0.00,4.0,0.0,4,4,4.0,0.0,0.0,24.59,17.83,1,71,28.5,15.56,0.02,24.95,20.81,1,71,25.0,18.49,0.34,4.0,0.0,4,4,4.0,0.0,0.0,28.35,24.35,1,71,25.0,21.15,0.45,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,12.06,16.66,0,71,4.0,12.80,1.83,10.68,15.26,0,71,4.0,11.46,1.99,4.0,0.0,4,4,4.0,0.0,0.0,9.48,14.73,0,71,3.0,10.24,2.39,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,11.98,18.69,0,71,3.0,13.77,2.01,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,4.0,0.0,4,4,4.0,0.0,0.0,2,2,2,2,1,2,1,2,1,2,1,1,1,2,2,1,2,1,2,1,2,1,1,1,1,1,2,1,1,2,1,1,1,2,1,1,1,1,2,1,1,1,2,1,1,1,1,2,1,1


Overall, we managed to create a data frame with 467 columns on which our final model will be trained.

In [34]:
# Instantiate the project's Model_class helper (imported from Model_functions) on the
# feature-engineered data frame; the split, encoding and training cells all go through this object.
Model = Model_class(df)

In [9]:
# Split the data into train, test and validation sets.
# NOTE(review): the meaning of the positional `False` flag and of the fourth return value `k`
# is defined in Model_class.split_data and is not visible in this notebook — confirm before changing.
df_train,df_test, df_valid,k = Model.split_data(False)

In [76]:
# Encode the categorical variables using leave-one-out encoding.
# The encoded frames are consumed by the XGBoost cell; the CatBoost cells receive the
# unencoded df_train/df_test together with cat_cols instead.
# NOTE(review): 0.05 and True are passed positionally; their meaning is defined in
# Model_class.LOE_Encoder (presumably a noise/smoothing level and a flag) — verify there.
df_train_encoded,df_test_encoded,encoder = Model.LOE_Encoder(df_train,df_test, cat_cols,0.05,True)

In [88]:
# CatBoost on the CPU: trained on the unencoded frames, with cat_cols handed to the helper.
# NOTE(review): the positional `1` is interpreted inside Model_class.catboost — confirm its meaning there.
catboost_params_cpu = {
    'iterations': 175,
    'learning_rate': 0.11,
    'depth': 9,
    'loss_function': 'Logloss',
    'random_seed': 42,
    "l2_leaf_reg": 15,
    "max_ctr_complexity": 3,
}
(
    catboost_model_cpu,
    catboost_Y_pred_cpu,
    catboost_mae_cpu,
    catboost_Y_pred_proba_cpu,
) = Model.catboost(df_train, df_test, catboost_params_cpu, 1, cat_cols)
print("Catboost_cpu MAE on test set:", catboost_mae_cpu)

Catboost_cpu MAE on test set: 0.303


In [20]:
# Stack the train and test rows (row-wise, the concat default) into the frame used for the final fit
df_train_whole = pd.concat([df_train, df_test])

In [90]:
# CatBoost on the GPU: same data and helper call as the CPU run, but with task_type "GPU"
# and depth 10 (the CPU run uses depth 9), so the two scores are not a like-for-like comparison.
# NOTE(review): the positional `1` is interpreted inside Model_class.catboost — confirm its meaning there.
catboost_params_gpu = {
    'iterations': 175,
    'learning_rate': 0.11,
    'depth': 10,
    'loss_function': 'Logloss',
    'random_seed': 42,
    "task_type": "GPU",
    "l2_leaf_reg": 15,
    "max_ctr_complexity": 3,
}
(
    catboost_model_gpu,
    catboost_Y_pred_gpu,
    catboost_mae_gpu,
    catboost_Y_pred_proba_gpu,
) = Model.catboost(df_train, df_test, catboost_params_gpu, 1, cat_cols)
print("Catboost_gpu MAE on test set:", catboost_mae_gpu)

Catboost_gpu MAE on test set: 0.3025


In [91]:
# XGBoost baseline, trained on the leave-one-out encoded frames (df_train_encoded / df_test_encoded).
# The original literal listed the "objective" key twice; a repeated key in a dict display is
# silently collapsed to one entry, so dropping the duplicate leaves the parameters unchanged.
# NOTE(review): the positional `1` is interpreted inside Model_class.xgboost — confirm its meaning there.
xgboost_params = {
    'max_depth': 9,
    'learning_rate': 0.15,
    'n_estimators': 50,
    'objective': 'binary:logistic',
    'random_state': 42,
}
xgb_model, xgb_Y_pred, xgb_mae, xgb_Y_pred_proba = Model.xgboost(df_train_encoded,df_test_encoded,xgboost_params,1)
print("Xgboost MAE on test set:", xgb_mae)

Xgboost MAE on test set: 0.3121


In [78]:
# Collect the test-set MAE of each model into a two-column table (Model, MAE_4), best model first.
mae_dict_1 = {
    "Catboost_native_CPU": catboost_mae_cpu,
    "Catboost_native_GPU": catboost_mae_gpu,
    "Xgboost": xgb_mae,
}
model_results_1 = (
    pd.DataFrame(mae_dict_1, index=["MAE_4"])
    # one row, one column per model: order the columns by their MAE
    .sort_values(by="MAE_4", axis=1, ascending=True)
    # flip to one row per model and expose the model name as a regular column
    .transpose()
    .rename_axis(index="Model")
    .reset_index()
)
model_results_1

Unnamed: 0,Model,MAE_4
0,Catboost_native_GPU,0.3025
1,Catboost_native_CPU,0.303
2,Xgboost,0.3121


In [79]:
# Load the previously saved model scores (column MAE_3) so they can be compared with
# the MAE_4 scores obtained in this notebook.
# NOTE(review): presumably written by the preceding modelling notebook — confirm the file's provenance.
model_performance = pd.read_csv("Model_Performances/Model_performance_1.csv")
model_performance

Unnamed: 0,Model,MAE_3
0,Catboost_native_GPU,0.3191
1,Catboost_native_CPU,0.3196
2,Xgboost,0.3258
3,Lightgbm_native,0.3301


In [80]:
# Put the earlier scores (MAE_3) next to this notebook's scores (MAE_4) for every model
# trained here; MAE_Improvement is MAE_3 - MAE_4, so a positive value means the error went down.
model_results_whole = (
    model_results_1
    .merge(model_performance, on="Model", how="left")
    .assign(MAE_Improvement=lambda scores: scores["MAE_3"] - scores["MAE_4"])
    .loc[:, ["Model", "MAE_3", "MAE_4", "MAE_Improvement"]]
)
model_results_whole

Unnamed: 0,Model,MAE_3,MAE_4,MAE_Improvement
0,Catboost_native_GPU,0.3191,0.3025,0.0166
1,Catboost_native_CPU,0.3196,0.303,0.0166
2,Xgboost,0.3258,0.3121,0.0137


Through the feature engineering in this notebook, we managed to improve the performance of our **Catboost** models on the test set, reducing their MAE by **0.0166**, and the performance of the XGBoost model, reducing its MAE by **0.0137**. The GPU variant of the Catboost algorithm is the best-performing model on the test set with an MAE of **0.3025**. Therefore, we will be using this model for the final predictions.

In [81]:
# Final run: train on train + test combined (df_train_whole) and predict on the validation set (df_valid).
# Uses the GPU settings from the model comparison, but with 200 instead of 175 iterations.
# NOTE: this cell rebinds catboost_params_gpu, catboost_model_gpu, catboost_Y_pred_gpu,
# catboost_mae_gpu and catboost_Y_pred_proba_gpu. The comparison table built from
# catboost_mae_gpu must therefore be computed before this cell is run.
catboost_params_gpu = {'iterations': 200, 'learning_rate': 0.11, 'depth': 10, 'loss_function': 'Logloss', 'random_seed': 42,"task_type":"GPU","l2_leaf_reg": 15,"max_ctr_complexity" : 3}
catboost_model_gpu, catboost_Y_pred_gpu, catboost_mae_gpu, catboost_Y_pred_proba_gpu = Model.catboost(df_train_whole,df_valid,catboost_params_gpu,1,cat_cols)
# The evaluation frame is df_valid, so this score is a validation-set MAE, not a test-set MAE.
print("Catboost_gpu MAE on validation set:", catboost_mae_gpu)

0:	learn: 0.6699331	test: 0.6727814	best: 0.6727814 (0)	total: 1.35s	remaining: 4m 27s
1:	learn: 0.6513266	test: 0.6563626	best: 0.6563626 (1)	total: 2.78s	remaining: 4m 35s
2:	learn: 0.6369605	test: 0.6436451	best: 0.6436451 (2)	total: 4.17s	remaining: 4m 34s
3:	learn: 0.6258520	test: 0.6342483	best: 0.6342483 (3)	total: 5.53s	remaining: 4m 30s
4:	learn: 0.6164678	test: 0.6258636	best: 0.6258636 (4)	total: 6.88s	remaining: 4m 28s
5:	learn: 0.6086105	test: 0.6194124	best: 0.6194124 (5)	total: 8.1s	remaining: 4m 21s
6:	learn: 0.6022613	test: 0.6148477	best: 0.6148477 (6)	total: 9.4s	remaining: 4m 19s
7:	learn: 0.5969062	test: 0.6105342	best: 0.6105342 (7)	total: 10.7s	remaining: 4m 16s
8:	learn: 0.5923347	test: 0.6070752	best: 0.6070752 (8)	total: 11.9s	remaining: 4m 12s
9:	learn: 0.5876848	test: 0.6030622	best: 0.6030622 (9)	total: 13.2s	remaining: 4m 11s
10:	learn: 0.5839301	test: 0.5996968	best: 0.5996968 (10)	total: 14.5s	remaining: 4m 8s
11:	learn: 0.5806156	test: 0.5967347	best: 0

In [87]:
# Convert the validation MAE into the competition score: MAE multiplied by the number of
# scored items, i.e. the summed absolute error.
# 50078 is the hard-coded item count; NOTE(review): presumably len(df_valid) — confirm.
# Round before converting: a bare int() truncates, so a product that lands just below a
# whole number through floating-point error (e.g. 0.29 * 100 == 28.999999999999996)
# would silently lose one point.
score = int(round(catboost_mae_gpu * 50078))
print("Points:",score)

Points: 14893


The model performed quite well on the validation set, generating **14893 points** under the scoring metric defined by the Data Mining Cup 2014 and placing us in the top 4 of the leaderboard. The Catboost GPU model proves to be quite capable of predicting the return of an item. Furthermore, it trains relatively fast on the GPU, copes with the large number of features we created, and handles the categorical variables natively, so we do not have to encode them ourselves as part of the data preprocessing.