# WAR OF BOOSTING DECISION TREES
In this notebook we will se amazing fight between superb machine learning algorythms.
To do this we will need some tools.

In [1]:
import pandas as pd
import numpy as np 

import random

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier 
import tensorflow as tf
from sklearn.preprocessing import normalize

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
import seaborn as sns
from sklearn.model_selection import StratifiedKFold

## Data preperation

In [3]:
data = pd.read_csv("../../australia.csv")
data.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,0,0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,0,0
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,0,0
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,0,0


It would make sense to conver this through logarythmic scale.

In [None]:
sns.boxplot(data.RainTomorrow, np.log(data.Rainfall+1))
data.Rainfall = np.log(data.Rainfall + 1)

In [46]:
np.unique(data.RainTomorrow)

array([0, 1])

In [47]:
data.shape

(56420, 18)

In [7]:
random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(data.drop("RainTomorrow", axis =1), data.RainTomorrow, test_size = 0.2)

In [49]:
data.RainTomorrow.sum()/data.RainTomorrow.size            # 78 percent of data is one class

0.22025877348457992

For our classifiers we have to establish level playing field. It will be this function. Gets 5 fold crossvalidation of model on training set. 

In [6]:
from sklearn.model_selection import KFold
def model_auc(X,y, model, n_iter = 5): 
    """
    5 fold crossvalidation
    repeated n_iter times
    
    returns mean auc for predictions
    """
    
    out = []
    for i in range(n_iter):
        kf = KFold(n_splits=5, shuffle= True)
        for train_index, test_index in kf.split(X):
            print('started training')
            X_train_, X_cv = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train_, y_cv = y.iloc[train_index], y.iloc[test_index]
            
            model.fit(X_train_, y_train_)
            
            y_prob_ = model.predict_proba(X_cv)[:,1]
            fpr, tpr, _ = roc_curve(y_cv, y_prob_)
            out.append(auc(fpr,tpr))
            
            print('ended training')
    return np.array(out).mean()

## Contestant 1 - LightGBM
Powerful model. Featherweight. It is said that it grows leaf-wise. 

### Base 
How deafult parameters of model are performing. We will see here and in upcoming models how tuning changes `auc`.

In [135]:
lgb_model_base = LGBMClassifier(metric = "auc")

In [136]:
model_auc(X_train, y_train, lgb_model, n_iter = 1)

started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training


0.893259889258849

The score is quite good but I bet we could do better

### Hiperparameter tuning

In hiperparameter tuning i will do it with BayesSearchCV, instead of grid, which is very  

In [11]:
lgb_model = LGBMClassifier(metric = "auc", objective="binary")

skf = StratifiedKFold(n_splits=5,random_state=22,shuffle = True) # 5 fold crossvalidation 

In [166]:
param_grid_lgb = {
                  "boosting_type": ["gbdt",'dart'], 
                  "learning_rate": [0.001,0.01,0.07,0.1,0.5,1], 
                  "num_leaves" : [5,15,25,50,100], 
                  "colsample_bytree" : [0.1,0.3, 0.5,0.7,1],
                  "max_depth" : [-1, 5, 10,15, 100], 
                  'min_child_samples' : [0,3,6,9,15,25,40]
}

In [167]:
searchcv = RandomizedSearchCV(lgb_model,
                        param_grid_lgb,
                        cv =skf,
                        n_iter=100,
                        n_jobs = -1
                       ) 

In [168]:
searchcv.fit(X_train,y_train) 

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=22, shuffle=True),
                   error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.1, max_depth=-1,
                                            metric='auc', min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=3...
                   iid='deprecated', n_iter=100, n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt', 'dart'],
                                        'colsample_bytree': [

Best parameters are:

In [170]:
searchcv.best_params_ , searchcv.best_score_

({'num_leaves': 50,
  'min_child_samples': 25,
  'max_depth': 15,
  'learning_rate': 0.1,
  'colsample_bytree': 1,
  'boosting_type': 'gbdt'},
 0.8592696802518158)

In [37]:
lgb_model = LGBMClassifier(learning_rate = 0.1,
                           colsample_bytree = 1,
                           boosting_type = "gbdt",
                           n_estimators = 200,
                           max_depth = 15, 
                          num_leaves = 50, 
                          metric = "auc", 
                          num_iterations = 100)

In [172]:
model_auc(X_train, y_train, lgb_model, n_iter = 2)

started training




ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training


0.8939860080046683

Better than base by 0.0007. Very marginal difference.

## Contestant 2 - Xgboost 
Another powerful contestant. Thinks that leaf-wise grow is stupid and prefers conservative level-wise growth. Called Extreme by some.

### Base

In [243]:
xgb_model = XGBClassifier(metric = "auc")

In [245]:
model_auc(X_train, y_train, xgb_model,n_iter =1)

started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training


0.891290408877525

By dafult little worse than LGBoost.

### Hiperparameter tuning

In [188]:
param_grid_xgb = {"max_depth":np.arange(4,100),
                  "learning_rate":np.arange(0.001,1,0.1),
                  "booster" : ["dart","gbtree"],
                  "colsample_bylevel" : np.arange(0.1,1,0.1),
                  "reg_alpha":np.arange(0,10,0.1),
                  "reg_lambda":np.arange(0,10,0.1)
                  }

In [193]:
searchcv = RandomizedSearchCV(xgb_model,
                        param_grid_xgb,
                        cv =skf,
                        n_iter=50,
                        n_jobs = -1
                       ) 

In [194]:
searchcv.fit(X_train,y_train) 

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=22, shuffle=True),
                   error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster=None,
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           metric='auc', min_chi...
       2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8,
       3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1,
       5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4,
       6.5, 6.6, 6.7, 6.8, 6.9,

In [195]:
searchcv.best_params_ , searchcv.best_score_

({'reg_lambda': 8.8,
  'reg_alpha': 4.1000000000000005,
  'max_depth': 99,
  'learning_rate': 0.101,
  'colsample_bylevel': 0.4,
  'booster': 'gbtree'},
 0.8624821910971658)

In [36]:
xgb_model = XGBClassifier(metric = "auc",
                         booster = 'gbtree',
                         colsample_bylevel = 0.4,
                         learning_rate = 0.1,
                         max_depth = 99, 
                         reg_alpha = 4.1, 
                         reg_lambda = 8.8, 
                         n_iter = 1000)


In [247]:
model_auc(X_train, y_train, xgb_model,n_iter =1)

started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training


0.8965226699273348

A bit better! By 0.05

## Contestant 3 - Catboost
Russian cousin, the weird one. Likes categories, is big, fast and strong!

### Base

In [4]:
cat_model = CatBoostClassifier(eval_metric='AUC', iterations = 100, verbose = 0)


In [8]:
model_auc(X_train, y_train, cat_model,n_iter =1)

started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training


0.8933666263335297

### Hiperparameter tuning

In [18]:
param_grid_cat = {
                  "max_depth":np.arange(2,16),
                  "l2_leaf_reg": [0.001, 0.1,0.5,1,5,10,100],
                  "use_best_model": [True, False],
                  "num_trees": [100,300,500,1000],
                  "min_data_in_leaf": [1,3,5,8,14,25,50,80]
                    }


In [19]:
searchcv = RandomizedSearchCV(cat_model, param_grid_cat, cv = skf) # 5 fold crossvalidation 

In [None]:
searchcv.fit(X_train, y_train)

In [211]:
searchcv.best_params_ , searchcv.best_score_

({'use_best_model': False,
  'num_trees': 1000,
  'min_data_in_leaf': 50,
  'max_depth': 9,
  'l2_leaf_reg': 1},
 0.8639887407143018)

In [35]:
cat_model = CatBoostClassifier(eval_metric='AUC',
                               num_trees = 1000 ,
                              l2_leaf_reg=1,
                              max_depth=9,
                              min_data_in_leaf=50,
                              verbose=0)


In [11]:
model_auc(X_train, y_train, cat_model,n_iter =1)

started training
ended training
started training
ended training
started training
ended training
started training
ended training
started training
ended training


0.8985058119211505

Better by 0.008

## Results of fight

Now let's see how well our models perform on `test` data.

Let them fit to whole data, and then we will test them on test set, that they never seen before.

In [60]:
lgb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)


<catboost.core.CatBoostClassifier at 0x7f9728de8a50>

In [61]:
y_pred_lgb = lgb_model.predict(X_test)
y_prob_lgb = lgb_model.predict_proba(X_test)[:,1]

y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:,1]

y_pred_cat = cat_model.predict(X_test)
y_prob_cat = cat_model.predict_proba(X_test)[:,1]

In [62]:
print("---------------LGB------------")
lgb = classification_report(y_test,y_pred_lgb, digits = 3)
print(lgb)
print("---------------XGB------------")
xgb = classification_report(y_test,y_pred_xgb, digits = 3)
print(xgb)
print("---------------CAT------------")
cat = classification_report(y_test,y_pred_cat, digits = 3)
print(cat)


---------------LGB------------
              precision    recall  f1-score   support

           0      0.884     0.945     0.913      8816
           1      0.740     0.556     0.635      2468

    accuracy                          0.860     11284
   macro avg      0.812     0.751     0.774     11284
weighted avg      0.852     0.860     0.852     11284

---------------XGB------------
              precision    recall  f1-score   support

           0      0.885     0.948     0.915      8816
           1      0.750     0.561     0.642      2468

    accuracy                          0.863     11284
   macro avg      0.818     0.754     0.779     11284
weighted avg      0.856     0.863     0.856     11284

---------------CAT------------
              precision    recall  f1-score   support

           0      0.883     0.948     0.915      8816
           1      0.749     0.553     0.636      2468

    accuracy                          0.862     11284
   macro avg      0.816     0.750  

As we can see Models are very similar in terms of output. Judging on `acc` and `f1`
score the winner is CAT and XGB has the second place. Lets see how well our `AUC` is. 

In [63]:
fpr_l, tpr_l, _ = roc_curve(y_test, y_prob_lgb)
lgb_auc = auc(fpr_l,tpr_l)

fpr_x, tpr_x, _ = roc_curve(y_test, y_prob_xgb)
xgb_auc = auc(fpr_x,tpr_x)

fpr_c,tpr_c, _  = roc_curve(y_test, y_prob_cat) 
cat_auc = auc(fpr_c,tpr_c)

print(f'LGB\'s AUC : {lgb_auc} \nXGB\'s AUC : {xgb_auc} \nCAT\'s AUC : {cat_auc} \n   ')

LGB's AUC : 0.8955586590021971 
XGB's AUC : 0.8973239038641987 
CAT's AUC : 0.8996378233034383 
   


**So the winner is CAT!!!** 

...but it is really sad we didn't surpass 0.9 `auc` :(

So let's try just another thing.
So what if we use **All** our models? Let's use mean of their predictions.

In [66]:
fpr_new, tpr_new, _ = roc_curve(y_test, np.array(y_prob_lgb + y_prob_xgb + y_prob_cat )/3 )

In [67]:
auc(fpr_new,tpr_new)

0.9002051118196767

And we've got another winner! Team work is worth the struggle after all. Let's bury the hatchet and declare all models a winer. That is right there are no losers here.

### Wrap-up
All models were really good even without tuning. Some of them have parameters that fit to the data and control overfitting. I was not surprised that even with long tuning we had only marginally improved `auc`. At the end i did something which is called model ensamble. It basically is aggregating models predictions. In my case it was mean of predictions. 

# Extra: Regression

In [22]:
import category_encoders as ce

In [23]:
df = pd.read_csv("https://www.dropbox.com/s/360xhh2d9lnaek3/allegro-api-transactions.csv?dl=1")

In [24]:
df.head()

Unnamed: 0,lp,date,item_id,categories,pay_option_on_delivery,pay_option_transfer,seller,price,it_is_allegro_standard,it_quantity,it_is_brand_zone,it_seller_rating,it_location,main_category
0,0,2016-04-03 21:21:08,4753602474,"['Komputery', 'Dyski i napędy', 'Nośniki', 'No...",1,1,radzioch666,59.99,1,997,0,50177,Warszawa,Komputery
1,1,2016-04-03 15:35:26,4773181874,"['Odzież, Obuwie, Dodatki', 'Bielizna damska',...",1,1,InwestycjeNET,4.9,1,9288,0,12428,Warszawa,"Odzież, Obuwie, Dodatki"
2,2,2016-04-03 14:14:31,4781627074,"['Dom i Ogród', 'Budownictwo i Akcesoria', 'Śc...",1,1,otostyl_com,109.9,1,895,0,7389,Leszno,Dom i Ogród
3,3,2016-04-03 19:55:44,4783971474,"['Książki i Komiksy', 'Poradniki i albumy', 'Z...",1,1,Matfel1,18.5,0,971,0,15006,Wola Krzysztoporska,Książki i Komiksy
4,4,2016-04-03 18:05:54,4787908274,"['Odzież, Obuwie, Dodatki', 'Ślub i wesele', '...",1,1,PPHU_RICO,19.9,1,950,0,32975,BIAŁYSTOK,"Odzież, Obuwie, Dodatki"


Ip, date and item_id won't tell our model much, additionally let's delete categories and leve only main category

In [25]:
extra_data = df.iloc[:,4:]

In [26]:
extra_data.head()

Unnamed: 0,pay_option_on_delivery,pay_option_transfer,seller,price,it_is_allegro_standard,it_quantity,it_is_brand_zone,it_seller_rating,it_location,main_category
0,1,1,radzioch666,59.99,1,997,0,50177,Warszawa,Komputery
1,1,1,InwestycjeNET,4.9,1,9288,0,12428,Warszawa,"Odzież, Obuwie, Dodatki"
2,1,1,otostyl_com,109.9,1,895,0,7389,Leszno,Dom i Ogród
3,1,1,Matfel1,18.5,0,971,0,15006,Wola Krzysztoporska,Książki i Komiksy
4,1,1,PPHU_RICO,19.9,1,950,0,32975,BIAŁYSTOK,"Odzież, Obuwie, Dodatki"


In [27]:
extra_data.dtypes

pay_option_on_delivery      int64
pay_option_transfer         int64
seller                     object
price                     float64
it_is_allegro_standard      int64
it_quantity                 int64
it_is_brand_zone            int64
it_seller_rating            int64
it_location                object
main_category              object
dtype: object

###  Task 
We will try 3 diffrent encoders on data, and then we will take a look at the magic of catboost. Of course we will do it on chunk of data from obvious reasons. 

In [28]:
# taking individual categories winner from previous task.
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [29]:
extra_data = extra_data.sample(frac = 1) # shuffle 
extra_data = extra_data.iloc[0:20000,:] # take first 20.000

Doing encoders that don't need target, and doing it on deafult regressor settings

In [30]:
cols = ["it_location","main_category", "seller"]
oe = ce.one_hot.OrdinalEncoder(cols = cols) # ordinal 
be = ce.basen.BaseNEncoder(cols = cols , base= 10) # base n
oh = ce.one_hot.OneHotEncoder(cols = cols) # hashing

encoders = [oh, be, oe]

model_regression = CatBoostRegressor(verbose = 0)


In [31]:
r2 = []
rmse = []

for i in range(len(encoders)): 
    
    encoder = encoders[i]
    X_transformed =  encoder.fit_transform(extra_data.drop("price", axis =1))
    
    random.seed(42)
    X2_train, X2_test, y2_train, y2_test = train_test_split(X_transformed,extra_data.price,test_size = 0.2)
    

    model_regression.fit(X2_train, y2_train)
    y_enc_pred = model_regression.predict(X2_test)
    
    r2.append(r2_score(y2_test, y_enc_pred))
    rmse.append(mean_squared_error(y2_test, y_enc_pred, squared = False))

In [32]:
r2  # preety terrible

[0.08408948700708996, 0.05940363934444093, 0.0868072233698276]

In [33]:
rmse

[165.22537109903502, 231.28361780736512, 198.13421243387847]

In [34]:
print(f'R2: \n One-Hot: {r2[0]}, BaseN: {r2[1]}, Ordinal: {r2[2]} \nRMSE: \n One-Hot: {rmse[0]}, BaseN: {rmse[1]}, Ordinal: {rmse[2]}')

R2: 
 One-Hot: 0.08408948700708996, BaseN: 0.05940363934444093, Ordinal: 0.0868072233698276 
RMSE: 
 One-Hot: 165.22537109903502, BaseN: 231.28361780736512, Ordinal: 198.13421243387847


## Summary

Overall those encoders were really bad. Really close to random guesses in terms of `R2`. One hot encoder was better in `RMSE` and second after Ordinal in `R2` metrics. Target encoding would be much better here.  