# Searching for best models by tuning params

## Libraries

In [4]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
np.random.seed(123)

## Reading processed data (script 'data-preprocessing')

In [5]:
data = pd.read_csv('../processed_data/out.csv')

## Dividing into train and test

To divide set so in both parts with have simmilar amounts of big and small credits we need to put them into groups:  
(in traditional splits, randomization makes results appear very uneven)

In [6]:
from sklearn import preprocessing

x = data[['credit_amount']].values.astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
data['amount_groups'] = x_scaled
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
data['amount_groups'] = np.digitize(data['amount_groups'], bins)
unique, counts = np.unique(data['amount_groups'], return_counts=True)
dict(zip(unique, counts))

{1: 445, 2: 293, 3: 97, 4: 80, 5: 38, 6: 19, 7: 14, 8: 8, 9: 6}

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(['customer_type', 'amount_groups'], axis=1), data.customer_type, test_size=0.20, stratify = data[['amount_groups', 'customer_type']])

## Needed functions

### Encoding

In [8]:
import category_encoders as ce
class Error(Exception):
    pass
class NonMatchingLengthsError(Error):
    pass

In [9]:
def multiEnc(X_train, X_test, target_train, cols, encodings):
    """
    Lista znaków do "encodings":
        d - backward difference
        n - base N
        b - binary
        c - cat boost
        # - hashing
        h - helmert
        j - James-Stein
        l - leave one out
        m - m-estimate
        1 - one-hot
        o - ordinal
        p - polynomial
        s - sum coding
        t - target encoding
        w - weight of evidence
    """
    ce_map = {"d": ce.backward_difference.BackwardDifferenceEncoder,
             "n": ce.basen.BaseNEncoder,
             "b": ce.binary.BinaryEncoder,
             "c": ce.cat_boost.CatBoostEncoder,
             "#": ce.hashing.HashingEncoder,
             "h": ce.helmert.HelmertEncoder,
             "j": ce.james_stein.JamesSteinEncoder,
             "l": ce.leave_one_out.LeaveOneOutEncoder,
             "m": ce.m_estimate.MEstimateEncoder,
             "1": ce.one_hot.OneHotEncoder,
             "o": ce.ordinal.OrdinalEncoder,
             "p": ce.polynomial.PolynomialEncoder,
             "s": ce.sum_coding.SumEncoder,
             "t": ce.target_encoder.TargetEncoder,
             "w": ce.woe.WOEEncoder}
    try:
        if len(cols)!=len(encodings):
            raise(NonMatchingLengthsError)
    except NonMatchingLengthsError:
        print("Lengths do not match")
        return None
    e=0
    for c in cols:
        if X_train[c].dtypes=='object':
            enc=ce_map[encodings[e]](cols=c)
            enc=enc.fit(X_train, target_train)
            X_train=enc.transform(X_train)
            X_test=enc.transform(X_test)
        e=e+1
    return (X_train, X_test)

### Models creation

In [10]:
from abc import (ABC, 
                 abstractmethod, 
                 abstractproperty)
from typing import Any
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier # Inna paczka niż sklearn!
import xgboost as xgb


class Builder(ABC):

    @abstractproperty
    def product(self) -> None:
        pass
    
    
class Product():

    def __init__(self) -> None:
        self.parts = []

    def add(self, part: Any) -> None:
        self.parts.append(part)

    def list_parts(self):
        return self.parts


class ConcreteBuilder(Builder):

    def __init__(self) -> None:

        self.reset()

    def reset(self) -> None:
        self._product = Product()

    @property
    def product(self) -> Product:

        product = self._product
        self.reset()
        return product

    def _add_model(self, model_type: str, params: dict):
        if model_type == 'logistic regression':
            self._logistic_regression(params)
        if model_type == 'decision tree':
            self._decision_tree(params)
        if model_type == 'svm':
            self._svm(params)
        if model_type == 'naive bayes':
            self._naive_bayes(params)
        if model_type == 'random forest':
            self._random_forest(params)
        if model_type == 'ada boost':
            self._ada_boost(params)
        if model_type == 'gradient boost':
            self._gradient_boost(params)
        if model_type == 'xgboost':
            self._xgboost(params)
        
    def _logistic_regression(self, params: dict):
        
        lr = LogisticRegression(**params)
        
        return self._product.add(lr)
    
    def _decision_tree(self, params: dict):
        
        dt = DecisionTreeClassifier(**params)
        
        dt_params = {'criterion': dt.criterion}
        
        return self._product.add(dt)

    def _svm(self, params: dict):
        svm = SVC(**params)
        
        return self._product.add(svm)
    
    def _naive_bayes(self, params: dict):
        nb = GaussianNB(**params)

        return self._product.add(nb)
    
    def _random_forest(self, params: dict):
        
        rf = RandomForestClassifier(**params)
        return self._product.add(rf)
        
    def _ada_boost(self, params: dict):
        ada = AdaBoostClassifier(**params)
    
        return self._product.add(ada)
    
    def _gradient_boost(self, params: dict):
        gb= GradientBoostingClassifier(**params)
        return self._product.add(gb)
    
    def _xgboost(self, params: dict):
        
        xg=XGBClassifier(**params)
        return self._product.add(xg)

class Director:

    def __init__(self) -> None:
        self._builder = None

    @property
    def builder(self) -> Builder:
        return self._builder

    @builder.setter
    def builder(self, builder: Builder) -> None:

        self._builder = builder


    def add_model(self, model_type, params):
        return self.builder._add_model(model_type, params)

    def add_all_models(self):
        self.add_model('logistic regression', {})
        self.add_model('decision tree', {})
        self.add_model('svm', {})
        self.add_model('naive bayes', {})
        self.add_model('random forest', {})
        self.add_model('ada boost', {})
        self.add_model('gradient boost', {})
        self.add_model('xgboost', {})
        return builder.product.list_parts()
    
    def get_all_models(self, metric_name: str = ''):
        parts =  builder.product.list_parts()
        return parts

In [11]:
director = Director()
builder = ConcreteBuilder()
director.builder = builder

### Business metrics to rate models (simmilar to F1, but more focused on earnings for the bank)

In [12]:
ir_loan = 0.13
lgd = 0.38

In [13]:
def calculateEarningsLosses(X_test, y_pred, y_test):
    '''
    As declared, takes test data and predicted classes and calculates:
    - earnings made by following prediction
    - losses made by following prediction
    - earnings omited by following prediction
    - losses omited by following prediction
    '''
    amounts = X_test['credit_amount']
    balance_all = y_test.apply(lambda x: ir_loan if x==1 else -lgd) * amounts
    earnings_made = balance_all.iloc[np.logical_and(y_test==1, y_pred==1).array].sum()
    earnings_omitted = balance_all.iloc[np.logical_and(y_test==1, y_pred==0).array].sum()
    losses_made = balance_all.iloc[np.logical_and(y_test==0, y_pred==1).array].sum()
    losses_omitted = balance_all.iloc[np.logical_and(y_test==0, y_pred==0).array].sum()
    
    results = pd.DataFrame(columns=['Earnings made', 'Earnings omitted', 'Losses made', 'Losses omitted'])
    results.loc[0] = [earnings_made, earnings_omitted, losses_made, losses_omitted]
    
    final_balance = earnings_made + losses_made
    max_income = balance_all.iloc[(y_test==1).array].sum()
    perc_of_max_income = final_balance/max_income
    return (results, final_balance, perc_of_max_income)

## Search for best type of encoding

In [14]:
columns_enc = ['checking_account_status', 'credit_history', 'purpose', 'savings', 'present_employment', 
              'personal', 'other_debtors', 'property', 'other_installment_plans', 
              'housing', 'job', 'telephone']
def encoding_list_gen(nominal, ordinal):
    enc = [ordinal, nominal, nominal, ordinal, nominal, nominal, nominal, nominal, nominal, nominal, nominal, nominal]
    return enc
# we group columns as ordinal and nominal to encode them with the same type
#enc = ['j', '1', '1', 'j', '1', '1', '1', '1', '1', '1', '1', '1']

In [15]:
X_test.columns

Index(['checking_account_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings', 'present_employment', 'installment_rate',
       'personal', 'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'existing_credits', 'job',
       'dependents', 'telephone', 'foreign_worker', 'sex',
       'checking_account_exists', 'savings_account_exists'],
      dtype='object')

Previous attempts have shown, that XGB, GradientBoosting, RandomForest and SVM are the classifiers to aim at. In following cells we will evaluate those three algorithms with different encoding types. Because most of them are tree based, OneHot will be used only to test SVM.

### Testing function

In [16]:
from typing import List
from sklearn.metrics import f1_score

def compare_models(models_list: List, X_train, y_train, X_test, y_test, categorical_variables, encoding_list):
    results = dict()
    
    df_train, df_test = multiEnc(X_train, X_test, y_train, categorical_variables, encoding_list)
    
    
    for model in models_list:
        
        training = model.fit(df_train, y_train)
        score = training.score(df_test, y_test)
        f1 = f1_score(y_test, model.predict(df_test))
        _, _, business = calculateEarningsLosses(X_test, model.predict(df_test), y_test)
        results[model] = (['score:', score], ['f1:', f1], ['business:', business])
        
    return results

In [17]:
def compare_encoders(model, X_train, y_train, X_test, y_test, columns_enc, enc_nominal, enc_ordinal):
    results = pd.DataFrame(columns = enc_nominal, index = enc_ordinal)
    results_f1 = pd.DataFrame(columns = enc_nominal, index = enc_ordinal)
    results_bus = pd.DataFrame(columns = enc_nominal, index = enc_ordinal)
    
    for nom in enc_nominal:
        for ordi in enc_ordinal:
            encoding_list = encoding_list_gen(nom, ordi)
            df_train, df_test = multiEnc(X_train, X_test, y_train, columns_enc, encoding_list)
            training = model.fit(df_train, y_train)
            score = training.score(df_test, y_test)
            f1 = f1_score(y_test, model.predict(df_test))
            _, _, business = calculateEarningsLosses(X_test, model.predict(df_test), y_test)
            
            results.loc[ordi, nom] = score
            results_f1.loc[ordi, nom] = f1
            results_bus.loc[ordi, nom] = business
    
    return (results, results_f1, results_bus)

In [18]:
director.add_model('gradient boost', {})
director.add_model('xgboost', {})
director.add_model('random forest', {})
director.add_model('svm', {})

In [19]:
models = director.get_all_models()

In [20]:
enc_nominal = ['l', 'j', 'm'] # encoding types to check for nominal values
enc_ordinal = ['p', 'h', 'd', 'l', 'j'] # encoding types to check for ordinal values
# we don't check any encodings leading to information loss

### Gradient boosting encoders fit

In [21]:
(score, f1, buss) = compare_encoders(models[0], X_train, y_train, X_test, y_test, columns_enc, enc_nominal, enc_ordinal)

In [22]:
score

Unnamed: 0,l,j,m
p,0.76,0.76,0.76
h,0.745,0.745,0.745
d,0.77,0.77,0.775
l,0.75,0.75,0.75
j,0.75,0.75,0.75


In [23]:
f1

Unnamed: 0,l,j,m
p,0.833333,0.833333,0.833333
h,0.8223,0.8223,0.8223
d,0.840278,0.840278,0.843206
l,0.827586,0.827586,0.827586
j,0.827586,0.827586,0.827586


In [24]:
buss

Unnamed: 0,l,j,m
p,0.205109,0.205109,0.205109
h,0.12244,0.12244,0.12244
d,0.109999,0.109999,0.188288
l,0.0454018,0.0454018,0.0454018
j,0.0454018,0.0454018,0.0454018


### XGB encoders fit

In [25]:
(score, f1, buss) = compare_encoders(models[1], X_train, y_train, X_test, y_test, columns_enc, enc_nominal, enc_ordinal = ['l', 'j'])

XGB has some issues with ordinal encoders so we omit them

In [26]:
score

Unnamed: 0,l,j,m
l,0.71,0.71,0.71
j,0.71,0.71,0.71


In [27]:
f1

Unnamed: 0,l,j,m
l,0.8,0.8,0.8
j,0.8,0.8,0.8


In [28]:
buss

Unnamed: 0,l,j,m
l,-0.30158,-0.30158,-0.30158
j,-0.30158,-0.30158,-0.30158


### Random Forest encoders fit

In [29]:
(score, f1, buss) = compare_encoders(models[2], X_train, y_train, X_test, y_test, columns_enc, enc_nominal, enc_ordinal)

In [30]:
score

Unnamed: 0,l,j,m
p,0.745,0.7,0.72
h,0.74,0.71,0.745
d,0.755,0.76,0.73
l,0.71,0.71,0.695
j,0.71,0.715,0.7


In [31]:
f1

Unnamed: 0,l,j,m
p,0.8223,0.79021,0.808219
h,0.811594,0.8,0.819788
d,0.825623,0.828571,0.804348
l,0.794326,0.789855,0.781362
j,0.797203,0.798587,0.782609


In [32]:
buss

Unnamed: 0,l,j,m
p,-0.0810373,-0.0810193,-0.0630276
h,0.228108,-0.0919704,0.0964536
d,0.344288,0.1476,0.0629035
l,-0.110592,0.0216118,0.0473739
j,-0.115509,-0.163645,0.00374003


### SVM encoders fit

In [33]:
enc_nominal.append('o')

In [34]:
(score, f1, buss) = compare_encoders(models[3], X_train, y_train, X_test, y_test, columns_enc, enc_nominal, enc_ordinal)

In [35]:
score

Unnamed: 0,l,j,m,o
p,0.685,0.685,0.685,0.69
h,0.69,0.69,0.69,0.695
d,0.685,0.685,0.685,0.69
l,0.695,0.695,0.695,0.695
j,0.695,0.695,0.695,0.695


In [36]:
f1

Unnamed: 0,l,j,m,o
p,0.813056,0.813056,0.813056,0.816568
h,0.816568,0.816568,0.816568,0.820059
d,0.813056,0.813056,0.813056,0.816568
l,0.820059,0.820059,0.820059,0.820059
j,0.820059,0.820059,0.820059,0.820059


In [37]:
buss

Unnamed: 0,l,j,m,o
p,-0.758967,-0.758967,-0.758967,-0.752538
h,-0.757002,-0.757002,-0.757002,-0.750573
d,-0.758967,-0.758967,-0.758967,-0.752538
l,-0.750573,-0.750573,-0.750573,-0.750573
j,-0.750573,-0.750573,-0.750573,-0.750573


### Results

According to score and f1 score all of the algorithms perform well. However most important aspect for potenital client is how much money it would make them. In that case only two algorithms have some potential in improving which are:  
**Gradient boosting** with encoding: ordinal: **polynominal** encoding, nominal: **James-Stein** (all performed the same, I just picked that one)  
**Random Forest** with encoding: ordinal: **backward difference**, nominal: **m-estimate**  
Rest algorithms present very low income levels or even lossess.

## Tuning parameters for selected algorithms

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

Just scoring for bussiness

In [39]:
def bussScore(clf, X, y_true):
    amounts = X['credit_amount']
    y_pred = clf.predict(X)
    balance_all = y_true.apply(lambda x: ir_loan if x==1 else -lgd) * amounts
    earnings_made = balance_all.iloc[np.logical_and(y_true==1, y_pred==1).array].sum()
    losses_made = balance_all.iloc[np.logical_and(y_true==0, y_pred==1).array].sum()
    
    final_balance = earnings_made + losses_made
    max_income = balance_all.iloc[(y_true==1).array].sum()
    perc_of_max_income = final_balance/max_income
    return perc_of_max_income

Setting encodings

In [40]:
gboost_enc = encoding_list_gen('j', 'p')
rforest_enc = encoding_list_gen('m', 'b')

### Gradient boosting tuning

In [41]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.3, 0.4, 0.5],
    "min_samples_split": [0.05, 0.06, 0.07],
    "min_samples_leaf": [0.09, 0.1, 0.12],
    "max_depth":[8],
    "max_features":["sqrt"],
    "criterion": ["mae"],
    "subsample":[0.9, 1.0],
    "n_estimators":[100, 200]
    }

In [42]:
df_train, df_test = multiEnc(X_train, X_test, y_train, columns_enc, gboost_enc)

In [43]:
grid = GridSearchCV(estimator=models[0], param_grid = parameters, scoring = bussScore, cv=4, n_jobs=-1)
grid_result = grid.fit(df_train, y_train)

In [44]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.190235 using {'criterion': 'mae', 'learning_rate': 0.3, 'loss': 'deviance', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 0.09, 'min_samples_split': 0.05, 'n_estimators': 100, 'subsample': 0.9}


In [45]:
best_model_gboost = grid_result.best_estimator_
best_model_gboost.score(df_test, y_test)

0.765

In [46]:
f1 = f1_score(y_test, best_model_gboost.predict(df_test))
f1

0.8373702422145329

Tuning:  
loss - deviance  
learning rate - increasing from 0.1 stopped at 0.5  
min_samples_split - increasing from 0.01 stopped at 0.05  
min_samples_leaf - increasing from 0.01 stopped at 0.09  
max_depth - after first try went to 8 and stopped there  
subsample - sometimes 0.9 sometimes 1.0 stopped at 0.9  
n_estimators - increased till 100  

**Bussiness Score: 0.210441**  
**Score: 0.76**  
**f1 score: 0.83217**

### Random Forest tuning

In [51]:
parameters = {
    "min_samples_split": [0.002, 0.003, 0.005],
    "min_samples_leaf": [0.001, 0.002, 0.003],
    "max_depth":[9, 10, 11],
    "max_features":["sqrt"],
    "criterion": ["gini"],
    "n_estimators":[400, 500]
}

In [52]:
df_train, df_test = multiEnc(X_train, X_test, y_train, columns_enc, rforest_enc)

In [53]:
grid = GridSearchCV(estimator=models[2], param_grid = parameters, scoring = bussScore, cv=4, n_jobs=-1)
grid_result = grid.fit(df_train, y_train)

In [54]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.112480 using {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 0.001, 'min_samples_split': 0.003, 'n_estimators': 400}


In [55]:
best_model_rforest = grid_result.best_estimator_
best_model_rforest.score(df_test, y_test)

0.75

In [56]:
f1 = f1_score(y_test, best_model_rforest.predict(df_test))
f1

0.8333333333333334

min_samples_split - went all the way down but not below 0.002  
min_samples_leaf - really low, betweeen 0.001-0.003
max_depth - best results at 10, sometimes 11
n_estimators - 500 was the best from evaluated
ccp_alpha - stayed at 0 at all times

Further trials weren't needed. Score sometimes goes slightly up but won't achieve gradient boost's results

## Saving results

In [57]:
def save_final_data(df_train, y_train, df_test, y_test):
    df_train.to_csv('../final_data/df_train.csv', index=False)
    df_test.to_csv('../final_data/df_test.csv', index=False)
    y_test.to_csv('../final_data/y_test.csv', index=False)
    y_train.to_csv('../final_data/y_train.csv', index=False)

save_final_data(df_train, y_train, df_test, y_test)

In [58]:
import pickle

def save_model(model, filename: str):
    pickle.dump(model, open('../models/' + str(filename) + '.sav', 'wb'))

In [59]:
save_model(best_model_gboost, 'gradient_boost_best')

In [60]:
save_model(best_model_rforest, 'random_forest_best')