In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import warnings
warnings.simplefilter(action='ignore')

## Data

### Preprocessing

In [2]:
def preprocessing(data):
    """Convert a raw loan table into the numeric feature frame used for modelling.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Name, City, Bank, BankState, BalanceGross,
        DisbursementGross, State, NAICS, ApprovalFY, NewExist, FranchiseCode,
        UrbanRural, RevLineCr, LowDoc, ApprovalDate, DisbursementDate, GrAppv
        and SBA_Appv.  Any other column passes through untouched.  The frame
        handed in is not modified; all work happens on the copy made by drop().

    Returns
    -------
    pandas.DataFrame
        The input minus Name, City, Bank, BankState, BalanceGross and
        DisbursementGross, with the remaining listed columns re-encoded as
        numbers and two new columns, NAICS_sector_rate and Recession.

    Raises
    ------
    ValueError
        If State holds a non-null value that is not in the known list.
    """
    states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
              "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
              "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
              "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
              "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", "PR"]
    # Position of each abbreviation in the list above; a dict makes the
    # per-row lookup constant-time instead of a list scan.
    state_position = {abbr: pos for pos, abbr in enumerate(states)}

    # Keyed by the string produced by def_code.
    # NOTE(review): presumably per-sector default rates — confirm the source.
    sector_default = {"11": 0.09, "21": 0.08, "22": 0.14, "23": 0.23,
                      "31": 0.19, "32": 0.16, "33": 0.14, "42": 0.19,
                      "44": 0.22, "45": 0.23, "48": 0.27, "49": 0.23,
                      "51": 0.25, "52": 0.28, "53": 0.29, "54": 0.19,
                      "55": 0.10, "56": 0.24, "61": 0.24, "62": 0.10,
                      "71": 0.21, "72": 0.22, "81": 0.20, "92": 0.15}

    def def_code(code):
        """Return str(code // 10000) (the leading two digits of a six-digit
        code); "0" for missing or non-positive codes."""
        if pd.isnull(code) or code <= 0:
            return str(0)
        # int() keeps a float-typed column (one that contains NaN) from
        # yielding "33.0", which matches no sector and cannot be cast to int.
        return str(int(code) // 10000)

    def def_rate(code):
        """Look up the sector rate for a def_code string; NaN when unlisted."""
        return sector_default.get(code, np.nan)

    delete = ["Name", "City", "Bank", "BankState", "BalanceGross", "DisbursementGross"]
    data = data.drop(delete, axis = 1)

    # State -> position in `states`; missing values stay missing.
    state_col = data["State"]
    unknown = state_col[state_col.notna() & ~state_col.isin(states)]
    if len(unknown) > 0:
        raise ValueError("unrecognised State value(s): {}".format(list(unknown.unique())))
    data["State"] = state_col.map(state_position)

    sector = data["NAICS"].apply(def_code)
    data["NAICS_sector_rate"] = sector.apply(def_rate)
    data["NAICS"]             = sector.astype(int)

    # The literal "1976A" is rewritten to "1976" before the integer cast.
    data["ApprovalFY"] = data["ApprovalFY"].replace("1976A", "1976", regex = True).astype(int)

    # 1 -> 0, 2 -> 1; every other value is left as it is.
    data["NewExist"] = data["NewExist"].replace({1.0: 0})
    data["NewExist"] = data["NewExist"].replace({2.0: 1})

    # 0 and 1 become 0, any other code becomes 1.
    data["FranchiseCode"] = data["FranchiseCode"].replace({1: 0})
    data["FranchiseCode"] = np.where((data.FranchiseCode != 0), 1, data.FranchiseCode)

    # 1 -> 0, 2 -> 1, anything else -> NaN.
    # NOTE(review): a raw 0 stays 0 and so ends up indistinguishable from a
    # raw 1 — confirm that is intended.
    data["UrbanRural"] = data["UrbanRural"].replace({1: 0})
    data["UrbanRural"] = data["UrbanRural"].replace({2: 1})
    data["UrbanRural"] = np.where((data["UrbanRural"] != 0) & (data["UrbanRural"] != 1), np.nan, data["UrbanRural"])

    # "0"/"N" -> 0, "1"/"Y" -> 1, anything else -> NaN.
    data["RevLineCr"] = data["RevLineCr"].replace({"0": 0, "1":1})
    data["RevLineCr"] = data["RevLineCr"].replace({"N": 0, "Y":1})
    data["RevLineCr"] = np.where((data["RevLineCr"] != 0) & (data["RevLineCr"] != 1), np.nan, data["RevLineCr"]).astype(float)

    # "N" -> 0, "Y" -> 1; every other flag falls through to NaN below.
    data["LowDoc"] = data["LowDoc"].replace({"N": 0, "Y": 1})
    data["LowDoc"] = np.where((data["LowDoc"] != 0) & (data["LowDoc"] != 1), np.nan, data["LowDoc"]).astype(float)

    # Dates -> nanoseconds since the Unix epoch divided by 1e15.  Unparseable
    # or missing dates are coerced to NaT and then pinned to the epoch (0.0).
    epoch = pd.Timestamp(0, unit='s')
    for date_col in ("ApprovalDate", "DisbursementDate"):
        parsed = pd.to_datetime(data[date_col], format = "%d-%b-%y", errors='coerce').fillna(epoch)
        # Explicit ns resolution and int64 keep the number independent of the
        # platform's default int width and of the parsed datetime resolution.
        data[date_col] = parsed.astype("datetime64[ns]").astype("int64")/(10**15)

    # Strip "$" and "," from the money strings, then take the natural log.
    # A zero amount would become -inf here.
    for money_col in ("GrAppv", "SBA_Appv"):
        amount = data[money_col].replace(r"[\$,]", "", regex = True).astype(float)
        data[money_col] = np.log(amount)

    # 1 for fiscal years 2007 through 2009 inclusive, else 0.
    data["Recession"] = data["ApprovalFY"].between(2007, 2009).astype(int)

    return data

In [3]:
import pickle

# Maps (dataset name, stage) -> list of column names to drop at that stage.
# NOTE: pickle.load can execute code embedded in the file; only load a
# delete.pickle that you produced yourself.
with open("delete.pickle", "rb") as handle:
    delete = pickle.load(handle)

# preprocessing() already drops BankState, so asking to drop it again later
# would fail.  Check membership explicitly rather than swallowing every
# exception, so that genuine problems (e.g. a value that is not a list) surface.
for columns in delete.values():
    if "BankState" in columns:
        columns.remove("BankState")
delete

{('whole', 1): ['Zip', 'CreateJob', 'RetainedJob'],
 ('whole', 2): ['FranchiseCode', 'UrbanRural'],
 ('whole', 3): ['State', 'NoEmp', 'NewExist'],
 ('recession', 1): ['NAICS', 'LowDoc'],
 ('recession', 2): ['Zip', 'NewExist', 'CreateJob', 'FranchiseCode'],
 ('recession', 3): ['RevLineCr', 'DisbursementDate'],
 ('nonerecession', 1): ['Zip', 'NAICS'],
 ('nonerecession', 2): ['State',
  'NewExist',
  'CreateJob',
  'RetainedJob',
  'FranchiseCode',
  'UrbanRural'],
 ('nonerecession', 3): ['RevLineCr', 'LowDoc']}

In [4]:
# Training labels, and training features run through preprocessing().
y_train = pd.read_csv("Ytrain.csv", index_col = 0)
x_train = preprocessing(pd.read_csv("Xtrain.csv", index_col = 0))

# Remaining gaps are filled with each column's own mean.
x_train = x_train.fillna(x_train.mean())

# One scaler per standardised column; they are fitted here and stay available
# under these names for reuse on other data.
scalernormal_sba, scalernormal_term, scalernormal_emp = StandardScaler(), StandardScaler(), StandardScaler()

scaledsba  = scalernormal_sba .fit_transform(x_train[["SBA_Appv"]]).ravel().tolist()
scaledterm = scalernormal_term.fit_transform(x_train[["Term"]]).ravel().tolist()
scaledemp  = scalernormal_emp .fit_transform(x_train[["NoEmp"]]).ravel().tolist()

# Swap the raw columns for their standardised versions (column order is kept).
x_train = x_train.assign(SBA_Appv = scaledsba, Term = scaledterm, NoEmp = scaledemp)

x_train.head()

Unnamed: 0_level_0,State,Zip,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,GrAppv,SBA_Appv,NAICS_sector_rate,Recession
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,9,33027,33,1271.376,2010,0.549418,0.260038,0.0,2,0,0,0.0,0.0,0.0,1272.672,14.508658,2.435644,0.14,0
1,44,84094,23,1264.8096,2010,-0.778159,-0.058211,0.0,1,3,0,0.0,0.0,0.0,1264.9824,9.740969,-0.721346,0.23,0
2,4,90022,81,1027.9008,2002,-0.408627,-0.14196,0.0,1,1,0,0.0,1.0,0.0,1030.752,9.740969,-1.209796,0.2,0
3,49,53555,0,771.2064,1994,0.960009,-0.14196,1.0,0,0,0,0.0,0.0,0.0,775.6128,11.827736,0.67492,0.205107,0
4,25,65536,62,1139.184,2006,0.152513,-0.12521,0.0,7,2,0,0.0,0.274153,0.0,1143.7632,10.126631,-0.938026,0.1,0


In [5]:
x_test = pd.read_csv("Xtest.csv", index_col = 0)
x_test = preprocessing(x_test)

# Standardise with the scalers fitted on the training data (transform only, no
# refit).  This happens before imputation: StandardScaler passes NaN through
# transform unchanged, so missing entries are still missing afterwards.
scaledsba   = list(scalernormal_sba.  transform(x_test[["SBA_Appv"]])[:,0])
scaledterm  = list(scalernormal_term. transform(x_test[["Term"]])[:,0])
scaledemp   = list(scalernormal_emp.  transform(x_test[["NoEmp"]])[:,0])

x_test["SBA_Appv"]          = scaledsba
x_test["Term"]              = scaledterm
x_test["NoEmp"]             = scaledemp

# Fill gaps with the TRAINING column means rather than the test set's own, so
# test rows get the same fill values the models saw during fitting.  x_train
# is already standardised, so its means are in the same units as the columns
# scaled above.
x_test = x_test.fillna(x_train.mean())

x_test.head()

Unnamed: 0_level_0,State,Zip,NAICS,ApprovalDate,ApprovalFY,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,GrAppv,SBA_Appv,NAICS_sector_rate,Recession
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,32.0,11209,44,1134.6048,2006,-0.58655,-0.058211,0.0,0,6,0,0.0,1.0,0.0,1135.9872,11.512925,0.038874,0.22,0
1,2.0,85297,72,1051.7472,2003,0.412554,0.008789,1.0,0,0,1,0.0,0.0,1.0,1056.9312,11.77529,0.597683,0.22,0
2,43.0,77450,42,1161.0432,2007,0.111454,-0.14196,1.0,1,1,1,0.0,0.0,0.0,1167.5232,12.122691,0.754291,0.19,1
3,17.0,40337,44,973.6416,2001,0.412554,-0.074961,0.0,0,0,0,0.0,0.0,0.0,988.5888,11.81303,0.536077,0.22,0
4,35.0,44087,72,1130.3712,2006,-1.079259,-0.007961,0.0,1,9,0,0.0,0.0,0.0,1135.9872,10.819778,-0.449576,0.22,0


### Feature subsets: whole data, recession years, non-recession years

In [6]:
def _drop_in_stages(frame, group):
    """Return [frame, frame after stage 1, after stage 2, after stage 3].

    Each stage removes the columns listed under delete[(group, stage)] from
    the previous stage's result, so the drops are cumulative.
    """
    stages = [frame]
    for stage in (1, 2, 3):
        stages.append(stages[-1].drop(delete[(group, stage)], axis = 1))
    return stages


# Every *_train list holds four feature frames followed by the labels at
# position 4; every *_test list holds the four matching feature frames.

# --- all rows
whole_train = _drop_in_stages(x_train, "whole") + [y_train]
whole_test  = _drop_in_stages(x_test,  "whole")

# --- rows flagged Recession == 1
train_in_recession = (x_train["Recession"] == 1)
y_train_0709 = y_train[train_in_recession]
x_train_0709 = x_train[train_in_recession]
x_test_0709  = x_test[(x_test["Recession"] == 1)]
recession_train = _drop_in_stages(x_train_0709, "recession") + [y_train_0709]
recession_test  = _drop_in_stages(x_test_0709,  "recession")

# --- rows flagged Recession == 0
train_off_recession = (x_train["Recession"] == 0)
y_train_0014 = y_train[train_off_recession]
x_train_0014 = x_train[train_off_recession]
x_test_0014  = x_test[(x_test["Recession"] == 0)]
nonerecession_train = _drop_in_stages(x_train_0014, "nonerecession") + [y_train_0014]
nonerecession_test  = _drop_in_stages(x_test_0014,  "nonerecession")

# Parallel lists: a short label, then the train / test bundle for each subset.
name  = ["whole", "recn", "none"]
train = [whole_train, recession_train, nonerecession_train]
test  = [whole_test,  recession_test,  nonerecession_test ]

## Model

In [7]:
# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [8]:
# Baseline comparison: mean 10-fold cross-validated score (in %, 2 decimals)
# of nine classifiers on the three reduced feature sets (positions 1..3) of
# each data subset.  The result is one table: rows = models, columns = feature sets.
model_names = ["KNN", "LR", "DT", "RF", "GBM", "LGBM", "ADA", "XGB", "CAT"]

# Zero-argument factories, so every cross-validation run starts from a
# brand-new, unfitted estimator.
classifier_factories = {
    "KNN" : lambda: KNeighborsClassifier(n_jobs=4),
    "LR"  : lambda: LogisticRegression(n_jobs=4),
    "DT"  : lambda: DecisionTreeClassifier(),
    "RF"  : lambda: RandomForestClassifier(),
    "GBM" : lambda: GradientBoostingClassifier(),
    "LGBM": lambda: LGBMClassifier(),
    "ADA" : lambda: AdaBoostClassifier(),
    "XGB" : lambda: XGBClassifier(n_jobs=4),
    "CAT" : lambda: CatBoostClassifier(),
}

# Fail before any training starts if a requested name has no factory.  Skipping
# it inside the loop would leave the score list shorter than the table index.
missing = [model for model in model_names if model not in classifier_factories]
if missing:
    raise ValueError("Wrong classifier type! {}".format(missing))

accuracy_record = pd.DataFrame({"Model": model_names}).set_index("Model")

for datatype in range(3):
    for k in range(1, 4):
        data = train[datatype][k]     # k-th reduced feature frame
        labl = train[datatype][4]     # labels sit at position 4 of every bundle
        acc  = []

        for model in model_names:
            clfr = classifier_factories[model]()
            acc.append(round(np.mean(cross_val_score(clfr, data, labl, cv = 10, n_jobs = 4)) * 100, 2))

        # one column per (subset, feature set), e.g. "whole1"
        accuracy_record["{}{}".format(name[datatype], k)] = acc
print(accuracy_record, "\n")

       whole1  whole2  whole3  recn1  recn2  recn3  none1  none2  none3
Model                                                                  
KNN     68.95   68.91   70.48  72.54  74.60  74.74  67.94  73.68  74.68
LR      76.15   75.83   76.92  75.01  77.20  77.36  70.05  76.49  77.21
DT      88.05   88.18   88.26  89.67  89.21  88.83  87.41  87.75  87.59
RF      90.20   89.86   90.16  91.49  91.13  91.35  89.26  89.51  89.88
GBM     89.72   89.78   89.93  92.65  92.72  92.67  89.24  89.41  89.28
LGBM    92.31   92.34   92.22  93.79  93.66  93.50  91.88  91.78  91.64
ADA     87.66   87.66   87.65  91.40  91.30  91.36  86.87  86.71  86.86
XGB     92.51   92.51   92.38  93.64  93.75  93.46  92.09  92.01  91.63
CAT     92.65   92.67   92.53  93.90  93.66  93.51  92.26  92.14  91.92 



In [9]:
# From here on only the whole-data variants are used: the full feature frame
# plus its first two reduced versions.  `train` carries the labels in its last
# slot (position 3); `test` holds the three matching test frames.
train = [x_train, *whole_train[1:3], y_train]
test  = [x_test,  *whole_test[1:3]]

### Tuning Hyperparameters

#### LightGBM

In [None]:
# Grid-search LightGBM hyperparameters separately on each of the three
# feature sets, scoring by F1 with 3-fold cross-validation.
#
# A wider preliminary sweep covered:
#   n_estimators     100, 150, 200, 600
#   learning_rate    0.05, 0.1, 0.15, 0.2
#   max_depth        3, 5, 7, 9, 11
#   min_data_in_leaf 60, 80, 100
# The grid below is the narrowed-down follow-up.
for k in range(3):
    data = train[k]
    labl = train[3]
    acc  = []

    clfr = LGBMClassifier()

    params = {
        "objective"        : ["binary"],
        "learning_rate"    : [0.05, 0.10],
        "n_estimators"     : [450, 600],
        "min_data_in_leaf" : [20],
        # depth 7 for the last feature set, depth 6 for the other two
        "max_depth"        : [7] if k == 2 else [6],
    }

    search = GridSearchCV(
        estimator  = clfr,
        param_grid = params,
        scoring    = "f1",
        n_jobs     = -1,
        cv         = 3,
        verbose    = True,
    )
    search.fit(data, labl)

    print (search.best_params_)
    result = pd.DataFrame(search.cv_results_)
    column = ["params", "mean_test_score"]
    print (result[column])

In [10]:
### LightGBM
# (n_estimators, learning_rate) combinations to train on every feature set.
lgbm_settings = [(450, 0.05), (600, 0.05), (450, 0.10), (600, 0.10)]

# LGBM[k] ends up holding the four estimators fitted on train[k].
LGBM = [[], [], []]
for k in range(3):
    max_depth = 7 if k == 2 else 6

    clfr1, clfr2, clfr3, clfr4 = [
        LGBMClassifier(objective = "binary", n_estimators = n_trees, learning_rate = rate, max_depth = max_depth)
        for n_trees, rate in lgbm_settings
    ]
    candidates = [clfr1, clfr2, clfr3, clfr4]

    # Per-fold 3-fold CV scores (in %) for each candidate, one line each.
    for candidate in candidates:
        print (np.round(cross_val_score(candidate, train[k], train[3], cv = 3, verbose = 0, n_jobs = 4) * 100, 2))
    print ("")

    # Final fit on the complete training data.
    LGBM[k] = [candidate.fit(train[k], train[3]) for candidate in candidates]

[92.36 92.45 93.05]
[92.43 92.44 93.02]
[92.46 92.57 93.06]
[92.52 92.46 92.94]

[92.18 92.38 93.  ]
[92.25 92.31 93.03]
[92.33 92.39 92.9 ]
[92.33 92.33 92.99]

[92.2  92.44 92.94]
[92.27 92.52 92.87]
[92.21 92.42 93.  ]
[92.15 92.25 92.81]



#### XGBoost

In [None]:
# Grid-search XGBoost hyperparameters separately on each of the three
# feature sets, scoring by F1 with 3-fold cross-validation.
#
# A wider preliminary sweep covered:
#   n_estimators     100, 150, 200, 600
#   learning_rate    0.05, 0.1, 0.15, 0.2
#   max_depth        3, 5, 7
#   min_child_weight 1, 3, 5
#   gamma            0.1, 0.15, 0.2, 0.25, 0.3
#   colsample_bytree 0.1, 0.5, 1.0
# The grid below is the narrowed-down follow-up.
for k in range(3):
    data = train[k]
    labl = train[3]
    acc  = []

    clfr = XGBClassifier()

    params = {
        "objective"        : ["binary:logistic"],
        "n_estimators"     : [300],
        "learning_rate"    : [0.10, 0.15],
        "min_child_weight" : [3],
        "gamma"            : [0.2, 0.25],
        "colsample_bytree" : [0.6],
        # depth 5 for the last feature set, depth 7 for the other two
        "max_depth"        : [5] if k == 2 else [7],
    }

    search = GridSearchCV(
        estimator  = clfr,
        param_grid = params,
        scoring    = "f1",
        n_jobs     = -1,
        cv         = 3,
        verbose    = True,
    )
    search.fit(data, labl)

    print (search.best_params_)
    result = pd.DataFrame(search.cv_results_)
    column = ["params", "mean_test_score"]
    print (result[column])

In [11]:
### XGBoost
seed = 7122

# (learning_rate, gamma) combinations to train on every feature set.
xgb_settings = [(0.10, 0.20), (0.10, 0.25), (0.15, 0.20), (0.15, 0.25)]

# XGB[k] ends up holding the four estimators fitted on train[k].
XGB  = [[], [], []]
for k in range(3):
    max_depth = 5 if k == 2 else 7

    clfr1, clfr2, clfr3, clfr4 = [
        XGBClassifier(random_state = seed, objective = "binary:logistic", n_estimators = 300,
                      learning_rate = rate, max_depth = max_depth, min_child_weight = 3,
                      gamma = gamma, colsample_bytree = 0.6)
        for rate, gamma in xgb_settings
    ]
    candidates = [clfr1, clfr2, clfr3, clfr4]

    # Per-fold 3-fold CV scores (in %) for each candidate, one line each.
    for candidate in candidates:
        print (np.round(cross_val_score(candidate, train[k], train[3], cv = 3, verbose = 0, n_jobs = 4) * 100, 2))
    print ("")

    # Final fit on the complete training data.
    XGB[k] = [candidate.fit(train[k], train[3]) for candidate in candidates]

[92.42 92.56 93.14]
[92.36 92.64 93.09]
[92.27 92.57 92.94]
[92.3  92.46 93.04]

[92.06 92.4  92.96]
[92.23 92.38 92.96]
[92.24 92.39 92.9 ]
[92.23 92.37 92.87]

[92.09 92.21 92.96]
[92.12 92.27 93.05]
[92.2  92.35 92.94]
[92.28 92.21 92.87]



#### CatBoost

In [None]:
# Grid-search CatBoost hyperparameters separately on each of the three
# feature sets, scoring by F1 with 3-fold cross-validation.
#
# A wider preliminary sweep covered:
#   n_estimators  100, 150, 200, 600
#   learning_rate 0.05, 0.1, 0.15, 0.2
#   max_depth     3, 5, 7, 9
#   l2_leaf_reg   2, 3, 4
# The grid below only varies l2_leaf_reg.
for k in range(3):
    data = train[k]
    labl = train[3]
    acc  = []

    clfr = CatBoostClassifier(silent = True)

    params = {
        "n_estimators"  : [1000],
        "learning_rate" : [0.1],
        "max_depth"     : [6],
        "l2_leaf_reg"   : [2, 3, 4],
    }

    search = GridSearchCV(
        estimator  = clfr,
        param_grid = params,
        scoring    = "f1",
        n_jobs     = -1,
        cv         = 3,
        verbose    = True,
    )
    search.fit(data, labl)

    print (search.best_params_)
    result = pd.DataFrame(search.cv_results_)
    column = ["params", "mean_test_score"]
    print (result[column])

In [12]:
### CatBoost
seed = 7122

# (n_estimators, l2_leaf_reg) combinations to train on every feature set.
cat_settings = [(1000, 2), (1000, 3), (1000, 4), (1200, 2), (1200, 3), (1200, 4)]

# CAT[k] ends up holding the six estimators fitted on train[k].
CAT  = [[], [], []]
for k in range(3):
    clfr1, clfr2, clfr3, clfr4, clfr5, clfr6 = [
        CatBoostClassifier(random_state = seed, silent = True, n_estimators = n_trees,
                           learning_rate = 0.10, max_depth = 6, l2_leaf_reg = l2)
        for n_trees, l2 in cat_settings
    ]
    candidates = [clfr1, clfr2, clfr3, clfr4, clfr5, clfr6]

    # Per-fold 3-fold CV scores (in %) for each candidate, one line each.
    for candidate in candidates:
        print (np.round(cross_val_score(candidate, train[k], train[3], cv = 3, verbose = 0, n_jobs = 4) * 100, 2))
    print ("")

    # Final fit on the complete training data.
    CAT[k] = [candidate.fit(train[k], train[3]) for candidate in candidates]

[92.36 92.5  93.08]
[92.43 92.57 93.12]
[92.42 92.54 93.03]
[92.33 92.55 93.09]
[92.41 92.6  93.11]
[92.39 92.59 92.97]

[92.33 92.49 92.93]
[92.32 92.55 92.87]
[92.28 92.57 92.88]
[92.46 92.41 93.02]
[92.24 92.49 92.86]
[92.36 92.61 92.9 ]

[92.3  92.46 92.94]
[92.3  92.59 92.99]
[92.31 92.51 92.93]
[92.31 92.55 92.89]
[92.36 92.58 92.86]
[92.32 92.46 92.97]



### Ensemble

In [13]:
# Accumulator for the ensemble's class scores: one row per test sample,
# one column per class, starting at zero.
n_test_rows = len(x_test)
probability = np.zeros((n_test_rows, 2))

In [16]:
# Weighted soft vote: add up predict_proba from the fitted estimators of all
# three model families on all three feature sets.
models = [LGBM, XGB, CAT]

# Start from zeros inside this cell.  Accumulating into an array created in
# another cell means every re-run of this cell stacks a further round of votes
# on top of whatever an earlier run (possibly with other weights) left behind.
probability = np.zeros((x_test.shape[0], 2))

for m in range(3):
    for k in range(3):
        model = models[m]

        ratio1 = 1.2 if k == 0 else 1.0   # votes on the first feature set (test[0]) count 1.2x
        ratio2 = 0.9 if m == 0 else 1.0   # votes from the first family (LGBM) count 0.9x

        # NOTE(review): only the first four estimators of each group vote, so
        # the two 1200-tree CatBoost models (CAT[k][4], CAT[k][5]) are fitted
        # but never used — confirm that is intended.
        for estimator in range(4):
            probability += model[k][estimator].predict_proba(test[k]) * ratio1 * ratio2

In [17]:
# Column vector holding, for each test row, the class with the larger summed score.
winning_class = probability.argmax(axis = 1)
prediction = winning_class[:, np.newaxis]
prediction

array([[1],
       [0],
       [1],
       ...,
       [0],
       [0],
       [1]])

In [18]:
# Column vector of submission Ids.  They are read from the test frame's own
# index (the Id column of Xtest.csv) instead of being regenerated as 0..n-1,
# so each prediction stays attached to its real Id even if the Ids are not a
# gap-free range starting at zero.
index = x_test.index.to_numpy().reshape((-1, 1))
index

array([[    0],
       [    1],
       [    2],
       ...,
       [99997],
       [99998],
       [99999]])

In [19]:
# Put the Id and prediction column vectors side by side, name the columns and
# write the table to output.csv without the row index.
submission_values = np.hstack((index, prediction))
result = pd.DataFrame(submission_values, columns = ["Id", "ChargeOff"])
result.to_csv("output.csv", index = False)
result.head()

Unnamed: 0,Id,ChargeOff
0,0,1
1,1,0
2,2,1
3,3,0
4,4,1
