In [1]:
import csv
import numpy as np
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

# Load the training and the test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./DSN_DATASET/Train.csv', './DSN_DATASET/Test.csv')
)

In [13]:
# Inspect the training set: first rows, summary statistics and column dtypes.
#
# A notebook cell only renders the value of its LAST expression, so bare
# `head(10)` / `describe()` calls placed before the final line are evaluated
# and thrown away. Each view is therefore rendered explicitly with `display`
# (injected into the namespace by the IPython kernel, no import required).

display(train_data.head(10))
display(train_data.describe())
display(train_data.dtypes)

Applicant_ID       object
form_field1       float64
form_field2       float64
form_field3       float64
form_field4       float64
form_field5       float64
form_field6       float64
form_field7       float64
form_field8       float64
form_field9       float64
form_field10      float64
form_field11      float64
form_field12      float64
form_field13      float64
form_field14      float64
form_field15      float64
form_field16      float64
form_field17      float64
form_field18      float64
form_field19      float64
form_field20      float64
form_field21      float64
form_field22      float64
form_field23      float64
form_field24      float64
form_field25      float64
form_field26      float64
form_field27      float64
form_field28      float64
form_field29      float64
form_field30      float64
form_field31      float64
form_field32      float64
form_field33      float64
form_field34      float64
form_field35      float64
form_field36      float64
form_field37      float64
form_field38

In [14]:
# Inspect the test set. Only describe() is active; as the cell's last live
# expression, its summary table (count, mean, std, min, quartiles, max) is
# what the cell renders.

# Alternative view, currently disabled: first ten rows.
# test_data.head(10)
test_data.describe()
# Alternative view, currently disabled: per-column dtypes.
# test_data.dtypes

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field40,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field48,form_field49,form_field50
count,22890.0,22291.0,23854.0,23854.0,23854.0,18396.0,21769.0,18396.0,20600.0,23853.0,...,5172.0,7651.0,23422.0,23750.0,21638.0,10462.0,17115.0,15078.0,23854.0,19203.0
mean,3492.284404,0.557676,1.065443,0.859146,2.183538,626303.6,6797033.0,2654142.0,13505930.0,11874780.0,...,147.62328,108.209648,0.369684,6.58048,0.566219,0.066526,0.097926,301544.0,1.064118,674984.3
std,190.502764,0.826543,2.198444,3.403115,11.415706,1457540.0,16260220.0,3968185.0,22891250.0,24771130.0,...,43.580328,36.426276,0.414077,6.363075,0.19606,0.278211,0.370392,1868574.0,1.816837,6561031.0
min,2986.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.075,0.0,0.0,0.0,0.066432,0.0,0.0,0.0,0.0,0.0
25%,3356.0,0.068675,0.0,0.0,0.0,14004.0,672581.0,181663.0,1349441.0,420898.0,...,135.53175,87.4995,0.0,2.02,0.413268,0.0,0.0,72.72364,0.0,0.0450051
50%,3484.0,0.27325,0.0582,0.0,0.0,115533.0,2719888.0,959468.5,5529830.0,3651543.0,...,150.0,117.984,0.22,5.05,0.5921,0.0,0.0,313.1754,0.0,0.168
75%,3624.0,0.72885,1.30425,0.0,0.0,515911.2,7073576.0,3799849.0,17286580.0,13181260.0,...,167.07825,137.20725,0.628573,10.1,0.756544,0.0,0.0,1195.997,1.254155,0.5007093
max,3900.0,22.31505,34.5414,206.4528,297.8856,48187380.0,770988700.0,113514100.0,1443921000.0,774101400.0,...,401.4135,211.6935,2.2,91.91,0.8,5.0,19.0,121399100.0,24.0,252459100.0


In [4]:
# Build the model inputs: feature frame X and target y from the training
# data, and feature frame T from the test data.

# Work on copies so the raw frames stay untouched and this cell can be re-run.
X = train_data.copy()
T = test_data.copy()

# Replace missing values in both frames with the sentinel -999.
# Reassigning (rather than inplace=True) keeps the lineage of X / T explicit.
X = X.fillna(-999)
T = T.fillna(-999)

# Label-encode the categorical feature. The encoder is fitted ONCE on the
# training column and reused for the test column, so a given category always
# maps to the same integer in both frames. Fitting a second, independent
# encoder on the test column derives the codes from the test set's own sorted
# categories, which silently disagree with the training codes whenever the
# two sets of categories differ.
# transform() raises ValueError if the test set holds a category that never
# occurs in the training set, instead of silently mis-coding it.
field47_encoder = LabelEncoder().fit(X["form_field47"])
X["form_field47"] = field47_encoder.transform(X["form_field47"])
T["form_field47"] = field47_encoder.transform(T["form_field47"])

# Encode the target labels as integers and keep them as the target vector.
X["default_status"] = LabelEncoder().fit_transform(X["default_status"])
y = X["default_status"]

# Drop the ID column, the target (train only) and the two features the author
# judged to have minimal effect on the target, from both frames.
X = X.drop(["Applicant_ID", "default_status", "form_field48", "form_field49"], axis=1)
T = T.drop(["Applicant_ID", "form_field48", "form_field49"], axis=1)

In [5]:
# Sanity check: shapes of the train features, test features and target.
print(*(obj.shape for obj in (X, T, y)))

(56000, 48) (24000, 48) (56000,)


In [6]:
# Min-max scale the features. The scaler is fitted on the training frame
# only, and that same fitted scaler is then applied to both frames.

scaler = MinMaxScaler().fit(X)
scaled_x, scaled_t = scaler.transform(X), scaler.transform(T)

In [9]:
# Take a look at the scaled data: column 20 of the scaled train and test
# matrices.
#
# Only the last expression of a cell is rendered, so a bare `scaled_x[:, 20]`
# ahead of the final line is evaluated and discarded. Both slices are rendered
# explicitly with `display` (provided by the IPython kernel, no import needed).

display(scaled_x[:, 20])
display(scaled_t[:, 20])

array([0.        , 0.38606975, 0.        , ..., 0.        , 0.38163094,
       0.        ])

In [None]:
# Grid search over CatBoost hyper-parameters on the unscaled training data.

# Settings that are not being searched belong on the estimator itself;
# listing them in the grid as single-element lists only clutters best_params_.
model = CatBoostClassifier(eval_metric='AUC', logging_level='Silent')

# `depth` and `max_depth` are two names for the same CatBoost parameter, so a
# grid containing both hands every candidate two competing tree depths. The
# two value lists are merged under one key. `None` (library default) is
# dropped because the searched values are now all explicit.
parameters = {
    'max_depth': [2, 4, 6, 8, 10],
    'n_estimators': [100, 400, 600, 800],
    'loss_function': ['Logloss', 'CrossEntropy'],
    'subsample': [0.2, 0.4, 0.6, 0.9],
}

# CatBoost's eval_metric does not influence how GridSearchCV ranks candidates:
# without `scoring` the search uses the estimator's default score (accuracy
# for classifiers). scoring='roc_auc' makes model selection follow the same
# AUC metric that is configured on the estimator.
grid_cv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
)
grid_cv.fit(X, y)
print(grid_cv.best_params_)

In [None]:
# Bagged ensemble of CatBoost classifiers, fitted on the unscaled training data.
# NOTE(review): no random_state is passed to either estimator, so results are
# expected to vary between runs.

params = dict(
    bootstrap_type='Bernoulli',
    eval_metric='AUC',
    logging_level='Silent',
    loss_function='CrossEntropy',
    max_depth=8,
    n_estimators=800,
    subsample=0.9,
)

catboost_base = CatBoostClassifier(**params)
bcm = BaggingClassifier(catboost_base, n_estimators=200)
bcm.fit(X, y)

In [10]:
# Bagged ensemble of 300 LightGBM classifiers (200 boosting rounds each),
# fitted on the min-max scaled training matrix.

lgbm = lgb.LGBMClassifier(n_estimators=200)
bcm_2 = BaggingClassifier(lgbm, n_estimators=300).fit(scaled_x, y)
# fit() returns the estimator itself, so this renders the same fitted-model repr
bcm_2

BaggingClassifier(base_estimator=LGBMClassifier(boosting_type='gbdt',
                                                class_weight=None,
                                                colsample_bytree=1.0,
                                                importance_type='split',
                                                learning_rate=0.1, max_depth=-1,
                                                min_child_samples=20,
                                                min_child_weight=0.001,
                                                min_split_gain=0.0,
                                                n_estimators=200, n_jobs=-1,
                                                num_leaves=31, objective=None,
                                                random_state=None,
                                                reg_alpha=0.0, reg_lambda=0.0,
                                                silent=True, subsample=1.0,
                                                sub

In [12]:
# Class-probability predictions for the unscaled test features T from bcm,
# the BaggingClassifier of CatBoostClassifier estimators that was fitted on
# the unscaled training features X.
# NOTE(review): the cell that fits bcm carries no execution count in this
# export, so confirm bcm exists in the kernel before running this cell.

prediction_non_scaled = bcm.predict_proba(T)

In [15]:
# Class-probability predictions for the scaled test matrix from bcm_2, the
# BaggingClassifier of LGBMClassifier estimators that was fitted on scaled_x.
# The printed array has one row per test sample and one column per class.

prediction_scaled = bcm_2.predict_proba(scaled_t)
print(prediction_scaled)

[[0.70585281 0.29414719]
 [0.66979087 0.33020913]
 [0.61994906 0.38005094]
 ...
 [0.7279135  0.2720865 ]
 [0.47189122 0.52810878]
 [0.8094233  0.1905767 ]]


In [None]:
# Blend the two models' probability arrays with a fixed weighted average:
# 60% bagged CatBoost (unscaled features), 40% bagged LightGBM (scaled features).

weight_catboost, weight_lgbm = 0.6, 0.4
final_prediction = weight_catboost * prediction_non_scaled + weight_lgbm * prediction_scaled

In [None]:
# Save the blended prediction to output.csv as one (ID, probability) row per
# test applicant.

customer_id = test_data['Applicant_ID']

# predict_proba yields one column per class, so a blend of two predict_proba
# results is a 2-D array and each of its rows is a pair of probabilities.
# Writing a whole row into a cell would store a stringified array (e.g.
# "[0.7 0.3]") instead of a number, so only column 1 is kept: the probability
# of the class that LabelEncoder mapped to 1.
# NOTE(review): presumably that is the "defaulted" label - verify against the
# raw default_status values.
# A 1-D array is accepted unchanged in case the blend already holds a single
# probability per row.
blended = np.asarray(final_prediction)
positive_proba = blended[:, 1] if blended.ndim == 2 else blended

# Guard against pairing IDs with predictions made for a different frame.
assert len(customer_id) == len(positive_proba), "one prediction per applicant expected"

with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # The header uses the dataset's own ID column name, 'Applicant_ID'.
    writer.writerow(["Applicant_ID", "default_status"])
    # Iterate positionally instead of customer_id[i], which is a label-based
    # lookup and only works while the index is the default 0..n-1 range.
    # tolist() converts numpy scalars to plain Python floats before writing.
    writer.writerows(zip(customer_id, positive_proba.tolist()))