### Importing the required libraries

In [22]:
# importing all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import StratifiedKFold

### Importing the training and test dataset

In [23]:
# Read the training features and discard the "ID" column
train = pd.read_csv("train_allx.csv").drop(columns="ID")
# Read the test features; the IDs are kept aside because the submission file needs them
test = pd.read_csv("test.csv")
ids = test["ID"]
test = test.drop(columns="ID")
cols = train.columns

### Working with Categorical Features
Since neither of the two categorical columns (D_36 and D_44) contains null values, they are label-encoded directly.

In [24]:
# Each categorical column gets its own encoder: it is fitted on the training
# column and the same mapping is then applied to the test column.
for cat_col in ('D_36', 'D_44'):
    le = LabelEncoder()
    train[cat_col] = le.fit_transform(train[cat_col])
    test[cat_col] = le.transform(test[cat_col])

### Working with Numerical Features

For numerical features, null values are filled with the column mean computed on the training set. Before doing this, every column in which more than 90% of the values are NULL is dropped from both the training and test sets.

In [25]:
cols = train.columns
# Percentage of missing values per column, measured on the training set only
null_pct = train.isna().sum() * 100 / len(train)
# Remove every column that is more than 90% null from both frames,
# reporting each one in the original column order
for sparse_col in null_pct.index[null_pct > 90]:
    train = train.drop(columns=sparse_col)
    test = test.drop(columns=sparse_col)
    print("Dropping the column - ", sparse_col)

Dropping the column -  D_38
Dropping the column -  B_26
Dropping the column -  D_17
Dropping the column -  R_8
Dropping the column -  D_73
Dropping the column -  B_7
Dropping the column -  B_22
Dropping the column -  D_80
Dropping the column -  D_97
Dropping the column -  D_12
Dropping the column -  D_132
Dropping the column -  D_114
Dropping the column -  D_96
Dropping the column -  D_39
Dropping the column -  D_43


In [26]:
# Compute the per-column means once, on the training set only, and reuse the
# exact same values for both frames. This avoids a second full pass over the
# (already imputed) training frame and guarantees train and test are filled
# with identical statistics.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

### Anomaly detection using Isolation Forest


In [27]:
y = pd.read_csv("train_y.csv")
train['y'] = y['Default_Flag']
# contamination = 0.01 is used so that the top 1% of the outliers will be removed from the dataset.
# random_state is fixed so the same rows are flagged on every run; without it
# the forest is built from a fresh random seed each time and the filtered
# training set (and everything trained on it) is not reproducible.
# NOTE: the target column 'y' is part of the frame the forest is fitted on.
isf = IsolationForest(contamination=0.01, random_state=42)
isf.fit(train)
outliers = isf.predict(train)

  "X does not have valid feature names, but"


In [28]:
# Outliers are excluded from training: keep only the rows the isolation
# forest labelled 1, then split the target off from the features.
inlier_mask = outliers == 1
train = train.loc[inlier_mask]
y = train['y']
train = train.drop(columns="y")

### 10-Fold LightGBM Classifier

In each of the 10 folds, the model is trained on 90% of the dataset and evaluated on the remaining 10%. The test-set predictions from all folds are then averaged to get the final output.

In [29]:
# Stratified 10-fold cross-validation with LightGBM.
# Each fold fits on the training indices, reports accuracy and the confusion
# matrix on the held-out indices, and adds its test-set class probabilities
# to a running sum that is averaged in a later cell.
strtfdKFold = StratifiedKFold(n_splits=10)
# Copies of the feature frames used for fitting and for test-set prediction
train1 = train.copy()
test1 = test.copy()
kfold = strtfdKFold.split(train1, y)
# Held-out accuracy of every fold
scores = []
# predprobs for lightgbm
predprobs = 0
pipeline = LGBMClassifier()
# The fold indices get their own names so they do not overwrite the `train`
# and `test` DataFrames (which would break a re-run of this cell).
for k, (train_idx, valid_idx) in enumerate(kfold):
    pipeline.fit(train1.iloc[train_idx, :], y.iloc[train_idx])
    pred = pipeline.predict(train1.iloc[valid_idx, :])
    score = metrics.accuracy_score(y.iloc[valid_idx], pred)
    scores.append(score)
    print(k, score)
    print(k, metrics.confusion_matrix(y.iloc[valid_idx], pred))
    predprobs = predprobs + pipeline.predict_proba(test1)

0 0.8178102363296598
0 [[109730    475   2450    684]
 [  1674   5590   2953    222]
 [  5333   1423   7467    939]
 [  6230    539   4669   1063]]
1 0.8152745953869824
1 [[109003    508   2941    887]
 [  1493   5576   3131    239]
 [  5027   1345   7671   1120]
 [  5809    562   4913   1216]]
2 0.8157236151372481
2 [[108962    477   2999    901]
 [  1496   5554   3144    246]
 [  4953   1278   7852   1079]
 [  5855    530   4949   1166]]


### 10-Fold CatBoost Classifier

In [None]:
# Stratified 10-fold cross-validation with CatBoost, mirroring the LightGBM
# cell: fit on the training indices, report held-out accuracy and confusion
# matrix, and accumulate the test-set class probabilities.
strtfdKFold = StratifiedKFold(n_splits=10)
kfold = strtfdKFold.split(train1, y)
# Held-out accuracy of every fold
scores = []
# predprobs1 for catboost
predprobs1 = 0
pipeline = CatBoostClassifier()
# Fold indices use dedicated names so the loop never rebinds `train` / `test`.
for k, (train_idx, valid_idx) in enumerate(kfold):
    pipeline.fit(train1.iloc[train_idx, :], y.iloc[train_idx])
    pred = pipeline.predict(train1.iloc[valid_idx, :])
    score = metrics.accuracy_score(y.iloc[valid_idx], pred)
    scores.append(score)
    print(k, score)
    print(k, metrics.confusion_matrix(y.iloc[valid_idx], pred))
    predprobs1 = predprobs1 + pipeline.predict_proba(test1)

In [44]:
# averaging out the predictions
# The divisor is read from the splitter instead of being hard-coded, so the
# average stays correct if n_splits is changed. Both CV cells configure the
# splitter with the same number of folds.
n_folds = strtfdKFold.get_n_splits()
predprobsnew = predprobs / n_folds
predprobs1new = predprobs1 / n_folds

array([[9.96887708e-01, 3.52826179e-04, 1.07613483e-03, 1.68333075e-03],
       [7.33042438e-01, 1.77865243e-02, 1.11561521e-01, 1.37609517e-01],
       [9.45612844e-02, 1.67290113e-01, 4.51422392e-01, 2.86726210e-01],
       ...,
       [6.05227722e-02, 1.44702322e-01, 5.28506056e-01, 2.66268849e-01],
       [9.93579092e-01, 8.98342355e-04, 2.11308840e-03, 3.40947725e-03],
       [9.92082077e-01, 4.62432398e-04, 2.38644688e-03, 5.06904376e-03]])

### Using the optimized values of mix (lightgbm, catboost) to make the final predictions

In [47]:
# Weighted blend of the two averaged probability matrices
lgbm_weight, catboost_weight = 0.35, 0.65
final_prob = lgbm_weight * predprobsnew + catboost_weight * predprobs1new
final_prob

array([[9.97240392e-01, 3.40263373e-04, 9.34522611e-04, 1.48482205e-03],
       [7.19405519e-01, 1.96571536e-02, 1.18336310e-01, 1.42601017e-01],
       [9.01571247e-02, 1.74501997e-01, 4.47191945e-01, 2.88148933e-01],
       ...,
       [7.16044149e-02, 1.32841929e-01, 5.29259086e-01, 2.66294569e-01],
       [9.94381949e-01, 8.80927066e-04, 1.79285042e-03, 2.94427307e-03],
       [9.92981479e-01, 4.45987961e-04, 2.20539174e-03, 4.36714085e-03]])

In [48]:
# Pick, for every row, the column index with the largest blended probability.
# NOTE(review): using the index as the predicted label assumes the class
# labels are 0..k-1 in the models' class order — confirm against Default_Flag.
final_vals = final_prob.argmax(axis=1)

In [52]:
# Assemble the submission (test ID + predicted class) and write it as a
# two-column CSV without header row or index column
subs = pd.DataFrame({'id': ids, 'pred': final_vals})
subs.to_csv("final_submission.csv", header=False, index=False)