In [None]:
#class_names = [0,1] # for printing purposes later

## Finishing the task (a.k.a homework)

Try to improve the performance of a classifier on `data` with the following conditions:

1. use binning (on features of your choice, with your choice of parameters) and comment on its effects on classification
1. use at least 2 other preprocessing techniques (your choice!) on the data set and comment the classification results
1. run all classification test at least twice - once for unbalanced original data, once for balanced data (choose a balancing technique), compare those results (give a comment)

## DATA AND IMPORTS

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, ttest_ind
from scipy.optimize import minimize_scalar
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_regression, mutual_info_regression, RFECV
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_squared_error
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from collections import Counter
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import OneSidedSelection 
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

data = pd.read_csv('default_of_credit_card_clients.csv',sep=';')
raw_data = data.copy()

I copied data into raw_data to compare it in the end with the preprocessed and balanced data

## FUNCTION TO CHECK CLASSIFIERS RESULTS

I decided to check data with two classifiers with which I get the best AUC - Decision Tree and AdaBoost
In this function I for each classifier I print its name, confusion matrix and Area Under Curve

In [2]:
def check_classifiers(data, plot = False):
    names = ["DECISION TREE",
             "ADABOOST",
            ]

    classifiers = [
        DecisionTreeClassifier(max_depth=5),
        AdaBoostClassifier()        
    ]

    for name, clf in zip(names, classifiers):

        print(name)

        y = list(data['class'])
        X = data.drop(['class'],axis=1)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
        y_pred = clf.fit(X_train, y_train).predict(X_test)

        cnf_matrix = confusion_matrix(y_test, y_pred)
        print(cnf_matrix)
        cnf_matrix_m = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
        print(cnf_matrix_m)
        cnf_acc = accuracy_score(y_test,y_pred)
        print("Accuracy = " + str(cnf_acc))
        probas_ = clf.fit(X_train, y_train).predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        print("AUC = " + str(roc_auc))
        print() 
        print()
        
        if plot:
            np.set_printoptions(precision=2)
            # Plot non-normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names,
                                  title='Confusion matrix, without normalization')
            # Plot normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                                 title='Normalized confusion matrix')
            plt.show()


In [3]:
check_classifiers(data)

DECISION TREE
[[8886  471]
 [1685  958]]
[[0.94966335 0.05033665]
 [0.63753311 0.36246689]]
Accuracy = 0.8203333333333334
AUC = 0.7488825663447612


ADABOOST
[[8930  427]
 [1765  878]]
[[0.95436572 0.04563428]
 [0.66780174 0.33219826]]
Accuracy = 0.8173333333333334
AUC = 0.7737218026399817




## ROBUST SCALER

I want to detect and delete outliers from my data.

In [4]:
robust_scaler = preprocessing.RobustScaler()

# we rescale only numeric values - not indicators
columns = data.select_dtypes(include=['int64']).columns 
columns = columns.drop('class', errors = 'ignore')

robust_scaler.fit(data[columns])

data[columns] = robust_scaler.transform(data[columns])


In [5]:
check_classifiers(data)

DECISION TREE
[[8886  471]
 [1685  958]]
[[0.94966335 0.05033665]
 [0.63753311 0.36246689]]
Accuracy = 0.8203333333333334
AUC = 0.7492607625280973


ADABOOST
[[8930  427]
 [1765  878]]
[[0.95436572 0.04563428]
 [0.66780174 0.33219826]]
Accuracy = 0.8173333333333334
AUC = 0.7737218026399817




I can see that this preprocessing technique make almost no changes with the classification of data. It even make classification with Decision Tree event very little worse.

## BINING

Bining can balanced our data. I have chosen to do bining with the columns which describe "Amount of previous payment", because I think that there are values, which have good variability for bining. It might be also good idea to do to binning with colums which describe credit value and year, but I checked that they make totally no changes with the classification results, so I decide to do binig just with columns mentioned before.

In [6]:
val1 = 50 #number of bining sets
labels1 = [x+1 for x in range(val1)]
for i in range(18,24):
    tmp = str(i)
    tmp1 = 'X'+tmp
    data[tmp1] = pd.cut(data[tmp1],val1,labels = labels1)
    data[tmp1] = data[tmp1].astype('int64')

In [7]:
check_classifiers(data)

DECISION TREE
[[8922  435]
 [1719  924]]
[[0.95351074 0.04648926]
 [0.65039728 0.34960272]]
Accuracy = 0.8205
AUC = 0.7526707350758177


ADABOOST
[[8944  413]
 [1766  877]]
[[0.95586192 0.04413808]
 [0.6681801  0.3318199 ]]
Accuracy = 0.8184166666666667
AUC = 0.7704135261685031




Here I can see that bining make decision tree classification better. But still those changes are not very significant

## MIN-MAX NORMALIZATION

I will try another normalization technique to have normalized data.

In [8]:
# Min-max normalization
minmax_scaler = preprocessing.MinMaxScaler()

# we rescale only numeric values - not indicators
columns = data.select_dtypes(include=['int64']).columns 
columns = columns.drop('class', errors = 'ignore')
print('Columns to be standardized: ', list(columns))

# Prepare values for the transformation
minmax_scaler.fit(data[columns])

# MinMax
data[columns] = minmax_scaler.transform(data[columns])

Columns to be standardized:  ['X18', 'X19', 'X20', 'X21', 'X22', 'X23']


  return self.partial_fit(X, y)


In [9]:
check_classifiers(data)

DECISION TREE
[[8923  434]
 [1720  923]]
[[0.95361761 0.04638239]
 [0.65077563 0.34922437]]
Accuracy = 0.8205
AUC = 0.753578438264477


ADABOOST
[[8944  413]
 [1766  877]]
[[0.95586192 0.04413808]
 [0.6681801  0.3318199 ]]
Accuracy = 0.8184166666666667
AUC = 0.7704135261685031




Still very similar results as before.

In [10]:
data['class'].value_counts()

0    23364
1     6636
Name: class, dtype: int64

Our data is unbalanced so we need to balance it

## BALANCING

Now with the preprocessed data I will try do to some balancing. I will check one under-sampling method, one over-sampling method and one method which use over-sampling and under-sampling to compare results. As before I will use two classifiers - Decide Tree and AdaBoost and print for each of them its name, confusion matrix, normalized confusion matrix and Area Under Curve.

In [11]:

clf = DecisionTreeClassifier(max_depth = 5)


y = list(data['class'])
X = data.drop(['class'],axis=1)


names = ["DECISION TREE",
         "ADABOOST"
        ]

classifiers = [
        DecisionTreeClassifier(max_depth=5),
        AdaBoostClassifier()
    ]

resamplers = [
    (EditedNearestNeighbours(random_state=42),'ENN'),
    (SMOTE(random_state=42),'SMOTE'),
    (SMOTEENN(random_state=42),'SMOTEENN')
]

for name,clf in zip(names,classifiers):
    print(str(name + " ") * 3)

    for resampler,description in resamplers:
        print(description)
        %time X_res, y_res = resampler.fit_sample(X, y)
        print(Counter(y_res))

        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=.4, random_state=42)
        y_pred = clf.fit(X_train, y_train).predict(X_test)
        
        cnf_matrix = confusion_matrix(y_test, y_pred)
        print(cnf_matrix)
        cnf_matrix_m = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
        print(cnf_matrix_m)
        cnf_acc = accuracy_score(y_test,y_pred)
        print("Accuracy = " + str(cnf_acc))
        probas_ = clf.fit(X_train, y_train).predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        print("AUC = " + str(roc_auc))
        print() 
        print()

DECISION TREE DECISION TREE DECISION TREE 
ENN
Wall time: 19.9 s
Counter({0: 14107, 1: 6636})
[[5421  226]
 [1257 1394]]
[[0.95997875 0.04002125]
 [0.47416069 0.52583931]]
Accuracy = 0.8212822366835382
AUC = 0.8176113180073715


SMOTE
Wall time: 1.05 s
Counter({1: 23364, 0: 23364})
[[7777 1533]
 [3775 5607]]
[[0.83533835 0.16466165]
 [0.40236623 0.59763377]]
Accuracy = 0.7160282473785576
AUC = 0.778043524851963


SMOTEENN
Wall time: 44 s
Counter({1: 17386, 0: 12251})
[[4579  323]
 [2240 4713]]
[[0.93410853 0.06589147]
 [0.3221631  0.6778369 ]]
Accuracy = 0.7838043019822859
AUC = 0.865803987993524


ADABOOST ADABOOST ADABOOST 
ENN
Wall time: 19.5 s
Counter({0: 14107, 1: 6636})
[[5365  282]
 [1233 1418]]
[[0.95006198 0.04993802]
 [0.46510751 0.53489249]]
Accuracy = 0.8174258857556037
AUC = 0.8327548728984662


SMOTE
Wall time: 1.05 s
Counter({1: 23364, 0: 23364})
[[7716 1594]
 [3133 6249]]
[[0.82878625 0.17121375]
 [0.33393733 0.66606267]]
Accuracy = 0.7471110635566017
AUC = 0.8213233410

Finally I will check classification results for not balanced and not preprocessed data

In [12]:
check_classifiers(raw_data)

DECISION TREE
[[8885  472]
 [1685  958]]
[[0.94955648 0.05044352]
 [0.63753311 0.36246689]]
Accuracy = 0.82025
AUC = 0.7485120731842975


ADABOOST
[[8930  427]
 [1765  878]]
[[0.95436572 0.04563428]
 [0.66780174 0.33219826]]
Accuracy = 0.8173333333333334
AUC = 0.7737218026399817




I can see that the balanced data I have better results than unbalanced data and better than for not preprocessed data.

The best result is get for SMOTEENN balancing technique with the AdaBoost classifier. So it proves that data preprocesing helps with training data and that combining over-sampling and under-sampling methods deliver the best results for data classification.