In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Apply seaborn's 'whitegrid' style to every plot drawn after this cell.
sns.set_style('whitegrid')

In [None]:
# Load the raw data from 'cs-training.csv', using the 'Unnamed: 0' column
# as the row index.
delinq = pd.read_csv('cs-training.csv', index_col = 'Unnamed: 0')
# Preview the first two rows.
delinq.head(2)

In [None]:
# Heatmap of the boolean null mask of the raw frame: shows which columns
# contain missing values and where.
sns.heatmap(delinq.isnull())

In [None]:
from sklearn.model_selection import train_test_split
# Two-stage split into train / validation / test sets:
#   1. hold out 18% of all rows as the test set;
#   2. hold out 20% of the remaining rows as the validation set.
# Both splits use random_state=42 so they are repeatable.
train, test= train_test_split(delinq, test_size=0.18, random_state=42)
train, valid = train_test_split(train, test_size=0.20, random_state=42)

In [None]:
# (rows, columns) of each of the three splits.
train.shape, valid.shape, test.shape

In [None]:
# Column dtypes and non-null counts of the training split.
train.info()

In [None]:
# Summary statistics of the training split before any cleaning.
train.describe()

In [None]:
#sns.pairplot(train)

In [None]:
# Box plot of one feature to check for outliers; the original note records
# that the data contains outliers, with 'RevolvingUtilizationOfUnsecuredLines'
# as an example.
sns.boxplot(train['RevolvingUtilizationOfUnsecuredLines'])

In [None]:
# Summary statistics of the validation split.
valid.describe()

In [None]:
# 99th percentile of MonthlyIncome in the training split.
train['MonthlyIncome'].quantile(0.99)

In [None]:
# Remove training rows whose MonthlyIncome is above the hard-coded cutoff of
# 25000 (described in the original note as the 99th percentile). Rows with a
# missing MonthlyIncome do not satisfy the comparison and are therefore kept.
train = train.drop(index=train.index[train['MonthlyIncome'].gt(25000)])
train.shape

In [None]:
# 99th percentile of NumberRealEstateLoansOrLines in the training split.
train['NumberRealEstateLoansOrLines'].quantile(0.99)

In [None]:
# Remove training rows with more than 3 real-estate loans or lines (the
# hard-coded cutoff the original note describes as the 99th percentile).
train = train.drop(index=train.index[train['NumberRealEstateLoansOrLines'].gt(3)])
train.shape

In [None]:
# 99th percentile of NumberOfDependents in the training split.
train['NumberOfDependents'].quantile(0.99)

In [None]:
# Remove training rows with more than 4 dependents (hard-coded cutoff).
# Rows with a missing NumberOfDependents do not satisfy the comparison and
# are kept at this stage.
train = train.drop(index=train.index[train['NumberOfDependents'].gt(4)])
train.shape

In [None]:
# Size of the training split after the three cutoff-based removals.
train.shape

In [None]:
def getoutliers(a, df, k=1.5):
    """Return the rows of ``df`` whose value in ``a`` is a suspected outlier.

    A value is flagged when it lies outside the Tukey fences
    ``[Q1 - k * IQR, Q3 + k * IQR]``, where ``Q1`` and ``Q3`` are the 25th and
    75th percentiles of ``a`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    a : pandas.Series
        Values to test. Must be aligned with ``df`` (same index), since the
        resulting boolean mask is used to select rows of ``df``.
    df : pandas.DataFrame
        Frame from which the flagged rows are returned.
    k : float, default 1.5
        Fence multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose ``a`` value is below the lower fence or
        above the upper fence. Missing values compare False against both
        fences and are never flagged.
    """
    # Use the quartiles themselves as the fence anchors. The previous version
    # anchored on (Q1 - min) and (Q3 - median), which are distances rather
    # than quartiles and put the fences in the wrong place.
    q1 = a.quantile(0.25)
    q3 = a.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - (k * iqr)
    upper_limit = q3 + (k * iqr)
    return df[(a < lower_limit) | (a > upper_limit)]

In [None]:
# Collect the training rows that getoutliers flags for
# 'RevolvingUtilizationOfUnsecuredLines', then drop those rows from the
# training split by index label.
outlier_df = getoutliers(train['RevolvingUtilizationOfUnsecuredLines'], train)
train = train.drop(outlier_df.index)
train.shape

In [None]:
# Store NumberOfDependents as pandas' nullable integer dtype ('Int64'), so
# the column holds integers while missing entries remain missing.
train = train.astype({'NumberOfDependents': 'Int64'})

In [None]:
# Summary statistics of the training split after outlier removal.
train.describe()

In [None]:
# Preview the first two rows of the cleaned training split.
train.head(2)

In [None]:
# Discard rows with a missing MonthlyIncome from each of the three splits;
# every split is filtered with its own null mask.
train = train.drop(index=train.index[train['MonthlyIncome'].isna()])
valid = valid.drop(index=valid.index[valid['MonthlyIncome'].isna()])
test = test.drop(index=test.index[test['MonthlyIncome'].isna()])

In [None]:
# Size of the training split after removing rows with a missing MonthlyIncome.
train.shape

In [None]:
# Re-check the training split for remaining missing values.
sns.heatmap(train.isnull())

In [None]:
# Drop rows with a missing NumberOfDependents from every split.
train = train.drop(train[train['NumberOfDependents'].isnull()].index)

# Each split must be filtered with its OWN null mask. The validation split was
# previously filtered with a mask built from `train`, which is already clean
# after the line above, so no validation rows were ever removed.
valid = valid.drop(valid[valid['NumberOfDependents'].isnull()].index)

test = test.drop(test[test['NumberOfDependents'].isnull()].index)

In [None]:
# Confirm that no missing values remain in the training split.
sns.heatmap(train.isnull())

In [None]:
# Separate each split into its feature matrix (every column except the
# 'SeriousDlqin2yrs' target) and the target column itself.
train_X = train.drop(columns='SeriousDlqin2yrs')
train_y = train['SeriousDlqin2yrs']

valid_X = valid.drop(columns='SeriousDlqin2yrs')
valid_y = valid['SeriousDlqin2yrs']

test_X = test.drop(columns='SeriousDlqin2yrs')
test_y = test['SeriousDlqin2yrs']


In [None]:
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

In [None]:
# Modelling pipeline, built with imblearn's make_pipeline so that the SMOTE
# resampler can sit among the transformers:
#   1. degree-4 polynomial feature expansion
#   2. standardization
#   3. PCA down to 5 components
#   4. SMOTE oversampling
#   5. XGBoost classifier
# PCA gets an explicit random_state (matching SMOTE): depending on the input
# size scikit-learn may select a randomized SVD solver, and without a seed
# repeated runs of the notebook could then produce different components.
pipeline = make_pipeline(PolynomialFeatures(degree = 4),
                         StandardScaler(),
                         PCA(n_components = 5, random_state = 42),
                         SMOTE(random_state = 42),
                         XGBClassifier(n_jobs = -1, n_estimators = 200, learning_rate = 0.1, colsample_bylevel =0.8,
                         colsample_bytree = 0.8))

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# 5-fold shuffled cross-validation splitter with a fixed seed. In the visible
# part of the notebook it is referenced only by the commented-out
# cross_val_score call.
folds = KFold(n_splits=5, shuffle= True, random_state= 0)

In [None]:
#cross_val_score(pipeline, train_X, train_y, scoring= 'recall',cv = folds,verbose= True, n_jobs= -1).mean()

In [None]:
# Fit every pipeline step on the training features and target.
pipeline.fit(train_X, train_y)

In [None]:
# Predict labels for the validation split.
prediction = pipeline.predict(valid_X)
# Alternative: predict on the held-out test split instead.
#prediction = pipeline.predict(test_X)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Classification report of the validation predictions against the true labels.
print(classification_report(valid_y,prediction))
# Alternative: report against the test labels (requires test-set predictions).
#print(classification_report(test_y,prediction))

In [None]:
# Confusion matrix of the validation predictions against the true labels.
print(confusion_matrix(valid_y,prediction))
# Alternative: matrix against the test labels (requires test-set predictions).
#print(confusion_matrix(test_y,prediction))