## Straight from the Original Dataset

In [1]:
import pandas as pd

In [2]:
training_data = pd.read_csv("training_data_modified_fe.csv", index_col = 0)
test_data = pd.read_csv("test_data_modified_fe.csv", index_col = 0)

In [3]:
training_x = training_data.drop('NEXT_MONTH_DEFAULT', 1)
training_y = training_data['NEXT_MONTH_DEFAULT']

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
x_train, x_test, y_train, y_test = train_test_split(training_x, training_y, test_size = 0.1)

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ada_clf = AdaBoostClassifier(
             DecisionTreeClassifier(max_depth=9), n_estimators=300,
             algorithm="SAMME.R", learning_rate=0.5
         )

ada_clf.fit(x_train, y_train)
y_pred_rf = ada_clf.predict(x_test)
accuracy_score(y_test,y_pred_rf)
confusion_matrix(y_test,y_pred_rf)

array([[1755,  118],
       [ 356,  171]], dtype=int64)

In [7]:
accuracy_score(y_test,y_pred_rf)

0.8025

In [8]:
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      1873
           1       0.59      0.32      0.42       527

    accuracy                           0.80      2400
   macro avg       0.71      0.63      0.65      2400
weighted avg       0.78      0.80      0.78      2400



 ## Two-way Method

In [9]:
#getting column names
col_names = list(training_data.columns)
print(col_names)

['Balance_Limit_V1', 'Gender', 'EDUCATION_STATUS', 'MARITAL_STATUS', 'AGE', 'PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT', 'PAY_NOV', 'PAY_DEC', 'DUE_AMT_AUG', 'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT', 'PAID_AMT_NOV', 'PAID_AMT_DEC', 'Ability_to_pay_AUG', 'Ability_to_pay_SEP', 'Ability_to_pay_OCT', 'Ability_to_pay_NOV', 'Ability_to_pay_DEC', 'AVG_PAID_AMT', 'NEXT_MONTH_DEFAULT']


In [10]:
life_col =  ['Balance_Limit_V1', 'Gender', 'EDUCATION_STATUS', 'MARITAL_STATUS', 'AGE','NEXT_MONTH_DEFAULT']
amount_col = list(set(col_names)-set(life_col)) + ['NEXT_MONTH_DEFAULT']

In [11]:
training_x_life = training_data.drop(amount_col,1)
training_y_life = training_data['NEXT_MONTH_DEFAULT']

In [12]:
training_x_life.head()

Unnamed: 0_level_0,Balance_Limit_V1,Gender,EDUCATION_STATUS,MARITAL_STATUS,AGE
Client_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A2,1000000,0,0,1,1
A3,1000000,1,1,1,0
A4,100000,1,1,0,1
A5,200000,1,0,0,1
A6,1000000,1,0,1,1


In [13]:
x_train, x_test, y_train, y_test = train_test_split(training_x_life, training_y_life, test_size = 0.2)

In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ada_clf_life = AdaBoostClassifier(
             DecisionTreeClassifier(max_depth=10), n_estimators=100,
             algorithm="SAMME.R", learning_rate=0.5
         )

ada_clf_life.fit(x_train, y_train)
y_pred_life = ada_clf_life.predict(x_test)

In [15]:
accuracy_score(y_test,y_pred_life)

0.7779166666666667

In [16]:
print(classification_report(y_test,y_pred_life))

              precision    recall  f1-score   support

           0       0.78      0.99      0.87      3747
           1       0.26      0.01      0.01      1053

    accuracy                           0.78      4800
   macro avg       0.52      0.50      0.44      4800
weighted avg       0.67      0.78      0.69      4800



In [17]:
training_x_amount = training_data.drop(life_col,1)
training_y_amount = training_data['NEXT_MONTH_DEFAULT']

In [18]:
training_x_amount.head()

Unnamed: 0_level_0,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,DUE_AMT_AUG,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,Ability_to_pay_AUG,Ability_to_pay_SEP,Ability_to_pay_OCT,Ability_to_pay_NOV,Ability_to_pay_DEC,AVG_PAID_AMT
Client_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
A2,-1,-1,-1,-1,-1,-1,3389,3437,6004,39418,162772,0,538165,2615,33414,123354,-162772,552147,124966.0
A3,0,-1,-1,-1,-1,0,151818,151818,46200,43530,80811,942,33666,-105618,16582,37281,-79869,-124590,59494.5
A4,4,3,2,2,-2,-2,16082,0,0,0,0,0,0,-16082,-15477,0,0,0,0.0
A5,2,0,0,0,0,0,92848,3855,3890,3696,4620,4049,3918,-92848,-95193,-97309,-100353,-102740,4004.666667
A6,2,2,0,0,0,0,419466,0,20790,16170,17325,16401,17325,-461046,-429785,-435354,-445271,-453899,14668.5


In [19]:
x_train, x_test, y_train, y_test = train_test_split(training_x_amount, training_y_amount, test_size = 0.2)

In [20]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ada_clf_amount = AdaBoostClassifier(
             DecisionTreeClassifier(max_depth=10), n_estimators=100,
             algorithm="SAMME.R", learning_rate=0.5
         )

ada_clf_amount.fit(x_train, y_train)
y_pred_amount = ada_clf_amount.predict(x_test)

In [21]:
accuracy_score(y_test,y_pred_amount)

0.7966666666666666

In [22]:
print(classification_report(y_test,y_pred_amount))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      3733
           1       0.58      0.31      0.40      1067

    accuracy                           0.80      4800
   macro avg       0.70      0.62      0.64      4800
weighted avg       0.77      0.80      0.77      4800



In [23]:
life_pred = ada_clf_life.predict(training_x_life)
amount_pred = ada_clf_amount.predict(training_x_amount)

In [24]:
all_data = pd.DataFrame({"Life" : life_pred,"Amount":amount_pred})

In [25]:
x_train, x_test, y_train, y_test = train_test_split(all_data, training_y, test_size = 0.2)

In [26]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ada_clf_all = AdaBoostClassifier(
             DecisionTreeClassifier(max_depth=10), n_estimators=100,
             algorithm="SAMME.R", learning_rate=0.5
         )

ada_clf_all.fit(x_train, y_train)
y_pred_all = ada_clf_all.predict(x_test)

In [27]:
accuracy_score(y_test,y_pred_all)

0.9452083333333333

In [28]:
print(classification_report(y_test,y_pred_all))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3733
           1       0.94      0.80      0.87      1067

    accuracy                           0.95      4800
   macro avg       0.94      0.89      0.92      4800
weighted avg       0.95      0.95      0.94      4800



In [29]:
life_col_test =  ['Balance_Limit_V1', 'Gender', 'EDUCATION_STATUS', 'MARITAL_STATUS', 'AGE']
amount_col_test = list(set(list(test_data.columns))-set(life_col_test))

In [30]:
test_x_life = test_data.drop(amount_col_test,1)
test_x_amount = test_data.drop(life_col_test,1)

In [31]:
test_life_pred = ada_clf_life.predict(test_x_life)
test_amount_pred = ada_clf_amount.predict(test_x_amount)

In [32]:
all_data_test = pd.DataFrame({"Life" : test_life_pred,"Amount":test_amount_pred})

In [45]:
test_pred = ada_clf_all.predict(all_data_test)

In [48]:
list(test_pred).count(1)

708

In [49]:
test_pred = pd.DataFrame({"NEXT_MONTH_DEFAULT":test_pred})

In [50]:
test_pred.to_csv("Predictions2.csv")