In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Credit Card Default II (balance).csv")

In [3]:
print(df)

            income        age          loan  default
0     66155.925100  59.017015   8106.532131        0
1     34415.153970  48.117153   6564.745018        0
2     57317.170060  63.108049   8020.953296        0
3     42709.534200  45.751972   6103.642260        0
4     66952.688850  18.584336   8770.099235        1
...            ...        ...           ...      ...
3423  60903.191726  21.933924  10367.081892        1
3424  62235.644695  25.033016   8083.900063        1
3425  25597.850590  26.350344   3810.110335        1
3426  48164.194974  23.141976   6529.652892        1
3427  33055.808635  26.289612   5351.851138        1

[3428 rows x 4 columns]


# Train-Test Split

In [4]:
X = df.loc[:, ["income","age","loan"]]

In [5]:
Y = df.loc[:,["default"]]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y)
print(X_train, X_test, Y_train, Y_test)

            income        age          loan
2993  68924.039892  28.230691   9846.870552
2770  63655.604386  27.527984   8865.812735
2039  61761.297409  28.534865  10907.092117
1317  41944.268190  61.135063   1203.700529
3180  67346.131559  27.137647  13347.885966
...            ...        ...           ...
3301  43675.562357  32.480202   6741.071433
2218  37960.925789  25.207112   5525.167154
3279  43667.856513  27.960958   7073.071812
1359  23450.872130  34.406370   1419.805523
1190  60103.011570  56.053772   2632.265613

[2571 rows x 3 columns]             income        age         loan
2629  44020.312726  22.516752  5543.029537
1771  48211.581840  30.557344  2606.124698
2003  64543.780869  26.104653  9724.127771
1114  43588.081430  40.495647  6453.057979
955   53612.131230  23.282966  5976.896568
...            ...        ...          ...
1046  23787.367050  36.311713  3041.552908
827   54421.054010  22.961534  6229.836019
102   47634.549550  44.294871   141.703818
2200  26540.70836

# Logistic Regression

In [8]:
df.isnull().sum().sort_values()
#there are no null values

income     0
age        0
loan       0
default    0
dtype: int64

In [9]:
correlation = df.corr()
#to get the correlation
print(correlation)

           income       age      loan   default
income   1.000000 -0.028192  0.556173 -0.012761
age     -0.028192  1.000000 -0.247657 -0.653537
loan     0.556173 -0.247657  1.000000  0.508285
default -0.012761 -0.653537  0.508285  1.000000


In [10]:
from sklearn import linear_model
model = linear_model.LogisticRegression()

In [11]:
model.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
#predicting using train set
pred = model.predict(X_train)

In [13]:
from sklearn.metrics import confusion_matrix

In [14]:
#evaluating prediction using confusion matrix - train set
cm = confusion_matrix(Y_train, pred)
print(cm)

[[1138  151]
 [ 237 1045]]


In [15]:
#accuracy of model - train set (%)
accuracy = (cm[0,0]+cm[1,1])/sum(sum(cm))
print(accuracy)

0.8490859587709063


In [16]:
# implementing model on test set
pred = model.predict(X_test)

In [17]:
#evaluating prediction using confusion matrix - test set
cm = confusion_matrix(Y_test, pred)
print(cm)

[[379  46]
 [ 64 368]]


In [18]:
#accuracy of model - test set (%)
accuracy = (cm[0,0]+cm[1,1])/sum(sum(cm))
print(accuracy)

0.8716452742123687


In [19]:
import joblib
joblib.dump (model, "Weiyu_LogisticRegression")

['Weiyu_LogisticRegression']

# Decision Tree

In [20]:
from sklearn import tree

In [21]:
model = tree.DecisionTreeClassifier()  

In [22]:
model.fit(X_train, Y_train) 

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [23]:
pred = model.predict(X_train)

In [24]:
# evaluating prediction using confusion matrix - train set
cm = confusion_matrix(Y_train, pred)
print(cm)

[[1289    0]
 [   0 1282]]


In [25]:
#accuracy of model - train set (%)
accuracy = (cm[0,0] +cm[1,1])/sum(sum(cm))
print(accuracy)

1.0


In [26]:
pred = model.predict(X_test)

In [27]:
# evaluating prediction using confusion matrix - test set
cm = confusion_matrix(Y_test, pred)
print(cm)

[[413  12]
 [  2 430]]


In [28]:
#accuracy of model - test set (%)
accuracy = (cm[0,0] +cm[1,1])/sum(sum(cm))
print(accuracy)

0.9836639439906651


In [29]:
# Splitting train set into train and test set
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X_train, Y_train, test_size = 0.3)

In [30]:
#finding the best depth
for i in range(20):
    model = tree.DecisionTreeClassifier(max_depth=i+1)
    model.fit(X_train1, Y_train1)
    pred = model.predict(X_test1)
    cm = confusion_matrix(Y_test1, pred)
    print(i+1)
    print((cm[0,0]+cm[1,1])/(sum(sum(cm))))
    print("++++++++++++++++")
#best depth = 11


1
0.8652849740932642
++++++++++++++++
2
0.9430051813471503
++++++++++++++++
3
0.9468911917098446
++++++++++++++++
4
0.9792746113989638
++++++++++++++++
5
0.9870466321243523
++++++++++++++++
6
0.9883419689119171
++++++++++++++++
7
0.9883419689119171
++++++++++++++++
8
0.9922279792746114
++++++++++++++++
9
0.9922279792746114
++++++++++++++++
10
0.9909326424870466
++++++++++++++++
11
0.9922279792746114
++++++++++++++++
12
0.9870466321243523
++++++++++++++++
13
0.9922279792746114
++++++++++++++++
14
0.9883419689119171
++++++++++++++++
15
0.9883419689119171
++++++++++++++++
16
0.9922279792746114
++++++++++++++++
17
0.9909326424870466
++++++++++++++++
18
0.9883419689119171
++++++++++++++++
19
0.9909326424870466
++++++++++++++++
20
0.9870466321243523
++++++++++++++++


In [31]:
# Plugging in best depth into original train set
model = tree.DecisionTreeClassifier(max_depth = 11)
model.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=11, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [32]:
# evaluating prediction using confusion matrix - original train set
pred = model.predict(X_train)
cm = confusion_matrix(Y_train, pred)
print(cm)

#accuracy of model - original train set (%)
accuracy = (cm[0,0] +cm[1,1])/sum(sum(cm))
print(accuracy)

[[1289    0]
 [   0 1282]]
1.0


In [33]:
# evaluating prediction using confusion matrix - original test set
pred = model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
print(cm)

#accuracy of model - test set_1 (%)
accuracy = (cm[0,0] +cm[1,1])/sum(sum(cm))
print(accuracy)

[[413  12]
 [  2 430]]
0.9836639439906651


In [34]:
#finding the best split
for i in range(150):
    model = tree.DecisionTreeClassifier(min_samples_split=i+5)
    model.fit(X_train1, Y_train1)
    pred = model.predict(X_test1)
    cm = confusion_matrix(Y_test1, pred)
    print(i+1)
    print((cm[0,0]+cm[1,1])/(sum(sum(cm))))
    print("++++++++++++++++")
#since 1 is the highest, it is best not to split

1
0.9883419689119171
++++++++++++++++
2
0.9870466321243523
++++++++++++++++
3
0.9883419689119171
++++++++++++++++
4
0.9883419689119171
++++++++++++++++
5
0.9883419689119171
++++++++++++++++
6
0.9870466321243523
++++++++++++++++
7
0.9870466321243523
++++++++++++++++
8
0.9883419689119171
++++++++++++++++
9
0.9883419689119171
++++++++++++++++
10
0.9883419689119171
++++++++++++++++
11
0.9883419689119171
++++++++++++++++
12
0.9883419689119171
++++++++++++++++
13
0.9883419689119171
++++++++++++++++
14
0.9883419689119171
++++++++++++++++
15
0.9883419689119171
++++++++++++++++
16
0.9883419689119171
++++++++++++++++
17
0.9883419689119171
++++++++++++++++
18
0.9883419689119171
++++++++++++++++
19
0.9883419689119171
++++++++++++++++
20
0.9883419689119171
++++++++++++++++
21
0.9883419689119171
++++++++++++++++
22
0.9883419689119171
++++++++++++++++
23
0.9883419689119171
++++++++++++++++
24
0.9883419689119171
++++++++++++++++
25
0.9883419689119171
++++++++++++++++
26
0.9883419689119171
++++++++++++

In [35]:
joblib.dump (model, "Weiyu_DecisionTree")

['Weiyu_DecisionTree']

# Random Forest

In [36]:
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier

In [37]:
model = ensemble.RandomForestClassifier()
model.fit(X_train, Y_train)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
#finding the best depth
for i in range(20):
    model = ensemble.RandomForestClassifier(max_depth=i+1)
    model.fit(X_train1, Y_train1)
    pred = model.predict(X_test1)
    cm = confusion_matrix(Y_test1, pred)
    print(i+1)
    print((cm[0,0]+cm[1,1])/(sum(sum(cm))))
    print("++++++++++++++++")
#best depth = 12



  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


1
0.9468911917098446
++++++++++++++++
2
0.9520725388601037
++++++++++++++++


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


3
0.9559585492227979
++++++++++++++++
4
0.9559585492227979
++++++++++++++++
5
0.9753886010362695
++++++++++++++++


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


6
0.9792746113989638
++++++++++++++++


  after removing the cwd from sys.path.


7
0.9870466321243523
++++++++++++++++


  after removing the cwd from sys.path.


8
0.9896373056994818
++++++++++++++++


  after removing the cwd from sys.path.


9
0.9922279792746114
++++++++++++++++
10
0.9922279792746114
++++++++++++++++


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


11
0.9948186528497409
++++++++++++++++
12
0.9961139896373057
++++++++++++++++


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


13
0.9922279792746114
++++++++++++++++


  after removing the cwd from sys.path.


14
0.9961139896373057
++++++++++++++++


  after removing the cwd from sys.path.


15
0.9935233160621761
++++++++++++++++


  after removing the cwd from sys.path.


16
0.9948186528497409
++++++++++++++++


  after removing the cwd from sys.path.


17
0.9948186528497409
++++++++++++++++


  after removing the cwd from sys.path.


18
0.9948186528497409
++++++++++++++++


  after removing the cwd from sys.path.


19
0.9935233160621761
++++++++++++++++
20
0.9935233160621761
++++++++++++++++


  after removing the cwd from sys.path.


In [59]:
# Plugging in best depth into original train set
model = ensemble.RandomForestClassifier(max_depth = 12)
model.fit(X_train, Y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [60]:
#evaluating the accuracy of the model on the train set
pred = model.predict(X_train)
cm = confusion_matrix(Y_train, pred)
print(cm)
accuracy = (cm[0,0]+cm[1,1])/(sum(sum(cm)))
print(accuracy)

[[1289    0]
 [   0 1282]]
1.0


In [61]:
#evaluating the accuracy of the model on the test set
pred = model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
print(cm)
accuracy = (cm[0,0]+cm[1,1])/(sum(sum(cm)))
print(accuracy)

[[413  12]
 [  1 431]]
0.9848308051341891


In [62]:
joblib.dump (model, "Weiyu_RandomForest")

['Weiyu_RandomForest']

# XGBoost

In [43]:
from sklearn.ensemble import GradientBoostingClassifier

In [44]:
model = ensemble.GradientBoostingClassifier()
model.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [45]:
#finding the best depth
for i in range(20):
    model = ensemble.GradientBoostingClassifier(max_depth=i+1)
    model.fit(X_train1, Y_train1)
    pred = model.predict(X_test1)
    cm = confusion_matrix(Y_test1, pred)
    print(i+1)
    print((cm[0,0]+cm[1,1])/(sum(sum(cm))))
    print("++++++++++++++++")
#best depth = 6

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


1
0.9689119170984456
++++++++++++++++
2
0.9896373056994818
++++++++++++++++


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


3
0.9922279792746114
++++++++++++++++
4
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


5
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


6
0.9948186528497409
++++++++++++++++


  y = column_or_1d(y, warn=True)


7
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


8
0.9935233160621761
++++++++++++++++


  y = column_or_1d(y, warn=True)


9
0.9935233160621761
++++++++++++++++


  y = column_or_1d(y, warn=True)


10
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


11
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


12
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


13
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


14
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


15
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


16
0.9896373056994818
++++++++++++++++


  y = column_or_1d(y, warn=True)


17
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


18
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


19
0.9922279792746114
++++++++++++++++


  y = column_or_1d(y, warn=True)


20
0.9922279792746114
++++++++++++++++


In [63]:
# Plugging in best depth into original train set
model = ensemble.GradientBoostingClassifier(max_depth = 6)
model.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=6,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [64]:
#evaluating the accuracy of the model on the train set
pred = model.predict(X_train)
cm = confusion_matrix(Y_train, pred)
print(cm)
accuracy = (cm[0,0]+cm[1,1])/(sum(sum(cm)))
print(accuracy)

[[1289    0]
 [   0 1282]]
1.0


In [65]:
#evaluating the accuracy of the model on the test set
pred = model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
print(cm)
accuracy = (cm[0,0]+cm[1,1])/(sum(sum(cm)))
print(accuracy)

[[416   9]
 [  1 431]]
0.9883313885647608


In [66]:
joblib.dump (model, "Weiyu_XGBoost")

['Weiyu_XGBoost']

# Neural Network

In [76]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [110]:
model = Sequential()
model.add(Dense(7, input_dim = 3, activation = "relu"))

In [111]:
model.add(Dropout(0.3))
model.add(Dense(3, activation="relu"))

In [112]:
model.add(Dropout(0.3))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 7)                 28        
                                                                 
 dropout_17 (Dropout)        (None, 7)                 0         
                                                                 
 dense_25 (Dense)            (None, 3)                 24        
                                                                 
 dropout_18 (Dropout)        (None, 3)                 0         
                                                                 
 dense_26 (Dense)            (None, 1)                 4         
                                                                 
Total params: 56
Trainable params: 56
Non-trainable params: 0
_________________________________________________________________


In [113]:
model.compile(loss = "binary_crossentropy", metrics=["accuracy"])

In [114]:
model.fit(X_train, Y_train, batch_size = 10, epochs = 10,  verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fef3193d950>

In [115]:
#evaluating model on test set
model.evaluate(X_train, Y_train)



[0.39983126521110535, 0.8428627252578735]

In [116]:
#evaluating model on test set
model.evaluate(X_test, Y_test)



[0.39020469784736633, 0.8564760684967041]

In [117]:
model.save("Weiyu_NeuralNetwork")

INFO:tensorflow:Assets written to: Weiyu_NeuralNetwork/assets
