# Imports

In [55]:
import pandas as pd
from sklearn import linear_model, tree
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Data Exploration

In [45]:
df = pd.read_csv("Credit Card Default II (balance).csv")
df

Unnamed: 0,income,age,loan,default
0,66155.925100,59.017015,8106.532131,0
1,34415.153970,48.117153,6564.745018,0
2,57317.170060,63.108049,8020.953296,0
3,42709.534200,45.751972,6103.642260,0
4,66952.688850,18.584336,8770.099235,1
...,...,...,...,...
3423,60903.191726,21.933924,10367.081892,1
3424,62235.644695,25.033016,8083.900063,1
3425,25597.850590,26.350344,3810.110335,1
3426,48164.194974,23.141976,6529.652892,1


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3428 entries, 0 to 3427
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   income   3428 non-null   float64
 1   age      3428 non-null   float64
 2   loan     3428 non-null   float64
 3   default  3428 non-null   int64  
dtypes: float64(3), int64(1)
memory usage: 107.2 KB


In [47]:
df.describe()

Unnamed: 0,income,age,loan,default
count,3428.0,3428.0,3428.0,3428.0
mean,45136.875975,34.79595,5591.986695,0.5
std,14425.486619,12.840055,3174.52243,0.500073
min,20014.48947,-52.42328,1.37763,0.0
25%,32827.211177,25.171939,3103.357467,0.0
50%,45546.632873,30.086842,5540.305046,0.5
75%,57514.79413,44.369461,7809.377943,1.0
max,69995.68558,63.971796,13766.05124,1.0


In [48]:
# To understand how many people default-ed
df['default'].value_counts()

1    1714
0    1714
Name: default, dtype: int64

# Data Cleaning

In [50]:
# check for any null values
df.columns[df.isna().any()]

Index([], dtype='object')

In [51]:
# retrieve mean value of Age
age_mean = df["age"].mean()
age_mean

34.795949606582

# Modelling

In [68]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=111)

### Using Linear Regression
Taking client's income, age and loan amount to predict whether they default

In [60]:
X = df.loc[:,["income","loan","age"]]
Y = df.loc[:,'default']

In [103]:
model = linear_model.LinearRegression()
model.fit(X,Y)
pred = model.predict(X_test)
print("Linear Regression accuracy: {0} %".format(round(model.score(X_test,Y_test) * 100, 2)))

Linear Regression accuracy: 64.75 %


In [105]:
linear_regression_rmse = mean_squared_error(Y_test,pred)**0.5
print(linear_regression_rmse)

0.29684319535433173


### Using Decision Tree

In [100]:
model = tree.DecisionTreeClassifier()
model.fit(X_train, Y_train)

DecisionTreeClassifier()

In [101]:
pred = model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
accuracy = (cm[0,0] + cm[1,1])/(sum(sum(cm)))
accuracy

0.9918319719953326

In [102]:
predict_client = model.predict([[60000,4444,35]])
predict_client[0]

0

### Using Random Forest

In [91]:
model = RandomForestClassifier()
model.fit(X_train, Y_train)

RandomForestClassifier()

In [92]:
pred = model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
accuracy = (cm[0,0] + cm[1,1])/(sum(sum(cm)))
accuracy

0.9906651108518086

In [93]:
predict_client = model.predict([[600000,44444,30]])
predict_client[0]

1

### Using GradientBoostingClassifier

In [106]:
model = GradientBoostingClassifier(random_state=260322)
model.fit(X_train, Y_train)

GradientBoostingClassifier(random_state=260322)

In [107]:
pred = model.predict(X_test)
cm = confusion_matrix(Y_test, pred)
accuracy = (cm[0,0] + cm[1,1])/(sum(sum(cm)))
accuracy

0.9859976662777129

In [109]:
predict_client = model.predict([[20000,44444,30]])
predict_client[0]

1