In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from tabulate import tabulate
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [47]:
data = pd.read_excel("/Users/yamanjoshi/Downloads/W4.1_CreditRisk.xls")

In [48]:
data.head(15)

Unnamed: 0,Checking Acct,Credit Hist,Purpose,Savings Acct,Employment,Gender,Personal Status,Housing,Job,Telephone,Foreign,Months Acct (Added 1 to original Months Acct Variable),Residence Time,Age subtracted 1 from original Age variable,Credit Standing
0,0Balance,Current,Small Appliance,Low,Short,M,Single,Own,Unskilled,Yes,Yes,13,3,23,Good
1,0Balance,Current,Furniture,MedLow,Unemployed,M,Divorced,Own,Skilled,Yes,Yes,25,1,32,Bad
2,No Acct,Bank Paid,Car New,Low,Long,M,Single,Own,Management,No,Yes,19,4,38,Bad
3,Low,Current,Furniture,Low,Short,M,Single,Own,Unskilled,Yes,Yes,13,2,36,Bad
4,Low,Delay,Education,MedLow,Medium,M,Single,Rent,Skilled,No,Yes,40,3,31,Good
5,No Acct,Critical,Furniture,No Acct,Short,M,Married,Own,Skilled,Yes,No,11,1,25,Good
6,0Balance,Current,Car New,Low,Short,M,Married,Own,Unskilled,Yes,Yes,13,3,26,Good
7,0Balance,Critical,Business,Low,Very Short,M,Single,Own,Unskilled,Yes,Yes,14,1,27,Good
8,High,Current,Small Appliance,Low,Short,M,Single,Own,Skilled,Yes,Yes,37,2,25,Bad
9,No Acct,Current,Small Appliance,No Acct,Very Short,F,Divorced,Own,Skilled,No,Yes,25,1,43,Bad


In [49]:
print("Number of attributes (including the target variable): ", len(data.columns))
print("\nNumber of records: :", len(data))
print("\nNames of the attributes: ")
for attr in data.columns[:-1]:
    print(str(attr) + " | ", end ="")
print("\n\nName of the target variable: ", data.columns[-1])

Number of attributes (including the target variable):  15

Number of records: : 425

Names of the attributes: 
Checking Acct | Credit Hist | Purpose | Savings Acct | Employment | Gender | Personal Status | Housing | Job | Telephone | Foreign | Months Acct (Added 1 to original Months Acct Variable) | Residence Time | Age subtracted 1 from original Age variable | 

Name of the target variable:  Credit Standing


In [50]:
data.describe()

Unnamed: 0,Months Acct (Added 1 to original Months Acct Variable),Residence Time,Age subtracted 1 from original Age variable
count,425.0,425.0,425.0
mean,22.896471,2.84,34.397647
std,12.267599,1.087146,11.045126
min,5.0,1.0,18.0
25%,13.0,2.0,26.0
50%,19.0,3.0,32.0
75%,28.0,4.0,41.0
max,73.0,4.0,73.0


In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 15 columns):
 #   Column                                                  Non-Null Count  Dtype 
---  ------                                                  --------------  ----- 
 0   Checking Acct                                           425 non-null    object
 1   Credit Hist                                             425 non-null    object
 2   Purpose                                                 425 non-null    object
 3   Savings Acct                                            425 non-null    object
 4   Employment                                              425 non-null    object
 5   Gender                                                  424 non-null    object
 6   Personal Status                                         425 non-null    object
 7   Housing                                                 425 non-null    object
 8   Job                                               

In [52]:
data.isna().sum()

Checking Acct                                             0
Credit Hist                                               0
Purpose                                                   0
Savings Acct                                              0
Employment                                                0
Gender                                                    1
Personal Status                                           0
Housing                                                   0
Job                                                       0
Telephone                                                 0
Foreign                                                   0
Months Acct (Added 1 to original Months Acct Variable)    0
Residence Time                                            0
Age subtracted 1 from original Age variable               0
Credit Standing                                           0
dtype: int64

In [53]:
#replace gender missing values with common value
data.Gender.value_counts()

Gender
M    289
F    135
Name: count, dtype: int64

In [54]:
data.fillna({"Gender": "M"}, inplace = True)
data.isna().sum()

Checking Acct                                             0
Credit Hist                                               0
Purpose                                                   0
Savings Acct                                              0
Employment                                                0
Gender                                                    0
Personal Status                                           0
Housing                                                   0
Job                                                       0
Telephone                                                 0
Foreign                                                   0
Months Acct (Added 1 to original Months Acct Variable)    0
Residence Time                                            0
Age subtracted 1 from original Age variable               0
Credit Standing                                           0
dtype: int64

In [55]:
#non ordinal with dummy variables
dummy_columns = ["Purpose", "Gender", "Personal Status", "Housing", "Telephone", "Foreign"]
data = pd.get_dummies(data, columns = dummy_columns, dtype = "int")
data.head(10)

Unnamed: 0,Checking Acct,Credit Hist,Savings Acct,Employment,Job,Months Acct (Added 1 to original Months Acct Variable),Residence Time,Age subtracted 1 from original Age variable,Credit Standing,Purpose_Business,...,Personal Status_Divorced,Personal Status_Married,Personal Status_Single,Housing_Other,Housing_Own,Housing_Rent,Telephone_No,Telephone_Yes,Foreign_No,Foreign_Yes
0,0Balance,Current,Low,Short,Unskilled,13,3,23,Good,0,...,0,0,1,0,1,0,0,1,0,1
1,0Balance,Current,MedLow,Unemployed,Skilled,25,1,32,Bad,0,...,1,0,0,0,1,0,0,1,0,1
2,No Acct,Bank Paid,Low,Long,Management,19,4,38,Bad,0,...,0,0,1,0,1,0,1,0,0,1
3,Low,Current,Low,Short,Unskilled,13,2,36,Bad,0,...,0,0,1,0,1,0,0,1,0,1
4,Low,Delay,MedLow,Medium,Skilled,40,3,31,Good,0,...,0,0,1,0,0,1,1,0,0,1
5,No Acct,Critical,No Acct,Short,Skilled,11,1,25,Good,0,...,0,1,0,0,1,0,0,1,1,0
6,0Balance,Current,Low,Short,Unskilled,13,3,26,Good,0,...,0,1,0,0,1,0,0,1,0,1
7,0Balance,Critical,Low,Very Short,Unskilled,14,1,27,Good,1,...,0,0,1,0,1,0,0,1,0,1
8,High,Current,Low,Short,Skilled,37,2,25,Bad,0,...,0,0,1,0,1,0,0,1,0,1
9,No Acct,Current,No Acct,Very Short,Skilled,25,1,43,Bad,0,...,1,0,0,0,1,0,1,0,0,1


In [56]:
# For "Checking Acct" column
oe_checking = OrdinalEncoder(categories = [["No Acct", "0Balance", "Low", "High"]])
data["Checking Acct"] = oe_checking.fit_transform(data["Checking Acct"].values.reshape(-1, 1))

# For "Savings Acct" column
oe_savings = OrdinalEncoder(categories = [["No Acct", "Low", "MedLow", "MedHigh", "High"]])
data["Savings Acct"] = oe_savings.fit_transform(data["Savings Acct"].values.reshape(-1, 1))

# For "credit hist" column
oe_credit = OrdinalEncoder(categories = [["Critical", "Delay", "Current", "Bank Paid", "All Paid"]])
data["Credit Hist"] = oe_credit.fit_transform(data["Credit Hist"].values.reshape(-1, 1))

# For "Employment" column
oe_empl = OrdinalEncoder(categories = [["Unemployed", "Very Short", "Short", "Medium", "Long"]])
data["Employment"] = oe_empl.fit_transform(data["Employment"].values.reshape(-1, 1))

# For "Job" column
oe_job = OrdinalEncoder(categories = [["Unemployed", "Unskilled", "Skilled", "Management"]])
data["Job"] = oe_job.fit_transform(data["Job"].values.reshape(-1, 1))

In [14]:
#Move target variable to end
credit_standing = data["Credit Standing"]
data = data.drop(columns = "Credit Standing")

In [15]:
data = data.assign(credit_standing = credit_standing)

In [16]:
data.head()

Unnamed: 0,Checking Acct,Credit Hist,Savings Acct,Employment,Job,Months Acct (Added 1 to original Months Acct Variable),Residence Time,Age subtracted 1 from original Age variable,Purpose_Business,Purpose_Car New,...,Personal Status_Married,Personal Status_Single,Housing_Other,Housing_Own,Housing_Rent,Telephone_No,Telephone_Yes,Foreign_No,Foreign_Yes,credit_standing
0,1.0,2.0,1.0,2.0,1.0,13,3,23,0,0,...,0,1,0,1,0,0,1,0,1,Good
1,1.0,2.0,2.0,0.0,2.0,25,1,32,0,0,...,0,0,0,1,0,0,1,0,1,Bad
2,0.0,3.0,1.0,4.0,3.0,19,4,38,0,1,...,0,1,0,1,0,1,0,0,1,Bad
3,2.0,2.0,1.0,2.0,1.0,13,2,36,0,0,...,0,1,0,1,0,0,1,0,1,Bad
4,2.0,1.0,2.0,3.0,2.0,40,3,31,0,0,...,0,1,0,0,1,1,0,0,1,Good


In [17]:
#split data
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.06, random_state = 42)

In [18]:
#encode target variable to integer
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [19]:
#Scaling
sc = StandardScaler()
x_train_sc = x_train.copy()
x_train_sc.iloc[:, :8] = sc.fit_transform(x_train.iloc[:, :8])
x_test_sc = x_test.copy()
x_test_sc.iloc[:, :8] = sc.transform(x_test.iloc[:, :8])

In [20]:
#Decision tree
tree = DecisionTreeClassifier(random_state = 42)
tree.fit(x_train, y_train)


In [21]:
#Cross validation with decision tree
tree_score = cross_val_score(tree, x_train, y_train, scoring = "accuracy")
print("Mean accuracy for Decision Tree Model:", round(np.mean(tree_score), 3))
print("Mean standard deviation for Decision Tree Model:", round(np.std(tree_score), 3))

Mean accuracy for Decision Tree Model: 0.609
Mean standard deviation for Decision Tree Model: 0.037


In [27]:
#Naive bayes
nb = GaussianNB()
nb.fit(x_train, y_train)

In [28]:
nb_score = cross_val_score(nb, x_train, y_train, cv = 10, scoring = "accuracy")
print("Mean accuracy for Naive Bayes Model:", round(np.mean(nb_score), 3))
print("Mean standard deviation for Naive Bayes Model:", round(np.std(nb_score), 3))

Mean accuracy for Naive Bayes Model: 0.611
Mean standard deviation for Naive Bayes Model: 0.077


In [29]:
nb_final = nb.score(x_test, y_test)
print("Accuracy score on the hold-out data, Naive Bayes model: ", round(nb_final, 3))

Accuracy score on the hold-out data, Naive Bayes model:  0.615


In [30]:
#Random forest
rf = RandomForestClassifier(random_state = 42)
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.7307692307692307

In [31]:
rf_tuned = RandomForestClassifier(max_depth = 7, min_samples_split = 4, min_samples_leaf = 2, random_state = 42)
rf_tuned_score = cross_val_score(rf_tuned, x_train, y_train, cv = 10, scoring = "accuracy")
print("Mean accuracy for Random Forest Model:", round(np.mean(rf_tuned_score), 3))
print("Mean standard deviation for Random Forest Model:", round(np.std(rf_tuned_score), 3))

Mean accuracy for Random Forest Model: 0.712
Mean standard deviation for Random Forest Model: 0.061


In [32]:
rf_tuned.fit(x_train, y_train)
rf_tuned_final = rf_tuned.score(x_test, y_test)
print("Accuracy score on the hold-out data, Random Forest model: ", round(rf_tuned_final, 3))

Accuracy score on the hold-out data, Random Forest model:  0.769


In [33]:
#SVM
svm = SVC(probability = True, random_state = 42)
svm.fit(x_train_sc, y_train)

In [34]:
svm_score = cross_val_score(svm, x_train_sc, y_train, cv = 10, scoring = "accuracy")
print("Mean accuracy for SVM Model:", round(np.mean(svm_score), 3))
print("Mean standard deviation for SVM Model:", round(np.std(svm_score), 3))

Mean accuracy for SVM Model: 0.709
Mean standard deviation for SVM Model: 0.055


In [35]:
svm_final = svm.score(x_test_sc, y_test)
print("Accuracy score on the hold-out data, SVM model: ", round(svm_final, 3))

Accuracy score on the hold-out data, SVM model:  0.808


In [38]:
#logistic regression
lr = LogisticRegression()
lr.fit(x_train_sc, y_train)

In [39]:
lr_score = cross_val_score(lr, x_train_sc, y_train, cv = 10, scoring = "accuracy")
print("Mean accuracy for Logistic Regression Model:", round(np.mean(lr_score), 3))
print("Mean standard deviation for Logistic Regression Model:", round(np.std(lr_score), 3))

Mean accuracy for Logistic Regression Model: 0.687
Mean standard deviation for Logistic Regression Model: 0.052


In [40]:
lr_final = lr.score(x_test_sc, y_test)
print("Accuracy score on the hold-out data, Logistic Regression model: ", round(lr_final, 3))

Accuracy score on the hold-out data, Logistic Regression model:  0.692
