### Setup
Import the required packages and load the data.

In [None]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Add the parent directory to the Python path to import the core module
sys.path.append(os.path.abspath(os.path.join("..")))

### Load the CSV file from the dataset folder

In [2]:
# `core` is the project-local module made importable by the sys.path tweak above.
from core import get_data_path

# Resolve the dataset's location through the project helper, then read it into
# the DataFrame that the following cells clean and model.
csv_file_path = get_data_path("assignment2/lc_14to16.csv")
data = pd.read_csv(csv_file_path)

  lc = pd.read_csv(csv_file_path)


In [None]:
# ----------------------------
# Data cleaning examples
# ----------------------------
data.iloc[:, 0:20].head()


# Example Task 1: Combine employment length into 3 categories:
#   "< 1 year", "1 year"            -> "0-1 years"
#   "2 years", "3 years", "4 years" -> "2-4 years"
#   everything else                 -> "5+ years"
# A single lookup table keeps the buckets disjoint: "5 years" belongs to
# "5+ years", not to the middle bucket.
emp_length_buckets = {
    "< 1 year": "0-1 years",
    "1 year": "0-1 years",
    "2 years": "2-4 years",
    "3 years": "2-4 years",
    "4 years": "2-4 years",
    "5 years": "5+ years",
    "6 years": "5+ years",
    "7 years": "5+ years",
    "8 years": "5+ years",
    "9 years": "5+ years",
    "10+ years": "5+ years",
}
data["emp_length"] = data["emp_length"].replace(emp_length_buckets)

# Optional check: data["emp_length"].value_counts()

# Example Task 2: remove "ANY" category data for home_ownership
data = data[data["home_ownership"] != "ANY"]

data["home_ownership"].value_counts()


# Example Task 3: Subset data with only the columns used for modelling.
keep_columns = [
    "loan_amnt",
    "annual_inc",
    "dti",
    "acc_now_delinq",
    "tot_coll_amt",
    "tot_cur_bal",
    "total_rev_hi_lim",
    "grade",
    "emp_length",
    "home_ownership",
]

# .copy() detaches the result from the filtered frame, so later column
# assignments modify a real DataFrame instead of triggering pandas'
# SettingWithCopyWarning.
data = data[keep_columns].copy()
data.head()

In [None]:
# Standardize (z-score) the numerical predictors and time the transformation.
to_scale = [
    "loan_amnt",
    "annual_inc",
    "dti",
    "acc_now_delinq",
    "tot_coll_amt",
    "tot_cur_bal",
    "total_rev_hi_lim",
]

start = datetime.now()
scaler = StandardScaler()
scaler.fit(data[to_scale])
data[to_scale] = scaler.transform(data[to_scale])
stop = datetime.now()
print(stop - start)

data.head()

# ---------------------------------------------------------------------------------
# Missing-value report: number of nulls in each column
# ---------------------------------------------------------------------------------
data.isna().sum()

# ---------------------------------------------------------------------------------
# Drop every row that contains at least one missing value.
# Use with care: this can discard a large share of the data.
# Multidimensional imputation is covered later as an alternative.
# ---------------------------------------------------------------------------------
data = data.dropna(axis=0, how="any")
data.isna().sum()


In [None]:
# ----------------------------------------------------------------
# Collapse the loan grade into three levels, stored in a new column
# "grade_tertiary":  A, B -> 3.High   C -> 2.Medium   D, E, F, G -> 1.Low
# ----------------------------------------------------------------
start = datetime.now()

grade_to_tier = {
    "A": "3.High",
    "B": "3.High",
    "C": "2.Medium",
    "D": "1.Low",
    "E": "1.Low",
    "F": "1.Low",
    "G": "1.Low",
}
data["grade_tertiary"] = data["grade"].replace(grade_to_tier)
data["grade_tertiary"].value_counts()

# Keep only the High and Low tiers; the Medium rows are left out of lc2.
is_medium = data["grade_tertiary"] == "2.Medium"
lc2 = data[~is_medium]

stop = datetime.now()
print(stop - start)

# Tier counts before and after removing the Medium rows.
print(data["grade_tertiary"].value_counts())
print(lc2["grade_tertiary"].value_counts())

lc2.head()



In [None]:
# One-hot encode the categorical features "emp_length" and "home_ownership".

start = datetime.now()
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4, so `OneHotEncoder(sparse=False)` raises a TypeError there.
# The encoder is not used in this cell: pd.get_dummies does the encoding.
ohe = OneHotEncoder(sparse_output=False)

to_encode = ["emp_length", "home_ownership"]
# Each listed column is replaced by one indicator column per category; all
# other columns of lc2 are carried over unchanged.
lc_encoded = pd.get_dummies(lc2, columns=to_encode)

stop = datetime.now()
print(stop - start)

lc_encoded.head()


In [None]:
# Fit a logistic regression that predicts grade_tertiary from every other
# column except the raw grade.

target_column = "grade_tertiary"
X = lc_encoded.drop(columns=[target_column, "grade"])
y = lc_encoded[target_column]

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

logit = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = logit.predict(X_test)
accuracy_score(y_test, y_pred)

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))

# Raw confusion matrix
confusion_matrix(y_test, y_pred)

# Labeled confusion matrix drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)
tick_labels = ["1.Low", "3.High"]

sns.heatmap(cm, annot=True, fmt="d", xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


# Class order used by the fitted model
print(logit.classes_)

In [None]:
# ---------------------------------------------------
# Attach the predicted label and predicted probability to the dataframe
# ---------------------------------------------------

lc_encoded["predicted"] = logit.predict(X)
# Column 1 of predict_proba is the probability of logit.classes_[1].
all_probabilities = logit.predict_proba(X)
lc_encoded["predicted_prob"] = all_probabilities[:, 1]
lc_encoded.head()


# ---------------------------------------------------
# ROC curve on the held-out test set
# ---------------------------------------------------

test_scores = logit.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores, pos_label="3.High")
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
# Dashed diagonal as a visual baseline
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()


In [None]:
# Now repeat the above by changing the threshold to 0.6

y_pred_06 = (logit.predict_proba(X_test)[:,1] > 0.6).astype(int)

# # Convert predicted labels back to original label types
y_pred_06 = np.where(y_pred_06 == 1, "3.High", "1.Low")

print(classification_report(y_test, y_pred_06))

cm = confusion_matrix(y_test, y_pred_06)

sns.heatmap(cm, annot=True, fmt="d", xticklabels=["1.Low", "3.High"], yticklabels=["1.Low", "3.High"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ---------------------------------------------------
# ROC curve
# ---------------------------------------------------

fpr, tpr, thresholds = roc_curve(y_test, logit.predict_proba(X_test)[:,1], pos_label="3.High")
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0,1], [0,1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()


In [None]:
# ---------------------------------------------------
# Losgistic regression sklearn page:https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html

# Hyperparameter tuning for logistic regression
# the most important hyperparameter for logistic regression is C, which is the inverse of the regularization strength   
# smaller values of C specify stronger regularization
# Regularization is a technique used to prevent overfitting by penalizing large coefficients
# The default value of C is 1

param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
logit = LogisticRegression(max_iter = 1000)
logit_cv = GridSearchCV(logit, param_grid, cv = 5)
logit_cv.fit(X_train, y_train)

print(logit_cv.best_params_)
print(logit_cv.best_score_)
print(logit_cv.best_estimator_)
print(logit_cv.best_index_)
print(logit_cv.best_estimator_.C)



# create a df with the results of the grid search
# This is done to see the results of the grid search
results = pd.DataFrame(logit_cv.cv_results_)
results

In [None]:
# ---------------------------------------------------
# Replicating the above steps for the KNN classifier
# ---------------------------------------------------
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

sns.heatmap(cm, annot=True, fmt="d", xticklabels=["1.Low", "3.High"], yticklabels=["1.Low", "3.High"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ---------------------------------------------------
# hyperparameter tuning for KNN
# ---------------------------------------------------

param_grid = {"n_neighbors": [5, 7, 9, 11 ]}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv = 5)

knn_cv.fit(X_train, y_train)

print(knn_cv.best_params_)
print(knn_cv.best_score_)
print(knn_cv.best_estimator_)

results = pd.DataFrame(knn_cv.cv_results_)
results



In [None]:
# ---------------------------------------------------
# And now Replicating for the SGD classifier
# ---------------------------------------------------
sgd = SGDClassifier(loss="log")
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
accuracy_score(y_test, y_pred)

print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)

sns.heatmap(cm, annot=True, fmt="d", xticklabels=["1.Low", "3.High"], yticklabels=["1.Low", "3.High"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ---------------------------------------------------
# hyperparameter tuning for SGD
# ---------------------------------------------------

param_grid = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
sgd = SGDClassifier()
sgd_cv = GridSearchCV(sgd, param_grid, cv = 5)

sgd_cv.fit(X_train, y_train)

print(sgd_cv.best_params_)
print(sgd_cv.best_score_)
print(sgd_cv.best_estimator_)
print(sgd_cv.best_index_)
print(sgd_cv.best_estimator_.alpha)

results = pd.DataFrame(sgd_cv.cv_results_)
results



In [None]:
# Summary statistics on the following columns: loan_amnt",  "annual_inc", "dti", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim"
lc2[["loan_amnt",  "annual_inc", "dti", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim"]].describe()

In [None]:
# eliminate rows with missing values in the following columns: loan_amnt",  "annual_inc", "dti", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim"
lc3 = lc2.dropna(subset=["loan_amnt",  "annual_inc", "dti", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim"])
lc3.shape

In [None]:
# Run logistic regression on grade_tertiary as the target variable and some numerical  as predictors
# Use the following predictors: "loan_amnt", "int_rate", "annual_inc", "dti", "fico_range_low", "fico_range_high", "revol_bal", "revol_util", "total_pymnt", "total_rec_prncp", "total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim"

X = lc3[["loan_amnt",  "annual_inc", "dti", "acc_now_delinq", "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim"]]

y = lc3["grade_tertiary"]


# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

accuracy_score(y_test, y_pred)


# Classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
confusion_matrix(y_test, y_pred)
