In [None]:
#MBA 4931/5931 HW2
#Patrick Hollenbach, William Halm

In [None]:
#1.1.1 Load dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Read the credit-risk CSV from the working directory into the main frame
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')
# Preview the first five rows (head's default)
df.head()

In [None]:
#1.1.1 Handle missing values
# Fill gaps in employment length and interest rate with each column's median.
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form emits
# a FutureWarning in pandas 2.x and leaves df unchanged under copy-on-write.
for col in ['person_emp_length', 'loan_int_rate']:
    df[col] = df[col].fillna(df[col].median())

In [None]:
#1.2.1 Select relevant columns
# Numeric applicant attributes used as clustering inputs
numerical_columns = [
    'person_age',
    'person_income',
    'person_emp_length',
    'cb_person_cred_hist_length',
]
df_numerical = df.loc[:, numerical_columns]

# Standardize values: learn per-column scaling from the selected columns,
# then apply it to produce the array the clustering cells consume
scaler = StandardScaler()
scaler.fit(df_numerical)
df_scaled = scaler.transform(df_numerical)

In [None]:
#1.3.1 Kmeans, determine optimal clusters with elbow method
range_n_clusters = range(2, 11)  # Test for 2 to 10 clusters

# Fit one K-Means model per candidate k on the scaled features and record its
# inertia. The seed and n_init are fixed so re-runs give the same curve.
inertia = []
for k in range_n_clusters:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(df_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against k on a single full-width axes; the "elbow" where the
# curve flattens is read off this chart to choose the cluster count.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(list(range_n_clusters), inertia, marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')

fig.tight_layout()
plt.show()

In [None]:
#1.3.2 Kmeans - optimal clusters is 5 based on elbow method above
optimal_k = 5

# Fit the final model on the scaled features and store each row's label on df
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans.fit(df_scaled)
df['cluster'] = kmeans.labels_

# Preview the clustering inputs next to their assigned cluster
preview_columns = [
    'person_age',
    'person_income',
    'person_emp_length',
    'cb_person_cred_hist_length',
    'cluster',
]
df[preview_columns].head()

In [None]:
#1.3.2 Plot clusters
# Scatter of age against income, colored by the assigned cluster label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='person_age',
    y='person_income',
    hue='cluster',
    palette='viridis',
    alpha=0.6,
    ax=ax,
)
ax.set_xlabel('Person Age')
ax.set_ylabel('Person Income')
ax.set_title('K-Means Clustering')
ax.legend(title='Cluster')
plt.show()

In [None]:
#1.3.2 Clusters with characteristics
# Per-cluster means of loan amount, income and loan_status. The mean of
# loan_status is plotted below as a default percentage, which assumes it is a
# 0/1 flag with 1 = default (TODO confirm against the dataset description).
loan_characteristics = df.groupby('cluster')[['loan_amnt', 'person_income', 'loan_status']].mean()
print(loan_characteristics)

# One bar chart per metric: (column, multiplier, y-axis label, title).
# The multiplier converts the loan_status mean into a percentage.
cluster_charts = [
    ('loan_amnt', 1, 'Average loan amount', 'Average loan amount by cluster'),
    ('person_income', 1, 'Average income', 'Average income by cluster'),
    ('loan_status', 100, 'Average loan default %', 'Average loan default % by cluster'),
]

# NOTE(review): recent seaborn releases may warn about `palette` without `hue`;
# the call is kept as written so it behaves the same on the installed version.
for column, multiplier, y_label, chart_title in cluster_charts:
    plt.figure(figsize=(12, 6))
    sns.barplot(
        x=loan_characteristics.index,
        y=loan_characteristics[column] * multiplier,
        palette='viridis',
    )
    plt.xlabel('Cluster')
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.show()


In [None]:
#1.4.1 4. Interpret the clusters:
#   - What patterns do you observe in the clusters?
# One of the most notable differences between the clusters is age. Clusters 1, 2, and 4 seem to be relatively similar applicants of different age groups. An outlier is cluster 3, which contains the high earners and shows little separation by age or loan amount. There were also a few outlier loan applicants that had large effects on the clusters and their averages.

#   - How do different clusters compare in terms of loan characteristics (e.g., loan amount, income, loan status)?
# Some of the loan characteristics are not well separated by cluster, while other characteristics are major elements of the cluster. Average income was a characteristic that set cluster 3 apart from the others.

In [None]:
#2.1
import numpy as np # linear algebra
import matplotlib.pyplot as plt

import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
    accuracy_score
)
import seaborn as sns
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)
import os

# Print the full path of each file found under the kagglehub cache directory.
# os.walk yields nothing if the directory does not exist, so this is then a no-op.
kaggle_cache_dir = '/home/codespace/.cache/kagglehub/datasets/taweilo/loan-approval-classification-data/versions/1/'
for root, _, names in os.walk(kaggle_cache_dir):
    for name in names:
        print(os.path.join(root, name))


In [None]:
#2.1 Load dataset
# Re-read the raw CSV so the regression section starts from an unmodified
# frame (without the 'cluster' column added during clustering)
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')
# Preview the first 50 rows
df.head(n=50)

In [None]:
#2.1 Preprocess dataset
# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Fill missing numeric values with each column's median. The frame was just
# re-read from disk, so the imputation done for the clustering section no
# longer applies, and LogisticRegression cannot be fitted on NaN inputs.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Identify categorical columns: only the non-numeric ones. Continuous fields
# such as age, income, loan amount and interest rate stay numeric; one-hot
# encoding them would create one dummy column per distinct value.
categorical_cols = df.select_dtypes(exclude='number').columns.tolist()

# present all unique values per categorical columns
for col in categorical_cols:
    print(f'for column {col}, the unique values are')
    print(df[col].unique())

# Apply one-hot encoding to categorical columns; drop_first removes one level
# per column so the dummies are not perfectly collinear
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)



In [None]:
# List the column labels of the encoded frame to inspect the dummy columns that were created
df_encoded.columns

In [None]:
# Average every encoded column within each loan_status group to compare the
# two outcomes side by side
by_status = df_encoded.groupby("loan_status")
grouped_averages = by_status.mean()

# Show the per-status averages
grouped_averages.head()

In [None]:
# Features are every encoded column except the label
target = "loan_status"
selected_features = df_encoded.columns.drop(target)

# Features and target
X = df_encoded[selected_features]
y = df_encoded[target]

# Split data into training and testing sets (80% training, 20% testing),
# stratified so both sets keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

# Standardize features using statistics from the training split only, so no
# information from the test split leaks into the model. Scaling puts columns
# with very different ranges on a common footing, which helps the solver converge.
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# Initialize logistic regression model with a higher iteration cap than the
# default to give the solver room to converge
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train_scaled, y_train)

# Generate predictions and probabilities (column 1 = probability of class 1)
y_pred = log_reg.predict(X_test_scaled)
y_pred_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

# Confusion Matrix, with ticks labelled by the target's class values
cm = confusion_matrix(y_test, y_pred)
class_labels = [f"{target} = 0", f"{target} = 1"]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Classification Report for precision, recall, F1-score
print("Classification Report:\n", classification_report(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# AUC (Area Under the Curve)
auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC:", auc_score)

# Plot ROC Curve against the diagonal of a random classifier
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_score:.2f})", color="blue")
plt.plot([0, 1], [0, 1], "k--", label="Random Classifier")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="best")
plt.grid()
plt.show()

# Results
# The biggest insight gained from this model is the importance of employment length. This had a large effect on predicting the chance of loan default. The confusion matrix helps show that the regression model is mostly accurate. It would be reasonable to rely on this model for decision making.

Group survey:
1. **Course Rating (1-5 scale):** How would you rate this course overall? (1: Poor, 2: Fair, 3: Good, 4: Very Good, 5: Excellent)

(5) The course has been very educational and contains lots of relevant content for data analysis.

2. **Difficulty Level (1-5 scale):** How difficult was this course? (1: Very Easy, 2: Easy, 3: Moderate, 4: Hard, 5: Very Hard)

(4) So far the assignments have been fairly difficult but still doable.

3. **Pros:** What aspects of the course did you find beneficial?
   
   This course has been beneficial in teaching new ways to analyze data. Our previous knowledge contained only basic statistics.

4. **Cons:** What challenges did you face in this course?

  The content of this course can be pretty difficult. Working with code can be frustrating if there is a small unknown error that stops it from working.

5. **Suggestions:** How can this course be improved in future iterations?
  
  Some in-class walkthroughs, where students could write the code during class, would be beneficial.