In [None]:
#  Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Apply seaborn's "whitegrid" theme to the plots drawn in this notebook.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline, beneath the cell that draws them.
%matplotlib inline

In [None]:
# Load the credit-risk data set into `df`.
# In Colab, upload "credit_risk_dataset.csv" to the session first so that the
# relative path below resolves.
df = pd.read_csv("credit_risk_dataset.csv")
print(f"Dataset shape: {df.shape}")
# Preview the first rows (rich display as the cell's last expression).
df.head()

In [None]:
# Impute missing values in `df`: numeric columns get their median, text
# (object) columns get their most frequent value.
# NOTE(review): the statistics are computed over every row, i.e. before any
# train/test split performed later in the notebook.

# 'number' matches every numeric dtype (int32, float32, nullable ints, ...),
# not only int64/float64, so no numeric column is silently skipped.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Fill missing numeric values with the per-column median.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Fill missing categorical values with the per-column mode.
for col in categorical_cols:
    col_modes = df[col].mode()
    # mode() is empty when the whole column is NaN; there is nothing sensible
    # to fill with in that case, so leave the column as-is instead of raising.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes.iloc[0])

print("Missing values after imputation:")
print(df.isnull().sum())

In [None]:
# K-Means preparation: pick the clustering features, standardize them, and
# draw an elbow curve (inertia vs. k) to choose the number of clusters.
cluster_features = ['person_age', 'person_income', 'person_emp_length',
                    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
                    'cb_person_cred_hist_length']
print("Columns used for clustering:", cluster_features)

# Work on a copy so the clustering input is decoupled from `df`.
X_cluster = df[cluster_features].copy()

# Standardize so every feature contributes on the same scale to the
# Euclidean distances K-Means minimizes.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Elbow method: fit K-Means for k = 1..10 and record the inertia of each fit.
inertia = []
K_range = range(1, 11)
for k in K_range:
    # n_init is pinned explicitly: scikit-learn 1.4 changed the default from
    # 10 restarts to 'auto' (a single run for k-means++), so relying on the
    # default makes the curve depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8,5))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method to Determine Optimal k')
plt.show()


In [None]:
# Fit the final K-Means model. k = 3 is the value read off the elbow plot;
# adjust `optimal_k` if the plot suggests a different bend.
optimal_k = 3
# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10
# restarts to 'auto', so relying on the default makes the cluster assignment
# depend on the installed version.
kmeans_model = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans_model.fit_predict(X_scaled)

# Attach the cluster label to each row of `df` and show how many rows fall
# into each cluster.
df['cluster'] = cluster_labels
print("Cluster counts:")
print(df['cluster'].value_counts())

In [None]:
# Project the standardized clustering features onto their first two principal
# components so the clusters can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=cluster_labels,
    palette='viridis',
    ax=ax,
)
ax.set_title('Loan Applicant Clusters (PCA Projection)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
plt.show()

# Per-cluster means of the key loan columns; the mean of `loan_status` is the
# average of that column within each cluster.
summary_columns = ['person_income', 'loan_amnt', 'loan_int_rate', 'loan_status']
cluster_summary = df.groupby('cluster')[summary_columns].mean()
print("Cluster Summary (Mean Values):")
print(cluster_summary)

In [None]:
# Baseline classifier: predict `loan_status` from the one-hot encoded
# `loan_intent` column alone.
print("Unique values in 'loan_intent':", df['loan_intent'].unique())

# One-hot encode `loan_intent`; drop_first=True drops the first level.
df_encoded = pd.get_dummies(df, columns=['loan_intent'], drop_first=True)
intent_mask = df_encoded.columns.str.startswith('loan_intent_')
encoded_columns = df_encoded.columns[intent_mask].tolist()
print("Encoded loan_intent columns:", encoded_columns)

# Features are only the loan_intent dummies; the target is loan_status.
X_class = df_encoded[encoded_columns]
y_class = df_encoded['loan_status']

# 70/30 split, stratified on the target so both parts keep its class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_class,
    y_class,
    test_size=0.3,
    random_state=42,
    stratify=y_class,
)

# class_weight='balanced' re-weights the classes inversely to their frequency.
logreg = LogisticRegression(
    max_iter=1000, random_state=42, class_weight='balanced'
).fit(X_train, y_train)

# Score the held-out 30%.
y_pred = logreg.predict(X_test)

conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 reports 0 instead of warning when a metric's denominator is 0.
precision, recall, f1 = (
    metric(y_test, y_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

print("\nLogistic Regression Model Evaluation (with class weights):")
print("Confusion Matrix:", conf_mat, sep="\n")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
# Expanded-feature logistic regression tuned with GridSearchCV.
# Goal: check whether a small grid over C and the penalty type improves on the
# un-tuned model. The grid is deliberately basic.
# GridSearchCV is expected to be imported in the notebook's import cell.

# 1. Select additional numerical features
features = ['person_income', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'person_age']
df_features = df[features].copy()

# 2. One-hot encode the 'loan_intent' categorical variable
df_intent = pd.get_dummies(df['loan_intent'], prefix='loan_intent', drop_first=True)

# 3. Combine the numerical features with the one-hot encoded features
#    (X_full is left unscaled; scaling happens after the split below).
X_full = pd.concat([df_features, df_intent], axis=1)
y_full = df['loan_status']

# 4. Split FIRST (70% training, 30% testing, stratified on the target) so the
#    test rows never influence the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.3, random_state=42, stratify=y_full
)

# 5. Standardize the numerical features. The scaler is fit on the training
#    split only and then applied to the test split; fitting it on the full
#    data would leak test-set means/variances into training.
#    The explicit copies avoid writing into views of X_full.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[features] = scaler.fit_transform(X_train[features])
X_test[features] = scaler.transform(X_test[features])

# 6. Hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # liblinear supports both l1 and l2
}

# 5-fold CV on the training split; candidates are ranked by precision.
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_grid, cv=5, scoring='precision', n_jobs=-1
)
grid_search.fit(X_train, y_train)

print("Best parameters from GridSearchCV:", grid_search.best_params_)

# 7. Evaluate the best model from grid search on the held-out test split
best_model = grid_search.best_estimator_
y_pred_grid = best_model.predict(X_test)

print("\nGridSearchCV Best Model Evaluation:")
print(classification_report(y_test, y_pred_grid, zero_division=0))

# 8. Plot the confusion matrix (rows = actual class, columns = predicted class)
conf_mat_grid = confusion_matrix(y_test, y_pred_grid)
plt.figure(figsize=(6,5))
sns.heatmap(conf_mat_grid, annot=True, fmt="d", cmap='Blues',
            xticklabels=['Predicted Non-default','Predicted Default'],
            yticklabels=['Actual Non-default','Actual Default'])
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.title("Confusion Matrix (Best GridSearchCV Model)")
plt.show()


In [None]:
# Expanded feature set for improved classification: logistic regression on
# five numeric features plus the one-hot encoded `loan_intent`.

# 1. Select additional numerical features
features = ['person_income', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'person_age']
df_features = df[features].copy()

# 2. One-hot encode the 'loan_intent' categorical variable
df_intent = pd.get_dummies(df['loan_intent'], prefix='loan_intent', drop_first=True)

# 3. Combine the numerical features with the one-hot encoded features
#    (X_full is left unscaled; scaling happens after the split below).
X_full = pd.concat([df_features, df_intent], axis=1)
y_full = df['loan_status']

# 4. Split FIRST (70% training, 30% testing, stratified on the target) so the
#    test rows never influence the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.3, random_state=42, stratify=y_full
)

# 5. Standardize the numerical features. The scaler is fit on the training
#    split only and then applied to the test split; fitting it on the full
#    data would leak test-set means/variances into training.
#    The explicit copies avoid writing into views of X_full.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[features] = scaler.fit_transform(X_train[features])
X_test[features] = scaler.transform(X_test[features])

# 6. Train a logistic regression model with balanced class weights
logreg_full = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logreg_full.fit(X_train, y_train)

# 7. Predict on the test set
y_pred_full = logreg_full.predict(X_test)

# 8. Evaluate model performance
#    zero_division=0 reports 0 instead of warning when a denominator is 0.
conf_mat_full = confusion_matrix(y_test, y_pred_full)
accuracy_full = accuracy_score(y_test, y_pred_full)
precision_full = precision_score(y_test, y_pred_full, zero_division=0)
recall_full = recall_score(y_test, y_pred_full, zero_division=0)
f1_full = f1_score(y_test, y_pred_full, zero_division=0)

print("Expanded Feature Logistic Regression Model Evaluation:")
print("Confusion Matrix:")
print(conf_mat_full)
print(f"Accuracy: {accuracy_full:.4f}")
print(f"Precision: {precision_full:.4f}")
print(f"Recall: {recall_full:.4f}")
print(f"F1 Score: {f1_full:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_full, zero_division=0))


Questions:

1. We rate this class a 5

2. For difficulty, we rate this class a 3.

3. Much of the material we are learning is directly applicable to our careers, and it also lets us learn new methods and other applied areas of finance. The machine learning and regression components have been especially helpful because some of us are going into the quant field.

4. The main challenge is the learning curve on the math side. We can code the logic, but we would like a stronger grasp of the underlying math so that we can tell what went wrong when a model isn't correct.

5. We think you're doing a great job. Our only suggestion would be to recommend more GitHub accounts to look at, or research papers, for applications.
