In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.stats import skew, kurtosis
import datetime
import requests, zipfile, io

Problem 1:

In [None]:
# Load the credit-risk records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')

In [None]:
# Sanity check of the freshly loaded frame: dimensions first, then the first rows.
initial_shape = df.shape
print("Initial data shape:", initial_shape)
print(df.head(n=5))

In [None]:
# Count the null entries in every column so the effect of the dropna step is visible.
missing_counts = df.isna().sum()
print("\nMissing values per column:", missing_counts, sep="\n")

In [None]:
# Keep only fully populated rows: any row holding at least one missing value is removed.
# Note that this rebinds `df`, so every later cell works on the filtered frame.
df = df.dropna(axis=0, how='any')
print("\nData shape after dropping missing values:", df.shape)

In [None]:
# Numeric columns used as the clustering feature set (and later in the per-cluster summary).
numerical_cols = [
    # borrower attributes
    'person_age', 'person_income', 'person_emp_length',
    # loan attributes
    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
    # credit-bureau attribute
    'cb_person_cred_hist_length',
]

In [None]:
# Standardise the numeric features before clustering; the fitted scaler stays
# available as `scaler` and the scaled matrix as `X_num`.
scaler = StandardScaler().fit(df[numerical_cols])
X_num = scaler.transform(df[numerical_cols])

In [None]:
# Elbow search: fit K-Means for k = 1..10 on the scaled features and record the
# inertia (within-cluster sum of squares) of each fit.
#
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2/1.3), so relying on
# the default makes the inertia curve depend on the installed version.
inertia = []
K_range = range(1, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_num)
    inertia.append(kmeans.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters tried.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(K_range, inertia, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Final clustering. optimal_k is hard-coded (presumably read off the elbow
# plot -- confirm against the figure if the data changes).
#
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so without it the cluster assignments (and the
# per-cluster statistics derived from them) can differ between versions.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
# Store each row's cluster label alongside the original (unscaled) columns.
df['cluster'] = kmeans.fit_predict(X_num)

In [None]:
# Per-cluster means of the numeric features plus loan_status.
summary_cols = numerical_cols + ['loan_status']
cluster_summary = df.groupby('cluster')[summary_cols].mean()
print("\nCluster Summary (average values per cluster):", cluster_summary, sep="\n")

In [None]:
# Income vs. loan amount, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(6, 4))
scatter = ax.scatter(
    df['person_income'],
    df['loan_amnt'],
    c=df['cluster'],
    cmap='viridis',
    alpha=0.7,
)
ax.set_xlabel('Person Income')
ax.set_ylabel('Loan Amount')
ax.set_title('Clustering: Income vs Loan Amount')
fig.colorbar(scatter, ax=ax, label='Cluster')
plt.show()

ANALYSIS: The K-Means clustering analysis identified three separate groups among the 32,581 loan applicants in the dataset (rows with missing values were dropped before clustering). Cluster 0 (average age ≈26 years, income ≈$65K) has the highest default rate at roughly 38%, with an average loan amount of about $16K and an interest rate close to 12.26%. Cluster 1, which consists of older applicants (average age ≈39 years) with higher earnings (≈$89K) and longer employment durations (≈7.6 years), shows a significantly lower default rate of about 17%. Cluster 2 has the lowest default rate, at about 14%; it shares Cluster 0's youthful demographic (average age ≈26 years) but has much smaller loan amounts (≈$6.3K). These variations imply that the probability of default is highly influenced by variables like age, income, job security, and length of credit history, and that clustering can be used to find groups of borrowers with different risk profiles.

Problem 2:

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Inspect the distinct loan_grade labels before one-hot encoding them.
grade_levels = df['loan_grade'].unique()
print("\nUnique values in 'loan_grade':", grade_levels)

In [None]:
# One-hot encode loan_grade. drop_first=True omits the first category's column,
# so that category acts as the baseline for the regression coefficients.
X_cat = pd.get_dummies(data=df['loan_grade'], drop_first=True)

In [None]:
# Target vector for the classifier: the loan_status column.
y = df.loc[:, 'loan_status']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_cat,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Logistic regression on the one-hot loan_grade features, with the iteration
# cap set to 1000.
log_reg = LogisticRegression(max_iter=1000)
# Left as the final expression so the notebook still displays the fitted estimator.
log_reg.fit(X=X_train, y=y_train)

In [None]:
# Predicted loan_status labels for the held-out test features.
y_pred = log_reg.predict(X=X_test)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
conf_mat = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", conf_mat, sep="\n")

# Per-class precision / recall / F1 table.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:", report_text, sep="\n")

In [None]:
# Headline metrics on the test split, computed in one pass over the scorer functions.
# Every scorer is called with the same (y_true, y_pred) pair and its default settings.
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report each metric rounded to two decimals, one per line.
for template, value in (
    ("Accuracy: {:.2f}", acc),
    ("Precision: {:.2f}", prec),
    ("Recall: {:.2f}", rec),
    ("F1 Score: {:.2f}", f1),
):
    print(template.format(value))

In [None]:
# Pair every dummy column with its fitted coefficient (row 0 of coef_) for inspection.
feature_names = X_cat.columns
weights = log_reg.coef_[0]
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': weights})
print("\nLogistic Regression Coefficients:", coef_df, sep="\n")

ANALYSIS: With 'loan_grade' as the only predictor, the logistic regression model performed well for non-default cases (class 0: precision 0.85, recall 0.92, f1-score 0.89), but much worse for defaults (class 1: precision 0.60, recall 0.44, f1-score 0.51). Overall, the model's accuracy was 81%. These measurements show that although the model consistently detects non-defaulters, it has trouble correctly capturing defaults, indicating that its predictive potential is limited when it depends only on loan_grade.
A distinct pattern can be seen in the model's coefficients: as loan_grade moves from B to G, the corresponding coefficients increase, with loan grade G displaying the highest coefficient (4.236444). Because the dummy encoding uses drop_first=True, each coefficient is measured relative to the omitted first grade. This pattern suggests that there is a strong association between a loan grade closer to G and a higher chance of default. Although loan_grade is a useful measure of credit risk overall, the model's mediocre performance, especially when it comes to forecasting defaults, indicates that adding more features could improve its predictive accuracy even further.


Survey Feedback:
1. 3
2. 5
3. We enjoy real world examples that help show how this course will impact us in the future. The Guest speaker we had was a perfect example of this.
4. None of our group was introduced to coding before this class, nor did we know we would have to code going into it. A more fundamental teaching of coding would be very beneficial, as opposed to splitting lessons between coding knowledge and financial equations or analyses.
5. This course can be improved by taking things slower and understanding that while some of the students may have prior knowledge on this topic, the majority, especially undergraduate students, have had no prior experience with any of it.