Step One: Preprocess Data

In [None]:
import pandas as pd

# Read the raw credit-risk records into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
dataset_path = 'credit_risk_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Find the columns pandas stored with the generic `object` dtype; these are the
# ones that have to be encoded before a numeric model can consume them.
object_typed = df.select_dtypes(include='object')
non_numeric_cols = object_typed.columns
print(f"Non-numeric columns: {non_numeric_cols}")

In [None]:
# Replace each categorical column with indicator (dummy) columns.
# drop_first=True keeps k-1 indicators for a variable with k levels, so the
# dropped first level acts as the reference category.
categorical_cols = [
    'loan_intent',
    'person_home_ownership',
    'loan_grade',
    'cb_person_default_on_file',
]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Columns treated as continuous numeric features
numerical_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'cb_person_cred_hist_length',
    'loan_amnt',
    'loan_percent_income',
    'loan_int_rate',
]

# Impute gaps in those columns with each column's own median.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split, so the held-out rows influence the imputed values.
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(column_medians)

Step Two: Train the logistic regression model

In [None]:
# Coerce every column to a numeric dtype in one pass; any entry that cannot be
# parsed as a number becomes NaN (errors='coerce') instead of raising.
df = df.apply(pd.to_numeric, errors='coerce')

# Show how many NaN values each column still holds
print(df.isna().sum())

# Drop the incomplete rows without mutating in place, and report how many were
# removed so that coercion-induced data loss does not go unnoticed.
rows_before = len(df)
df = df.dropna()
print(f"Dropped {rows_before - len(df)} of {rows_before} rows containing NaN")

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column (y) from the predictor columns (X)
target_col = 'loan_status'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features before fitting. LogisticRegression applies an L2
# penalty by default, and that penalty (like the lbfgs solver's convergence)
# is sensitive to feature scale; columns such as person_income and
# loan_percent_income presumably differ by orders of magnitude.
# Wrapping both steps in a pipeline means the scaler is fitted on the training
# rows only and is re-applied automatically whenever model.predict() is called.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Output model coefficients. They come from the final pipeline step and are
# expressed per standard deviation of each feature, so their magnitudes are
# directly comparable across features.
# NOTE: the metrics quoted in this notebook's written summary were produced
# without scaling and must be refreshed after re-running.
fitted_regression = model.named_steps['logisticregression']
coefficients = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': fitted_regression.coef_[0]})
print(coefficients)

In [None]:
# Predict a class label for every row of the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Tabulate test-set counts for each (actual, predicted) class pair
cm = confusion_matrix(y_test, y_pred)

# Render the counts as an annotated heatmap on an explicitly created Axes
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the test-set predictions with each metric, in the same order as the
# names on the left-hand side.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

The logistic regression model trained to classify loan status (default or non-default) achieved an accuracy of approximately 84.2%, meaning it correctly classified 84.2% of the test cases. Its precision was 75.2%: when it predicted a loan default, it was correct about 75.2% of the time. However, its recall was only 43.0%, meaning the model identified just 43.0% of the actual defaults. The F1-score, which balances precision and recall, was 54.7%. Taken together, these figures imply that only about 22% of the test cases are defaults, so a model that always predicted non-default would already reach roughly 78% accuracy; the 84.2% figure should be read against that baseline. These results indicate that while the model is generally reliable at predicting non-defaults, it misses a significant number of actual defaults, highlighting the need for further refinement to improve recall and overall performance in identifying loan defaults.
**Microsoft Copilot was used to help generate code**