In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from google.colab import files  # Colab-only helper; this cell cannot run outside Google Colab

import pandas as pd

# Prompt for a file in the browser. The result is used below as a mapping
# whose first key is the name of the uploaded file.
uploaded = files.upload()

# With nothing uploaded, next(iter(uploaded)) would raise a bare StopIteration;
# fail with a message that tells the reader what to do instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Load the first uploaded CSV file into a DataFrame and preview its first rows
uploaded_name = next(iter(uploaded))
df = pd.read_csv(uploaded_name)
df.head()


In [None]:
# Display the column labels of the loaded DataFrame
df.columns


In [None]:
# Feature columns used for modelling. Declared here because this cell is the
# first one to read `features`; without the definition a fresh
# "Restart & Run All" stops on this cell with a NameError.
features = ['Income', 'Credit_Score', 'Credit_Utilization', 'Missed_Payments',
            'Debt_to_Income_Ratio', 'Account_Tenure']

# Keep only the modelling columns and drop rows with a missing value in any of them
df_cleaned = df[features + ['Delinquent_Account']].dropna()

# Feature matrix and target taken from the cleaned rows
X = df_cleaned[features]
y = df_cleaned['Delinquent_Account']


In [None]:
from sklearn.impute import SimpleImputer

# Split BEFORE imputing so the column means are learned from the training rows
# only. Fitting the imputer on the full matrix lets the held-out rows influence
# the values filled into the training data (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the per-column means on the training rows, then apply them to both parts
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full feature matrix filled with the training-set means, kept under its
# original name so anything that reads `X_imputed` keeps working
X_imputed = imputer.transform(X)


In [None]:
# Step 1: Import
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Step 2: Load Data
# Reads from a fixed relative path, so the file must be present in the working
# directory; this replaces any `df` created by an earlier cell.
df = pd.read_csv("Geldium_Dataset.csv")

# Step 3: Feature Selection
features = ['Income', 'Credit_Score', 'Credit_Utilization', 'Missed_Payments',
            'Debt_to_Income_Ratio', 'Account_Tenure']
X = df[features]
y = df['Delinquent_Account']

# Step 4: Train-Test Split
# The split happens before imputation so the held-out rows cannot influence the
# statistics used to fill missing values (avoids data leakage). Same test_size
# and random_state as before, so the same rows land in each part.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 5: Impute Missing Values
# Column means are learned from the training rows only and then applied to both parts.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full matrix filled with the training-set means; later cells read `X_imputed`
X_imputed = imputer.transform(X)

# Step 6: Train Model
# max_iter is raised to 1000 to give the solver more iterations on these
# unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 7: Predict & Evaluate
y_pred = model.predict(X_test)
# Second predict_proba column: the score for the class listed second in model.classes_
y_prob = model.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

# Step 8: Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label='Logistic Regression')
# Dashed diagonal: the reference line of a classifier with no discriminative power
plt.plot([0,1], [0,1], '--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the imputed data; `X_imputed` and `y` must already exist
# from an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a 100-tree forest with a fixed seed (fit() returns the estimator itself)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard labels, plus the second predict_proba column as the score for the ROC metrics
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Text metrics on the held-out rows
rf_report = classification_report(y_test, y_pred_rf)
print("Classification Report (Random Forest):\n", rf_report)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:\n", rf_confusion)
rf_auc = roc_auc_score(y_test, y_prob_rf)
print("ROC-AUC Score:", rf_auc)

# ROC curve drawn against the dashed diagonal reference line
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
fig, ax = plt.subplots()
ax.plot(fpr_rf, tpr_rf, label='Random Forest')
ax.plot([0, 1], [0, 1], '--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve – Random Forest')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Pair each column label of the feature frame `X` with the importance the
# fitted forest assigned to it
feature_names = X.columns
importances = rf_model.feature_importances_

# One row per feature, most important first
importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_imp_df = importance_table.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of (at most) the ten highest-ranked features
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df.head(10), ax=ax)
ax.set_title("Top 10 Important Features – Random Forest")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
fig.tight_layout()
ax.grid(True)
plt.show()
