Load CSVs and Fill Missing Values

In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Load the raw training and test CSVs.
train_df = pd.read_csv("cs-training.csv")
test_df = pd.read_csv("cs-test.csv")

# Fill missing values.
# The per-column medians are learned from the training data only and then
# reused for the test set, so both frames are imputed with the same values
# and no test-set statistics enter the preprocessing.
# numeric_only=True keeps median() from raising on pandas >= 2.0 should the
# CSV ever contain a non-numeric column.
train_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(train_medians)

# The target is excluded from the fill values applied to the test frame so
# that missing test labels are not replaced with a fabricated constant.
test_fill_values = train_medians.drop('SeriousDlqin2yrs', errors='ignore')
test_df = test_df.fillna(test_fill_values)


Prepare the Training Data

In [16]:
# Target vector first, then the feature matrix: every column except the
# 'Unnamed: 0' column (presumably the CSV's saved row index - confirm) and
# the target itself.
y_train = train_df['SeriousDlqin2yrs']
X_train = train_df.drop(columns=['Unnamed: 0', 'SeriousDlqin2yrs'])

Prepare the Test Data: Drop the Index Column and the Unlabeled Target Column

In [17]:
# Build the test feature matrix. errors='ignore' lets the drop succeed even
# when one of the listed columns is absent from the test frame.
X_test = test_df.drop(
    columns=['Unnamed: 0', 'SeriousDlqin2yrs'],
    errors='ignore',
)

Training and Predicting

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Features / target taken from the (already imputed) training frame.
y = train_df['SeriousDlqin2yrs']
X = train_df.drop(columns=['Unnamed: 0', 'SeriousDlqin2yrs'])

# Hold out 20% of the rows for validation; stratify=y keeps the class
# proportions the same in both splits, and random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree random forest on the training split only.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows: hard class labels for the confusion matrix and
# classification report, positive-class probabilities (column 1) for ROC AUC.
y_val_pred = model.predict(X_val)
y_val_proba = model.predict_proba(X_val)[:, 1]

val_confusion = confusion_matrix(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
val_auc = roc_auc_score(y_val, y_val_proba)

print("Evaluation on Validation Set:")
print("Confusion Matrix:\n", val_confusion)
print("\nClassification Report:\n", val_report)
print("ROC AUC Score:", val_auc)


Evaluation on Validation Set:
Confusion Matrix:
 [[27718   277]
 [ 1629   376]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     27995
           1       0.58      0.19      0.28      2005

    accuracy                           0.94     30000
   macro avg       0.76      0.59      0.62     30000
weighted avg       0.92      0.94      0.92     30000

ROC AUC Score: 0.8415317395028238


In [22]:
# The test frame carries no labels, so only predictions are produced here.
# Drop the non-feature columns (tolerating their absence) before scoring.
X_test = test_df.drop(
    columns=['Unnamed: 0', 'SeriousDlqin2yrs'],
    errors='ignore',
)

# Hard class predictions and the positive-class probability for each row.
y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, 1]