In [1]:
import zipfile
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Extract the uploaded zip file
zip_file_path = '/content/playground-series-s4e10.zip'
extract_dir = '/content/playground-series-s4e10/'


In [3]:
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [4]:
# Load the datasets
train_df = pd.read_csv(os.path.join(extract_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(extract_dir, 'test.csv'))
sample_submission_df = pd.read_csv(os.path.join(extract_dir, 'sample_submission.csv'))


In [5]:
# Encode categorical features
label_encoders = {}
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

In [6]:
for feature in categorical_features:
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])
    test_df[feature] = le.transform(test_df[feature])
    label_encoders[feature] = le


In [7]:
# Split the training data into train and validation sets
X = train_df.drop(columns=['id', 'loan_status'])
y = train_df['loan_status']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

In [9]:
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

In [10]:
# Perform hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [11]:
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 243 candidates, totalling 729 fits


243 fits failed out of a total of 729.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [12]:
# Get the best model from GridSearch
best_rf_model = grid_search.best_estimator_

In [13]:
# Make predictions on the validation set
y_val_pred = best_rf_model.predict_proba(X_val)[:, 1]

In [14]:
# Evaluate the model using ROC-AUC score
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f'Best ROC-AUC Score: {roc_auc}')

Best ROC-AUC Score: 0.9380307681684052


In [15]:
# Make predictions on the test dataset
test_pred = best_rf_model.predict_proba(test_df.drop(columns=['id']))[:, 1]

In [17]:
# Prepare the submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'loan_status': test_pred})
submission_file_path = '/content/loan_approval_submission_tuned.csv'
submission_df.to_csv(submission_file_path, index=False)

In [18]:
# Display the first few rows of the submission file
print(submission_df.head())

      id  loan_status
0  58645     0.943852
1  58646     0.018923
2  58647     0.543481
3  58648     0.013619
4  58649     0.137626
