In [2]:
import torch, pandas as pd, numpy as np, seaborn as sns
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, RocCurveDisplay, make_scorer
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV, RandomizedSearchCV
from itertools import combinations, chain
from xgboost import XGBClassifier

In [3]:
# Fetch the pre-cleaned training table directly from the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/tanujmath/CreditGuard/main/data/cs-training-new.csv'
df = pd.read_csv(DATA_URL)

## We do the model fitting twice: once without scaling and once with scaling

### Unscaled Model Fitting

In [4]:
# Separate the target column from the predictors, then hold out 20% of the
# rows as a test set.
y = df['SeriousDlqin2yrs']
X = df.drop(columns='SeriousDlqin2yrs')

# A fixed random_state keeps the shuffled split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=14,
)

#### Linear Regression

In [5]:
# Ordinary least squares on the raw (unscaled) training features.
l_reg = LinearRegression(fit_intercept=True, copy_X=True).fit(X_train, y_train)

# The continuous predictions serve as ranking scores for the test-set AUC.
l_reg_scores = l_reg.predict(X_test)
l_reg_auc = roc_auc_score(y_test, l_reg_scores)
l_reg_auc

0.6861101962589922

#### Ridge Regularization

In [6]:
# Ridge regression on the raw features; alpha sets the regularization strength.
reg_ridge = Ridge(alpha=0.2, copy_X=True, fit_intercept=True).fit(X_train, y_train)

# Rank the held-out rows by predicted value and measure the AUC.
ridge_scores = reg_ridge.predict(X_test)
ridge_auc = roc_auc_score(y_test, ridge_scores)
ridge_auc

0.6861101416409553

#### Lasso Regularization

In [7]:
# Lasso regression on the raw features; alpha sets the regularization strength.
reg_lasso = Lasso(alpha=0.2, copy_X=True, fit_intercept=True).fit(X_train, y_train)

# Rank the held-out rows by predicted value and measure the AUC.
lasso_scores = reg_lasso.predict(X_test)
lasso_auc = roc_auc_score(y_test, lasso_scores)
lasso_auc

0.6298431861541092

#### Logistic Regression

In [10]:
# Baseline logistic regression: library-default hyperparameters, with all
# available cores requested through n_jobs.
logit = LogisticRegression(n_jobs=-1)
logit.fit(X_train, y_train)

# predict_proba returns one column per class; index 1 is the second entry of
# logit.classes_ (the positive class when the labels are 0/1).
logit_scores_proba = logit.predict_proba(X_test)
preds = logit_scores_proba[:, 1]

# Test-set AUC, rounded to five decimals for display.
log_auc = round(roc_auc_score(y_test, preds), 5)
print('AUC Score : ',  log_auc)

AUC Score :  0.64475


#### Hyperparameter tuning

In [11]:
# Tune a liblinear logistic regression on the unscaled features with an
# exhaustive grid search and with a randomized search, then report the AUC of
# each winning model on the held-out test set.

# liblinear supports both penalties searched below. It ignores n_jobs
# (scikit-learn warns when it is set), so parallelism is requested on the
# search objects instead. random_state pins the solver's internal shuffling.
model = LogisticRegression(solver='liblinear', random_state=14)

# Search space: inverse regularization strength, penalty type, iteration cap.
hyperparameters = {'C': np.logspace(-4, 4, 10), 'penalty': ['l1', 'l2'],
                   'max_iter': list(range(100,800,100))}

# Built-in AUC scorer. make_scorer(..., needs_proba=True) is deprecated since
# scikit-learn 1.4 and removed in 1.6; the string form works on every version.
scorer = 'roc_auc'

## GRIDSEARCH

# 5-fold cross-validation over every combination in the grid.
gridsearch = GridSearchCV(model, hyperparameters, scoring=scorer, cv=5, n_jobs=-1)
gridsearch.fit(X_train, y_train)

# best_estimator_ has already been refit on the full training set (refit=True).
best_model = gridsearch.best_estimator_

# Positive-class probabilities on the test set, then the AUC.
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
log_grid_auc = round(roc_auc_score(y_test, y_pred_proba),5)

print(f'The AUC of the test set through gridsearch tuning is {log_grid_auc}')

## RANDOMSEARCH

# Fresh, unfitted estimator for the second search.
model = LogisticRegression(solver='liblinear', random_state=14)

# Samples 10 candidates (the default n_iter) from the same space; random_state
# makes the sampled candidates, and therefore the result, reproducible.
randomsearch = RandomizedSearchCV(model, hyperparameters, scoring=scorer, cv=5,
                                  n_jobs=-1, random_state=14)
randomsearch.fit(X_train, y_train)

best_model = randomsearch.best_estimator_

y_pred_proba = best_model.predict_proba(X_test)[:, 1]
log_random_auc = round(roc_auc_score(y_test, y_pred_proba),5)

print(f'The AUC of the test set through randomsearch tuning is {log_random_auc}')


The AUC of the test set through gridsearch tuning is 0.66018
The AUC of the test set through randomsearch tuning is 0.66017


#### Random Forests

In [12]:
# Tune a random forest on the unscaled features with a randomized search and
# report the AUC of the selected model on the held-out test set.

# Seed the forest so bootstrap sampling and feature selection are repeatable.
rf = RandomForestClassifier(random_state=14)

# Hyperparameters to sample from.
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # Single-valued entry: not tuned, it only forwards n_jobs=-1 to every candidate.
    'n_jobs': [-1]
}

# Built-in AUC scorer. make_scorer(..., needs_proba=True) is deprecated since
# scikit-learn 1.4 and removed in 1.6; the string form works on every version.
scorer = 'roc_auc'

# 10 sampled candidates (default n_iter) x 5 folds; random_state fixes which
# candidates are drawn.
randomsearch = RandomizedSearchCV(rf, param_grid, scoring=scorer, cv=5, verbose = 1,
                                  random_state=14)
randomsearch.fit(X_train, y_train)

# With the default refit=True the search has already retrained the winning
# configuration on the full training set, so that estimator is used directly
# instead of fitting a second forest from best_params.
best_model = randomsearch.best_estimator_
best_params = randomsearch.best_params_
rf_best = best_model

# Hard labels and positive-class probabilities for the test set.
y_pred = rf_best.predict(X_test)
y_pred_proba = rf_best.predict_proba(X_test)[:, 1]

# Evaluate the model
rf_auc = round(roc_auc_score(y_test, y_pred_proba),5)

print(f"AUC: {rf_auc}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
AUC: 0.81637


#### XGBoost

In [13]:
# Tune an XGBoost classifier on the unscaled features with a randomized search
# and report the AUC of the selected model on the held-out test set.

# Use the GPU only when one is actually visible; otherwise fall back to the
# CPU so the cell also runs on machines without CUDA.
# NOTE(review): this relies on torch's view of CUDA; it presumes the installed
# xgboost build has GPU support whenever torch does.
xgb_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = XGBClassifier(tree_method='hist', device=xgb_device, random_state=14)

# Hyperparameters to sample from.
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}

# 5 sampled candidates x 5 folds, scored by AUC; random_state fixes which
# candidates are drawn so the search is reproducible.
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=5,
                                   scoring='roc_auc', n_jobs=-1, cv=5, verbose=1,
                                   random_state=14)

# Fit the search on the training data (the best candidate is refit at the end).
random_search.fit(X_train, y_train)

# Predictions are delegated to the refit best estimator.
y_pred = random_search.predict(X_test)
y_pred_proba = random_search.predict_proba(X_test)[:, 1]

# Evaluate the model
xg_auc = roc_auc_score(y_test, y_pred_proba)

print(f"AUC: {xg_auc}")

Fitting 5 folds for each of 5 candidates, totalling 25 fits




AUC: 0.8155738280198175


### Now we use scaled data

In [15]:
# Work on an independent deep copy so the frame loaded earlier stays untouched.
df2 = df.copy(deep=True)

# Columns at positions 2 through 8 are the ones that get standardized.
# NOTE(review): the slice is positional, so which features it covers depends on
# the CSV's column order - confirm it matches the intended feature set.
cols_to_scale = df2.iloc[:, 2:9]

# fit() learns each selected column's mean and standard deviation;
# transform() then rescales the columns with those statistics.
# NOTE(review): the statistics are computed over every row of the frame, so any
# later train/test split of `df_scale` inherits test-row information.
scaler = StandardScaler()
scaler.fit(cols_to_scale)
df_scale = scaler.transform(cols_to_scale)

In [21]:
# Build the standardized train/test sets.
#
# The raw feature columns are split first and the scaler is then fitted on the
# training rows only. Fitting on the whole frame before splitting would let the
# test rows influence the means and standard deviations (data leakage), so the
# pre-scaled `df_scale` array is deliberately not used here.
y = df['SeriousDlqin2yrs']

# Same positional columns (2 through 8) that the scaling cell selects.
X_raw = df.iloc[:, 2:9]

# Same seed and test fraction as the unscaled split, so both experiments are
# evaluated on the same held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y,
    random_state=14,
    test_size=0.2,
    shuffle=True,
)

# Learn the scaling statistics from the training rows, then apply them to both sets.
scaler = StandardScaler().fit(X_train_raw)
X_train_s = scaler.transform(X_train_raw)
X_test_s = scaler.transform(X_test_raw)

#### Linear Regression

In [19]:
# Ordinary least squares on the standardized training features.
l_reg = LinearRegression(fit_intercept=True, copy_X=True).fit(X_train_s, y_train)

# The continuous predictions serve as ranking scores for the test-set AUC.
l_reg_scores_s = l_reg.predict(X_test_s)
l_reg_auc_s = roc_auc_score(y_test, l_reg_scores_s)
l_reg_auc_s

0.6852609404024366

#### Ridge Regularization

In [20]:
# Ridge regression on the standardized features; alpha sets the regularization strength.
reg_ridge = Ridge(alpha=0.2, copy_X=True, fit_intercept=True).fit(X_train_s, y_train)

# Rank the held-out rows by predicted value and measure the AUC.
ridge_scores_s = reg_ridge.predict(X_test_s)
ridge_auc_s = roc_auc_score(y_test, ridge_scores_s)
ridge_auc_s

0.6852609130934181

#### Lasso Regularization

In [22]:
# Lasso regression on the standardized features; alpha sets the regularization strength.
reg_lasso = Lasso(alpha=0.2, copy_X=True, fit_intercept=True).fit(X_train_s, y_train)

# Rank the held-out rows by predicted value and measure the AUC.
# NOTE(review): the recorded result is exactly 0.5, which is what constant
# scores produce; presumably alpha=0.2 shrinks every coefficient to zero on
# standardized inputs - inspect reg_lasso.coef_ to confirm.
lasso_scores_s = reg_lasso.predict(X_test_s)
lasso_auc_s = roc_auc_score(y_test, lasso_scores_s)
lasso_auc_s

0.5

#### Logistic Regression

In [23]:
# Baseline logistic regression on the standardized features: library-default
# hyperparameters, with all available cores requested through n_jobs.
logit = LogisticRegression(n_jobs=-1)
logit.fit(X_train_s, y_train)

# predict_proba returns one column per class; index 1 is the second entry of
# logit.classes_ (the positive class when the labels are 0/1).
logit_scores_proba = logit.predict_proba(X_test_s)
preds = logit_scores_proba[:, 1]

# Test-set AUC, rounded to five decimals for display.
log_auc_s = round(roc_auc_score(y_test, preds), 5)
print('AUC Score : ',  log_auc_s)

AUC Score :  0.65712


#### Hyperparameter tuning

In [25]:
# Tune a liblinear logistic regression on the standardized features with an
# exhaustive grid search and with a randomized search, then report the AUC of
# each winning model on the held-out test set.

# liblinear supports both penalties searched below; random_state pins the
# solver's internal shuffling so repeated runs agree.
model = LogisticRegression(solver='liblinear', random_state=14)

# Search space: inverse regularization strength, penalty type, iteration cap.
hyperparameters = {'C': np.logspace(-4, 4, 10), 'penalty': ['l1', 'l2'],
                   'max_iter': list(range(100,800,100))}

# Built-in AUC scorer. make_scorer(..., needs_proba=True) is deprecated since
# scikit-learn 1.4 and removed in 1.6; the string form works on every version.
scorer = 'roc_auc'

## GRIDSEARCH

# 5-fold cross-validation over every combination in the grid, run in parallel.
gridsearch = GridSearchCV(model, hyperparameters, scoring=scorer, cv=5, n_jobs=-1)
gridsearch.fit(X_train_s, y_train)

# best_estimator_ has already been refit on the full training set (refit=True).
best_model = gridsearch.best_estimator_

# Positive-class probabilities on the test set, then the AUC.
y_pred_proba = best_model.predict_proba(X_test_s)[:, 1]
log_grid_auc_s = round(roc_auc_score(y_test, y_pred_proba),5)

print(f'The AUC of the test set through gridsearch tuning is {log_grid_auc_s}')

## RANDOMSEARCH

# Fresh, unfitted estimator for the second search.
model = LogisticRegression(solver='liblinear', random_state=14)

# Samples 10 candidates (the default n_iter) from the same space; random_state
# makes the sampled candidates, and therefore the result, reproducible.
randomsearch = RandomizedSearchCV(model, hyperparameters, scoring=scorer, cv=5,
                                  n_jobs=-1, random_state=14)
randomsearch.fit(X_train_s, y_train)

best_model = randomsearch.best_estimator_

y_pred_proba = best_model.predict_proba(X_test_s)[:, 1]
log_random_auc_s = round(roc_auc_score(y_test, y_pred_proba),5)

print(f'The AUC of the test set through randomsearch tuning is {log_random_auc_s}')


The AUC of the test set through gridsearch tuning is 0.67749
The AUC of the test set through randomsearch tuning is 0.66404


#### Random Forests

In [26]:
# Tune a random forest on the standardized features with a randomized search
# and report the AUC of the selected model on the held-out test set.

# Seed the forest so bootstrap sampling and feature selection are repeatable.
rf = RandomForestClassifier(random_state=14)

# Hyperparameters to sample from.
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # Single-valued entry: not tuned, it only forwards n_jobs=-1 to every candidate.
    'n_jobs': [-1]
}

# Built-in AUC scorer. make_scorer(..., needs_proba=True) is deprecated since
# scikit-learn 1.4 and removed in 1.6; the string form works on every version.
scorer = 'roc_auc'

# 10 sampled candidates (default n_iter) x 5 folds; random_state fixes which
# candidates are drawn.
randomsearch = RandomizedSearchCV(rf, param_grid, scoring=scorer, cv=5, verbose = 1,
                                  random_state=14)
randomsearch.fit(X_train_s, y_train)

# With the default refit=True the search has already retrained the winning
# configuration on the full training set, so that estimator is used directly
# instead of fitting a second forest from best_params.
best_model = randomsearch.best_estimator_
best_params = randomsearch.best_params_
rf_best = best_model

# Hard labels and positive-class probabilities for the test set.
y_pred = rf_best.predict(X_test_s)
y_pred_proba = rf_best.predict_proba(X_test_s)[:, 1]

# Evaluate the model
rf_auc_s = round(roc_auc_score(y_test, y_pred_proba),5)

print(f"AUC: {rf_auc_s}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
AUC: 0.8152


#### XGBoost

In [27]:
# Tune an XGBoost classifier on the standardized features with a randomized
# search and report the AUC of the selected model on the held-out test set.

# Use the GPU only when one is actually visible; otherwise fall back to the
# CPU so the cell also runs on machines without CUDA.
# NOTE(review): this relies on torch's view of CUDA; it presumes the installed
# xgboost build has GPU support whenever torch does.
xgb_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = XGBClassifier(tree_method='hist', device=xgb_device, random_state=14)

# Hyperparameters to sample from.
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}

# 5 sampled candidates x 5 folds, scored by AUC; random_state fixes which
# candidates are drawn so the search is reproducible.
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=5,
                                   scoring='roc_auc', n_jobs=-1, cv=5, verbose=1,
                                   random_state=14)

# Fit the search on the training data (the best candidate is refit at the end).
random_search.fit(X_train_s, y_train)

# Predictions are delegated to the refit best estimator.
y_pred = random_search.predict(X_test_s)
y_pred_proba = random_search.predict_proba(X_test_s)[:, 1]

# Evaluate the model
xg_auc_s = roc_auc_score(y_test, y_pred_proba)

print(f"AUC: {xg_auc_s}")

Fitting 5 folds for each of 5 candidates, totalling 25 fits




AUC: 0.8145396081811266
