In [124]:
import pandas as pd
import numpy as np
import scorecardpy as sc
import EScorecard as es
import json
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from scipy.stats import uniform, randint
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")

In [76]:
df_all = pd.read_csv('data/final_features.csv')
train_flag = pd.read_csv('data/train/train_flag.csv')

In [77]:
df_all = pd.merge(df_all, train_flag[['uid','TARGET']], on='uid', how='inner')

In [78]:
df_all.head()

Unnamed: 0,uid,total_overdue,avg_overdue,max_overdue,min_overdue,std_overdue,num_payments,max_recent_overdue,overall_overdue_trend,max_consecutive_on_time,...,sum_revolving_loans_enquiry,sum_unknown_type_of_loan_enquiry,avg_loan_duration,total_loan_duration,longest_loan_duration,shortest_loan_duration,loans_still_open,proportion_open_loans,days_since_last_loan,TARGET
0,AAA09044550,68,2.106618,44,0,6.656192,33,0,0,9,...,94000,98000,511.0,1022.0,518.0,504.0,0,0.0,2236,0
1,AAA10545297,0,0.0,0,0,0.0,6,0,0,6,...,99000,134000,,0.0,,,1,1.0,1590,0
2,AAA14112888,0,0.0,0,0,0.0,3,0,0,3,...,0,17000,92.0,92.0,92.0,92.0,0,0.0,1610,0
3,AAA20326915,14477,23.810855,917,0,35.8034,151,0,0,48,...,0,0,387.4,1937.0,1450.0,31.0,3,0.375,1591,0
4,AAA31604840,0,0.0,0,0,0.0,78,0,0,43,...,0,184000,463.5,927.0,549.0,378.0,3,0.6,1487,0


In [79]:
df_all['TARGET'].mean()

0.07706839110745899

## Feature selection

In [80]:
# Function to calculate WoE and IV for a single feature
def calculate_woe_iv(df, feature, target):
    # Create bins for the feature if it's continuous
    df[feature+'_bin'] = pd.qcut(df[feature].rank(method="first"), q=10, duplicates="drop")
    # Calculate the number of events (bad loans) and non-events (good loans) in each bin
    grouped = df.groupby(feature+'_bin')[target].agg(['count', 'sum'])
    grouped.columns = ['Total', 'Bad']
    grouped['Good'] = grouped['Total'] - grouped['Bad']
    # Avoid division by zero
    grouped['Bad'] = grouped['Bad'].replace(0, 0.5)
    grouped['Good'] = grouped['Good'].replace(0, 0.5)
    # Calculate distribution of events and non-events
    grouped['Bad_dist'] = grouped['Bad'] / grouped['Bad'].sum()
    grouped['Good_dist'] = grouped['Good'] / grouped['Good'].sum()
    
    grouped['WoE'] = np.log(grouped['Good_dist'] / grouped['Bad_dist'])
    grouped['IV'] = (grouped['Good_dist'] - grouped['Bad_dist']) * grouped['WoE']
    iv = grouped['IV'].sum()
    
    return iv, grouped[['WoE', 'IV']]

In [81]:
# Loop through each feature and calculate IV
iv_dict = {}
for col in df_all.columns:
    if col != 'TARGET':  
        iv, woe_table = calculate_woe_iv(df_all, col, 'TARGET')
        iv_dict[col] = iv

df_IV = pd.DataFrame(list(iv_dict.items()), columns=['Feature', 'IV']).sort_values(by='IV', ascending=False)

In [82]:
# select top 60 features wrt IV
top_60 = list(df_IV.head(60).Feature.values)

In [83]:
df_filtered = df_all[['uid'] + top_60 + ['TARGET']]

In [84]:
corr_matrix = df_filtered[top_60].corr().abs()
iv_dict = df_IV.set_index('Feature')['IV'].to_dict()  

features_to_drop = set()

In [85]:
# Iterate over the correlation matrix to find highly correlated pairs
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        feature_1 = corr_matrix.columns[i]
        feature_2 = corr_matrix.columns[j]
        
        # Check if correlation is above 0.5 and neither feature is already flagged to be dropped
        if corr_matrix.iloc[i, j] > 0.5 and feature_1 not in features_to_drop and feature_2 not in features_to_drop:
            # Compare IV values and select the feature to keep
            if iv_dict[feature_1] > iv_dict[feature_2]:
                features_to_drop.add(feature_2)
            else:
                features_to_drop.add(feature_1)

In [86]:
filtered_features = [feature for feature in top_60 if feature not in features_to_drop]
df = df_filtered[['uid'] + filtered_features + ['TARGET']]

In [88]:
df = df.set_index('uid')

In [89]:
target = 'TARGET'

In [94]:
df.replace('missing', np.nan, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')

In [95]:
X = df.drop(columns=['TARGET'])
y = df['TARGET']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


## Training and validation

In [125]:
# These parameters are gained after randomized search performed bwloe
xgb_model = xgb.XGBClassifier(
    subsample=1.0,
    reg_lambda=0.1,
    reg_alpha=0,
    n_estimators=100,
    min_child_weight=2,
    max_depth=4,
    learning_rate=0.01,
    gamma=0.2,
    colsample_bytree=1.0,
    use_label_encoder=False,  
    eval_metric='logloss'    
)


xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='logloss', gamma=0.2, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_bin=256,
              max_cat_to_onehot=4, max_delta_step=0, max_depth=4, max_leaves=0,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=0.1, ...)

In [126]:
y_train_pred = xgb_model.predict_proba(X_train)[:, 1]  
y_val_pred = xgb_model.predict_proba(X_val)[:, 1] 

In [127]:
train_auc = roc_auc_score(y_train, y_train_pred)
val_auc = roc_auc_score(y_val, y_val_pred)

In [128]:
print(f'Training AUC: {train_auc:.4f}')
print(f'Validation AUC: {val_auc:.4f}')

Training AUC: 0.6332
Validation AUC: 0.6244


## Hyperparameter tuning

In [101]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__min_child_weight': [1, 2, 5],
    'classifier__gamma': [0, 0.1, 0.2],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__reg_alpha': [0, 0.01, 0.1],
    'classifier__reg_lambda': [0.1, 1, 10]
}

In [102]:
# Set up the randomized search with cross-validation
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# random_search = RandomizedSearchCV(model_xgb, param_distributions=param_grid, 
#                                    n_iter=100, scoring='roc_auc', 
#                                    cv=5, verbose=1, n_jobs=-1, random_state=42)

random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=50, scoring='roc_auc', 
                                   cv=3, verbose=2, random_state=42, n_jobs=-1)

In [103]:
# Fit the model
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Parameters: { "classifier__colsample_bytree", "classifier__gamma", "classifier__learning_rate", "classifier__max_depth", "classifier__min_child_weight", "classifier__n_estimators", "classifier__reg_alpha", "classifier__reg_lambda", "classifier__subsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric='logloss', gamma=0,
                                           gpu_id=-1, grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_bin...
                                                                         1.0],
                                        'classifier__gamma'

In [104]:
# Step 6: Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation AUC: ", random_search.best_score_)


Best parameters found:  {'classifier__subsample': 1.0, 'classifier__reg_lambda': 0.1, 'classifier__reg_alpha': 0, 'classifier__n_estimators': 100, 'classifier__min_child_weight': 2, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.01, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 1.0}
Best cross-validation AUC:  0.6213497704039089


In [105]:
print(f'Validation AUC after hyperparameter tuning: {val_auc:.4f}')


Validation AUC after hyperparameter tuning: 0.6286


## Test data

In [114]:
df_test = pd.read_csv('data/final_test_features.csv')
test_flag = pd.read_csv('data/test/test_flag.csv')

In [121]:
df_test = df_test.set_index('uid')

In [159]:
df_test = df_test[filtered_features]

In [161]:
df_test.replace('missing', np.nan, inplace=True)
df_test = df_test.apply(pd.to_numeric, errors='coerce')

In [162]:
# df_test['predictions'] = xgb_model.predict(df_test)

df_test['pred'] = xgb_model.predict_proba(df_test)[:, 1]

In [170]:
df_test.reset_index()[['uid','pred']].to_csv('Tej_Patel.csv', index=False)