In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import recall_score, precision_score, f1_score, fbeta_score, accuracy_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
import xgboost as xgb
from imblearn.pipeline import make_pipeline
from collections import defaultdict

# Load and prepare data
customer_df = pd.read_csv('./Churn_Modelling.csv')
customer_df = customer_df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# Define features
categorical_features = ['Geography', 'Gender']
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance',
                    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# Split data
X = customer_df.drop('Exited', axis=1)
y = customer_df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Standardize numeric features
def Standard_Scaler(df, col_names):
    scaler = StandardScaler().fit(df[col_names])
    df[col_names] = scaler.transform(df[col_names])
    return df

X_train = Standard_Scaler(X_train, numeric_features)
X_test = Standard_Scaler(X_test, numeric_features)

# One-hot encode categorical features
def one_hot_encode(df, col_names):
    df_encoded = pd.get_dummies(df, columns=col_names, drop_first=True, dtype='float64')
    return df_encoded

X_train = one_hot_encode(X_train, categorical_features)
X_test = one_hot_encode(X_test, categorical_features)

# Function to train and evaluate the model
def evaluate_model(pipeline, X_train, y_train, X_test, y_test):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Calculate metrics
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    return recall, precision, f1, f2, accuracy, roc_auc

# Define XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Evaluate models with different sampling techniques
results = defaultdict(list)

# No Sampling
pipeline = make_pipeline(xgb_model)
results['No Sampling'] = evaluate_model(pipeline, X_train, y_train, X_test, y_test)

# Random Oversampling
pipeline = make_pipeline(RandomOverSampler(random_state=42), xgb_model)
results['Random Oversampling'] = evaluate_model(pipeline, X_train, y_train, X_test, y_test)

# SMOTE Oversampling
pipeline = make_pipeline(SMOTE(random_state=42), xgb_model)
results['SMOTE Oversampling'] = evaluate_model(pipeline, X_train, y_train, X_test, y_test)

# SMOTE + Tomek Links
pipeline = make_pipeline(SMOTETomek(random_state=42), xgb_model)
results['SMOTE + Tomek Links'] = evaluate_model(pipeline, X_train, y_train, X_test, y_test)

# SMOTEENN
pipeline = make_pipeline(SMOTEENN(random_state=42), xgb_model)
results['SMOTEENN'] = evaluate_model(pipeline, X_train, y_train, X_test, y_test)

# Class Weights
xgb_model_weighted = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                       scale_pos_weight=y_train.value_counts()[0]/y_train.value_counts()[1],
                                       random_state=42)
pipeline = make_pipeline(xgb_model_weighted)
results['Class Weights'] = evaluate_model(pipeline, X_train, y_train, X_test, y_test)

# Convert results to DataFrame
columns = ['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy', 'ROC-AUC']
df_results = pd.DataFrame.from_dict(results, orient='index', columns=columns).reset_index()
df_results.rename(columns={'index': 'XGBoost with'}, inplace=True)

# Display the DataFrame with the results
print(df_results)


          XGBoost with    Recall  Precision  F1 Score  F2 Score  Accuracy  \
0          No Sampling  0.562654   0.399651  0.467347  0.520218    0.7390   
1  Random Oversampling  0.702703   0.321348  0.441018  0.567911    0.6375   
2   SMOTE Oversampling  0.879607   0.264597  0.406818  0.600470    0.4780   
3  SMOTE + Tomek Links  0.877150   0.264444  0.406375  0.599396    0.4785   
4             SMOTEENN  0.872236   0.344995  0.494429  0.668047    0.6370   
5        Class Weights  0.707617   0.336056  0.455696  0.579477    0.6560   

    ROC-AUC  
0  0.705269  
1  0.714167  
2  0.746690  
3  0.748527  
4  0.827034  
5  0.721225  


In [2]:
df_results

Unnamed: 0,XGBoost with,Recall,Precision,F1 Score,F2 Score,Accuracy,ROC-AUC
0,No Sampling,0.562654,0.399651,0.467347,0.520218,0.739,0.705269
1,Random Oversampling,0.702703,0.321348,0.441018,0.567911,0.6375,0.714167
2,SMOTE Oversampling,0.879607,0.264597,0.406818,0.60047,0.478,0.74669
3,SMOTE + Tomek Links,0.87715,0.264444,0.406375,0.599396,0.4785,0.748527
4,SMOTEENN,0.872236,0.344995,0.494429,0.668047,0.637,0.827034
5,Class Weights,0.707617,0.336056,0.455696,0.579477,0.656,0.721225


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import recall_score, precision_score, f1_score, fbeta_score, accuracy_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
import xgboost as xgb
from imblearn.pipeline import make_pipeline
from collections import defaultdict

# Load and prepare data
customer_df = pd.read_csv('./Churn_Modelling.csv')
customer_df = customer_df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# Define features
categorical_features = ['Geography', 'Gender']
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance',
                    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# Split data
X = customer_df.drop('Exited', axis=1)
y = customer_df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Standardize numeric features
def Standard_Scaler(df, col_names):
    scaler = StandardScaler().fit(df[col_names])
    df[col_names] = scaler.transform(df[col_names])
    return df

X_train = Standard_Scaler(X_train, numeric_features)
X_test = Standard_Scaler(X_test, numeric_features)

# One-hot encode categorical features
def one_hot_encode(df, col_names):
    df_encoded = pd.get_dummies(df, columns=col_names, drop_first=True, dtype='float64')
    return df_encoded

X_train = one_hot_encode(X_train, categorical_features)
X_test = one_hot_encode(X_test, categorical_features)

# Define parameter grid for GridSearchCV
param_grid = {
    'xgbclassifier__n_estimators': [100, 200],
    'xgbclassifier__max_depth': [3, 6, 9],
    'xgbclassifier__learning_rate': [0.01, 0.1, 0.2],
    'xgbclassifier__subsample': [0.7, 0.8, 1.0],
}

# Define cross-validation strategy
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Function to train and evaluate the model with hyperparameter tuning
def evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test):
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    return recall, precision, f1, f2, accuracy, roc_auc, grid_search.best_params_

# Evaluate models with different sampling techniques
results = defaultdict(list)
best_params = {}

# No Sampling
pipeline = make_pipeline(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
results['No Sampling'], best_params['No Sampling'] = evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[:6], evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[6]

# Random Oversampling
pipeline = make_pipeline(RandomOverSampler(random_state=42), xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
results['Random Oversampling'], best_params['Random Oversampling'] = evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[:6], evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[6]

# SMOTE Oversampling
pipeline = make_pipeline(SMOTE(random_state=42), xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
results['SMOTE Oversampling'], best_params['SMOTE Oversampling'] = evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[:6], evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[6]

# SMOTE + Tomek Links
pipeline = make_pipeline(SMOTETomek(random_state=42), xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
results['SMOTE + Tomek Links'], best_params['SMOTE + Tomek Links'] = evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[:6], evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[6]

# SMOTEENN
pipeline = make_pipeline(SMOTEENN(random_state=42), xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
results['SMOTEENN'], best_params['SMOTEENN'] = evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[:6], evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[6]

# Class Weights
xgb_model_weighted = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                       scale_pos_weight=y_train.value_counts()[0]/y_train.value_counts()[1],
                                       random_state=42)
pipeline = make_pipeline(xgb_model_weighted)
results['Class Weights'], best_params['Class Weights'] = evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[:6], evaluate_model_with_tuning(pipeline, param_grid, X_train, y_train, X_test, y_test)[6]

# Convert results to DataFrame
columns = ['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy', 'ROC-AUC']
df_results = pd.DataFrame.from_dict(results, orient='index', columns=columns).reset_index()
df_results.rename(columns={'index': 'XGBoost with'}, inplace=True)

# Display the DataFrame with the results
print("Best Parameters for each sampling technique:")
print(best_params)
print("\nEvaluation Metrics:")
print(df_results)


Best Parameters for each sampling technique:
{'No Sampling': {'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 100, 'xgbclassifier__subsample': 0.8}, 'Random Oversampling': {'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 100, 'xgbclassifier__subsample': 0.7}, 'SMOTE Oversampling': {'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 200, 'xgbclassifier__subsample': 0.8}, 'SMOTE + Tomek Links': {'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 200, 'xgbclassifier__subsample': 0.8}, 'SMOTEENN': {'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 200, 'xgbclassifier__subsample': 0.7}, 'Class Weights': {'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 100, 'xgbclassifier__subsample': 1.0}}

Evaluat

In [4]:
df_results

Unnamed: 0,XGBoost with,Recall,Precision,F1 Score,F2 Score,Accuracy,ROC-AUC
0,No Sampling,0.535627,0.441296,0.483907,0.513666,0.7675,0.736105
1,Random Oversampling,0.810811,0.293333,0.430809,0.599346,0.564,0.737041
2,SMOTE Oversampling,0.891892,0.292742,0.440801,0.632845,0.5395,0.787706
3,SMOTE + Tomek Links,0.874693,0.304014,0.451204,0.635941,0.567,0.796771
4,SMOTEENN,0.874693,0.342967,0.492734,0.667667,0.6335,0.843514
5,Class Weights,0.823096,0.2827,0.420854,0.59545,0.539,0.734095
