In [4]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from scipy.stats import randint, uniform


#from collections import Counter


# Step 2: Load and Explore Dataset

def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file_path : path or file-like object
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    return pd.read_csv(file_path)
   

def preprocess_and_split_data(data, target_column):
    """Build model-ready train/test sets from the raw DataFrame.

    Pipeline:
      1. Separate the features from ``target_column``.
      2. Add two engineered features, ``balance_salary_ratio`` and
         ``age_group`` (requires ``balance``, ``estimated_salary`` and ``age``
         columns in ``data``).
      3. Label-encode every object-dtype feature column and the target.
      4. Make a stratified 80/20 train/test split.
      5. Fit ``StandardScaler`` and ``RFE`` (8 features) on the training rows
         only, then apply them to the test rows.
      6. Oversample the training rows only with SMOTE.

    Steps 5 and 6 run *after* the split so that no information from the test
    rows reaches the fitted transformers, and so that the test set contains
    only real (non-synthetic) samples with the original class ratio.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset including the target column. It is not modified.
    target_column : str
        Name of the label column in ``data``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the feature sets are
        DataFrames restricted to the selected columns, the targets are
        integer-encoded arrays.
    """
    X = data.drop(columns=[target_column])  # new frame, so `data` stays untouched
    y = data[target_column]

    # Feature engineering (+1 avoids division by zero for a zero salary)
    X['balance_salary_ratio'] = X['balance'] / (X['estimated_salary'] + 1)
    # labels=False returns the bin index (0..3) as a plain numeric column
    # instead of a categorical one, which the scaler can consume directly.
    X['age_group'] = pd.cut(X['age'], bins=[0, 30, 45, 60, 100], labels=False)

    # Encode text features as integers; each column gets its own encoder.
    for column in X.select_dtypes(include=['object']):
        X[column] = LabelEncoder().fit_transform(X[column])

    # Encode the target once so the classes are always 0..n_classes-1.
    y = LabelEncoder().fit_transform(y)

    # Split BEFORE any fitted transformation; stratify keeps the class ratio
    # of the target in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=15, stratify=y
    )

    # Standardization: statistics come from the training rows only.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    # Feature selection: rank features on the training rows only.
    rfe = RFE(
        estimator=RandomForestClassifier(n_estimators=10, random_state=15),
        n_features_to_select=8,
    )
    rfe.fit(X_train, y_train)
    selected_columns = X_train.columns[rfe.support_]
    X_train = X_train[selected_columns]
    X_test = X_test[selected_columns]

    # Handle class imbalance: synthesize minority samples for training only.
    smote = SMOTE(random_state=15)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test




'''
# Step 3: Train Random Forest Model
def train_random_forest(X_train, y_train):
    
    #print("counter before SMOTE:", Counter(y_train))
    #Apply SMOTE for resampling
    #smote = SMOTE(random_state=15)
    #X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    #print("Counter after SMOTE:", Counter(y_resampled))
    
    rf = RandomForestClassifier(random_state=15, class_weight='balanced')  # Create an instance of RandomForestClassifier with fixed random state
    rf.fit(X_train, y_train)  # Fit the model to the training data
    print("these are the params",rf.get_params())
    return rf  # Return the trained model
'''

'''
# Step 3: Train Random Forest Model with Hyperparameter Tuning
def train_random_forest_with_tuning(X_train, y_train):
    # Define hyperparameter space for Random Search
    param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees in the forest
    'max_depth': [None] + list(range(5, 20)),  # Depth of each tree
    'min_samples_split': randint(2, 10),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 10),  # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'criterion': ['gini', 'entropy']  # Function to measure the quality of a split
    }
    
    # Create RandomizedSearchCV object and fit it to training data
    
    rf = RandomForestClassifier(random_state=15, class_weight='balanced')
    randomized_search = RandomizedSearchCV(estimator=rf,
                                        param_distributions=param_dist,
                                        n_iter=100,  # Number of parameter settings sampled
                                        cv=5,  # Cross-validation splitting strategy
                                        scoring='accuracy',  # Metric to optimize
                                        n_jobs=-1,  # Use all available cores
                                        random_state=15)
    
    randomized_search.fit(X_train, y_train)

    # Return the best model found by RandomizedSearchCV
    return randomized_search.best_estimator_, randomized_search.best_params_, randomized_search.best_score_

'''

# Step 3: Train XGBoost Model with Hyperparameter Tuning
def train_xgboost_with_tuning(X_train, y_train):
    """Tune an XGBoost classifier with a randomized hyperparameter search.

    Samples 100 parameter settings, scores each by accuracy with 5-fold
    stratified cross-validation, and uses all available cores.

    Parameters
    ----------
    X_train : training features passed to ``RandomizedSearchCV.fit``.
    y_train : training labels passed to ``RandomizedSearchCV.fit``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_)`` of the fitted search.
    """
    # Note: scipy's uniform(loc, scale) spans [loc, loc + scale].
    search_space = dict(
        n_estimators=randint(100, 1000),
        max_depth=randint(3, 10),
        learning_rate=uniform(0.01, 0.3),
        subsample=uniform(0.6, 0.4),
        colsample_bytree=uniform(0.6, 0.4),
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
    base_model = XGBClassifier(
        random_state=15,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=search_space,
        n_iter=100,
        cv=folds,
        scoring='accuracy',
        n_jobs=-1,
        random_state=15,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_params_, search.best_score_

# Step 4: Evaluate Model
def evaluate_model(model, X_test, y_test):
    """Print and return evaluation metrics for a fitted classifier.

    Accuracy and the weighted F1, precision and recall are always computed.
    Specificity (true-negative rate) is only computed when the confusion
    matrix is 2x2; otherwise, or when the test labels contain no negatives,
    it is reported as ``nan`` instead of raising.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels for ``X_test``.

    Returns
    -------
    dict
        Keys ``accuracy``, ``f1``, ``precision``, ``recall``, ``specificity``.
    """
    y_pred = model.predict(X_test)  # Get the model's predictions on the test data

    accuracy = accuracy_score(y_test, y_pred)
    # Weighted averages account for differing class sizes.
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Specificity (True Negative Rate) = TN / (TN + FP).
    # The first row of a 2x2 confusion matrix holds [TN, FP]. Any other shape
    # (multi-class, or a single class present) has no single specificity.
    specificity = float('nan')
    matrix = confusion_matrix(y_test, y_pred)
    if matrix.shape == (2, 2):
        tn, fp = matrix[0]
        if tn + fp > 0:
            specificity = tn / (tn + fp)

    # Display metrics
    print("Accuracy:", accuracy)
    print("F1 Score (Weighted):", f1)
    print("Precision (Weighted):", precision)
    print("Recall (Weighted):", recall)
    print("Specificity:", specificity)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))  # Detailed classification report

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
    }



#print("\nResults with SMOTE and class_weight='balanced':")
#print(classification_report(y_test, rf_pred))



# Step 5: Main Workflow
def run_analysis(file_path, target_column):
    """Run the full workflow for one dataset: load, preprocess, tune, evaluate.

    Parameters
    ----------
    file_path : location of the CSV file, forwarded to ``load_data``.
    target_column : name of the label column, forwarded to
        ``preprocess_and_split_data``.

    Everything is reported through ``print``; nothing is returned.
    """
    dataset = load_data(file_path)

    # Quick look at the raw data (early EDA, to be tidied up later)
    print("Data type:", type(dataset))
    print("Data columns:", dataset.columns)
    print("First few rows of data:\n", dataset.head())
    print("Null values in each column:\n", dataset.isnull().sum())

    # Feature preparation and train/test split
    splits = preprocess_and_split_data(dataset, target_column)
    X_train, X_test, y_train, y_test = splits

    print("Shape of X_train:", X_train.shape)
    print("Shape of X_test:", X_test.shape)

    # Hyperparameter search for the XGBoost model
    tuned_model, tuned_params, cv_score = train_xgboost_with_tuning(X_train, y_train)

    # Score the winning model on the held-out test data
    print(f"\nEvaluation Metrics for {file_path} using randomsearchcv:\n")
    evaluate_model(tuned_model, X_test, y_test)

    # Report what the search selected
    print("Best hyperparameters:", tuned_params)
    print("Best cross-validated score:", cv_score)


# Step 6: Test with different datasets
# Dataset location and the name of its label column.
# Change both values to run the same workflow on another dataset.
file_path = './BankCustomerChurnPrediction.csv'
target_column = 'churn'
run_analysis(file_path=file_path, target_column=target_column)


#this is part of EDA
#data = pd.read_csv(file_path)



XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: '@rpath/libomp.dylib'\n  Referenced from: '/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib'\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/libomp.dylib' (no such file), '/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/libomp.dylib' (no such file), '/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/libomp.dylib' (no such file), '/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/libomp.dylib' (no such file), '/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)"]


In [None]:
import struct

# Size of a C pointer ('P') in bytes, times 8, gives the interpreter's
# word size in bits.
pointer_bits = struct.calcsize('P') * 8
print(f"Python Architecture: {pointer_bits}-bit")


Python Architecture: 64-bit


In [None]:
# Reload the raw dataset and summarise its schema.
data = load_data(file_path)
data.info()

# Columns stored with object dtype are treated as categorical.
categorical = [name for name, dtype in data.dtypes.items() if dtype == 'O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)
print(data[categorical].head())

# Every remaining column is treated as numerical.
numerical = [name for name, dtype in data.dtypes.items() if dtype != 'O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :\n\n', numerical)
data[numerical].head()