In [23]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

#from collections import Counter
#from imblearn.over_sampling import SMOTE


# Step 2: Load and Explore Dataset

def load_data(file_path):
    data = pd.read_csv(file_path)
    return data 
   

def preprocess_and_split_data(data, target_column):
    
    ''' This is another way to implement label encoder
    le = preprocessing.LabelEncoder()
    for column_name in data.columns:
       if data[column_name].dtype == object:
          data[column_name] = le.fit_transform(data[column_name])
     else:
        pass
    '''
    
    X = data.drop(columns=[target_column])  # Drop the target column to get the features
    y = data[target_column]  # Extract the target variable

    le = LabelEncoder()
    for column in X.select_dtypes(include=['object']):
        X[column] = le.fit_transform(X[column])
    
    y = le.fit_transform(y)
    
    return train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)  # Split the data into training and  testing sets,and use stratify to approximately maintain the imbalanced ratio of taget column




'''
# Step 3: Train Random Forest Model
def train_random_forest(X_train, y_train):
    
    #print("counter before SMOTE:", Counter(y_train))
    #Apply SMOTE for resampling
    #smote = SMOTE(random_state=15)
    #X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    #print("Counter after SMOTE:", Counter(y_resampled))
    
    rf = RandomForestClassifier(random_state=15, class_weight='balanced')  # Create an instance of RandomForestClassifier with fixed random state
    rf.fit(X_train, y_train)  # Fit the model to the training data
    print("these are the params",rf.get_params())
    return rf  # Return the trained model
'''
# Step 3: Train Random Forest Model with Hyperparameter Tuning
def train_random_forest_with_tuning(X_train, y_train):
    # Define hyperparameter space for Random Search
    param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees in the forest
    'max_depth': [None] + list(range(5, 20)),  # Depth of each tree
    'min_samples_split': randint(2, 10),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 10),  # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'criterion': ['gini', 'entropy']  # Function to measure the quality of a split
    }
    
    # Create RandomizedSearchCV object and fit it to training data
    
    rf = RandomForestClassifier(random_state=15, class_weight='balanced')
    randomized_search = RandomizedSearchCV(estimator=rf,
                                        param_distributions=param_dist,
                                        n_iter=100,  # Number of parameter settings sampled
                                        cv=5,  # Cross-validation splitting strategy
                                        scoring='accuracy',  # Metric to optimize
                                        n_jobs=-1,  # Use all available cores
                                        random_state=15)
    
    randomized_search.fit(X_train, y_train)

    # Return the best model found by RandomizedSearchCV
    return randomized_search.best_estimator_, randomized_search.best_params_, randomized_search.best_score_


# Step 4: Evaluate Model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)  # Get the model's predictions on the test data

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate F1 score (weighted average for imbalanced classes)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Calculate precision
    precision = precision_score(y_test, y_pred, average='weighted')

    # Calculate recall
    recall = recall_score(y_test, y_pred, average='weighted')

    # Specificity (True Negative Rate)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()  # Get the confusion matrix
    specificity = tn / (tn + fp)  # Calculate specificity

    # Display metrics
    print("Accuracy:", accuracy)
    print("F1 Score (Weighted):", f1)
    print("Precision (Weighted):", precision)
    print("Recall (Weighted):", recall)
    print("Specificity:", specificity)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))  # Detailed classification report



#print("\nResults with SMOTE and class_weight='balanced':")
#print(classification_report(y_test, rf_pred))



# Step 5: Main Workflow
def run_analysis(file_path, target_column):
    # Load data
    data = load_data(file_path)

    #this is part of EDA, sort it out later
    print("Data type:", type(data))
    print("Data columns:", data.columns)
    print("First few rows of data:\n", data.head())
    print("Null values in each column:\n", data.isnull().sum())

    #process and split data
    X_train, X_test, y_train, y_test = preprocess_and_split_data(data, target_column)
    
    # EDA as well
    print("Shape of X_train:", X_train.shape)
    print("Shape of X_test:", X_test.shape)

     # Train the Random Forest model with hyperparameter tuning
    best_model, best_params, best_score = train_random_forest_with_tuning(X_train, y_train)

    # Evaluate the best model on test data
    print(f"\nEvaluation Metrics for {file_path} using randomsearchcv:\n")
    evaluate_model(best_model, X_test, y_test)

    # Print best parameters and score from tuning
    print("Best hyperparameters:", best_params)
    print("Best cross-validated score:", best_score)

    '''
    # Train the Random Forest model
    model = train_random_forest(X_train, y_train)
    
    # Evaluate the model and print metrics
    print(f"\nEvaluation Metrics for {file_path}:\n")
    evaluate_model(model, X_test, y_test)
    '''


# Step 6: Test with different datasets
file_path = './BankCustomerChurnPrediction.csv'  # Replace with actual file path of Dataset 1
target_column = 'churn'  # Replace with the actual target column name
run_analysis(file_path, target_column)


#this is part of EDA
#data = pd.read_csv(file_path)



Data type: <class 'pandas.core.frame.DataFrame'>
Data columns: Index(['customer_id', 'credit_score', 'country', 'gender', 'age', 'tenure',
       'balance', 'products_number', 'credit_card', 'active_member',
       'estimated_salary', 'churn'],
      dtype='object')
First few rows of data:
    customer_id  credit_score country  gender  age  tenure    balance  \
0     15634602           619  France  Female   42       2       0.00   
1     15647311           608   Spain  Female   41       1   83807.86   
2     15619304           502  France  Female   42       8  159660.80   
3     15701354           699  France  Female   39       1       0.00   
4     15737888           850   Spain  Female   43       2  125510.82   

   products_number  credit_card  active_member  estimated_salary  churn  
0                1            1              1         101348.88      1  
1                1            0              1         112542.58      0  
2                3            1              0       

150 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
106 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/tusharjoshi/.pyenv/versions/3.12.7/lib/python3.12/site-


Evaluation Metrics for ./BankCustomerChurnPrediction.csv using randomsearchcv:

Accuracy: 0.863
F1 Score (Weighted): 0.8472151520774641
Precision (Weighted): 0.8558884814061137
Recall (Weighted): 0.863
Specificity: 0.970495919648462

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.79      0.44      0.57       407

    accuracy                           0.86      2000
   macro avg       0.83      0.71      0.74      2000
weighted avg       0.86      0.86      0.85      2000

Best hyperparameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 109}
Best cross-validated score: 0.858375


In [24]:
data=load_data(file_path)
data.info()
categorical = [var for var in data.columns if data[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)
print(data[categorical].head())

numerical = [var for var in data.columns if data[var].dtype!='O']

print('There are {} numerical variables\n'.format(len(numerical)))

print('The numerical variables are :\n\n', numerical)
data[numerical].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB
There are 2 categorical variables

The categorical variables are :

 ['country', 'gender']
  country  gender
0  France  Female
1   Spain  Female
2  France  Female
3  Fran

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,43,2,125510.82,1,1,1,79084.1,0
