In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from imblearn.over_sampling import SMOTE


# Step 2: Load and Explore Dataset
def load_data(file_path, target_column, test_size=0.2, random_state=15):
    """Read a CSV file, label-encode it, and return a stratified train/test split.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.
    target_column : str
        Name of the column that holds the class label.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 15
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` exactly as returned by
        ``train_test_split``. The feature frames have their text columns
        replaced by integer codes; the targets are integer-encoded arrays.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of the CSV.
    """
    data = pd.read_csv(file_path)

    # Fail early with a readable message instead of the generic error from drop().
    if target_column not in data.columns:
        raise KeyError(
            f"target column {target_column!r} not found in {file_path!r}; "
            f"available columns: {list(data.columns)}"
        )

    X = data.drop(columns=[target_column])  # drop() returns a new frame, so `data` is untouched
    y = data[target_column]

    # Encode every text feature column as integer codes. 'string' is listed next
    # to 'object' so that columns stored with pandas' dedicated string dtypes
    # (which newer pandas versions may infer for text) are encoded as well.
    text_columns = X.select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        # A fresh encoder per column: each column has its own label vocabulary.
        X[column] = LabelEncoder().fit_transform(X[column])

    # The target is always encoded, whatever its dtype.
    y = LabelEncoder().fit_transform(y)

    # stratify=y keeps the (possibly imbalanced) class ratio approximately equal
    # in the training and test partitions.
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )




In [5]:
# Step 3: Train Random Forest Model
def train_random_forest(X_train, y_train, use_smote=False):
    """Fit a class-weighted random forest on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features, passed to ``RandomForestClassifier.fit``.
    y_train : array-like
        Training labels.
    use_smote : bool, default False
        When True, oversample the training data with ``SMOTE(random_state=15)``
        before fitting and print the class counts before and after resampling.
        The default (False) fits on the data exactly as given.

    Returns
    -------
    RandomForestClassifier
        The fitted model (``random_state=15``, ``class_weight='balanced'``).
    """
    if use_smote:
        print("counter before SMOTE:", Counter(y_train))
        # Rebind local names only; the caller's objects are not modified here.
        X_train, y_train = SMOTE(random_state=15).fit_resample(X_train, y_train)
        print("Counter after SMOTE:", Counter(y_train))

    # Fixed seed for reproducible forests; 'balanced' re-weights classes
    # inversely to their frequency in y_train.
    rf = RandomForestClassifier(random_state=15, class_weight='balanced')
    rf.fit(X_train, y_train)
    print("these are the params", rf.get_params())
    return rf
    



In [3]:

# Step 4: Evaluate Model
def evaluate_model(model, X_test, y_test):
    """Print accuracy, weighted F1/precision/recall, specificity and a class report.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    None
        All results are printed.

    Notes
    -----
    For a two-class problem specificity is ``tn / (tn + fp)`` with the first
    (lowest-sorted) label treated as the negative class. For any other number
    of classes a support-weighted one-vs-rest specificity is printed instead of
    failing on the 2x2 unpacking.
    """
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # Weighted averages account for class imbalance.
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Rows are true labels, columns are predicted labels.
    cm = confusion_matrix(y_test, y_pred)
    specificity = _specificity_from_confusion(cm)
    # Keep the original label for the binary case; flag the averaged variant.
    specificity_label = "Specificity:" if cm.shape[0] == 2 else "Specificity (Weighted):"

    print("Accuracy:", accuracy)
    print("F1 Score (Weighted):", f1)
    print("Precision (Weighted):", precision)
    print("Recall (Weighted):", recall)
    print(specificity_label, specificity)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))


def _specificity_from_confusion(cm):
    """Return specificity (true-negative rate) from a square confusion matrix.

    2x2 matrix: ``tn / (tn + fp)`` using row 0 as the negative class.
    Any other size: one-vs-rest specificity per class, averaged with the class
    supports (row sums) as weights. Returns NaN when no negatives exist.
    """
    cm = np.asarray(cm, dtype=float)

    if cm.shape[0] == 2:
        tn, fp = cm[0, 0], cm[0, 1]
        negatives = tn + fp
        return tn / negatives if negatives > 0 else float("nan")

    support = cm.sum(axis=1)                   # true samples per class
    false_pos = cm.sum(axis=0) - np.diag(cm)   # predicted as class i but not class i
    negatives = cm.sum() - support             # samples whose true class is not i
    true_neg = negatives - false_pos

    # Skip classes with no true samples (zero weight) or no negatives (undefined rate).
    valid = (negatives > 0) & (support > 0)
    if not valid.any():
        return float("nan")
    per_class = true_neg[valid] / negatives[valid]
    return float(np.average(per_class, weights=support[valid]))



#print("\nResults with SMOTE and class_weight='balanced':")
#print(classification_report(y_test, rf_pred))







In [6]:

# Step 5: Main Workflow
def run_analysis(file_path, target_column):
    """Run the whole pipeline for one dataset: split, fit, then report.

    Parameters
    ----------
    file_path : str
        CSV path handed to ``load_data``; also echoed in the report header.
    target_column : str
        Name of the label column handed to ``load_data``.

    Returns
    -------
    None
        Metrics are printed by ``evaluate_model``.
    """
    # 1) Load the CSV and obtain the train/test partitions.
    splits = load_data(file_path, target_column)
    train_features, test_features, train_labels, test_labels = splits

    # 2) Fit the random forest on the training partition.
    forest = train_random_forest(train_features, train_labels)

    # 3) Report metrics on the held-out partition.
    print(f"\nEvaluation Metrics for {file_path}:\n")
    evaluate_model(forest, test_features, test_labels)

# Step 6: Run the pipeline on a dataset (edit the two values below to analyse another CSV)
file_path = "./Bank Customer Churn Prediction.csv"  # path to the dataset's CSV file
target_column = 'churn'  # name of the label column in that CSV
run_analysis(file_path=file_path, target_column=target_column)


these are the params {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 15, 'verbose': 0, 'warm_start': False}

Evaluation Metrics for ./Bank Customer Churn Prediction.csv:

Accuracy: 0.8605
F1 Score (Weighted): 0.8440601301415581
Precision (Weighted): 0.8527940395752897
Recall (Weighted): 0.8605
Specificity: 0.9698681732580038

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.79      0.43      0.56       407

    accuracy                           0.86      2000
   macro avg       0.83      0.70      0.74      2000
weighted avg       0.85      0.86      0.84