### Import dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



### Load and understand data

In [None]:
df = pd.read_csv("//wsl.localhost/Ubuntu-24.04/home/matthew/repos/Churn_Prediction_System/Data/customer_churn_dataset.csv")


In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# drop customerID column because it is not needed for modelling
df.drop(columns=["customerID"], inplace=True)



In [None]:
df.head(2)

In [None]:
# list the distinct values of every column, separated by a dotted rule
for col, values in df.items():
    print(col, values.unique())
    print("." * 100)

In [None]:
## check for missing values
df.isnull().sum()

In [None]:
df[df["TotalCharges"] == " "]

In [None]:
# count the rows whose TotalCharges value is a single blank space

len(df[df["TotalCharges"] == " "])

In [None]:
# replace the blank-space entries with the string "0.0" so the column can be cast to float
df["TotalCharges"] = df["TotalCharges"].replace({" " : "0.0"})

In [None]:
# convert all the values of the TotalCharges column to float
df['TotalCharges'] = df["TotalCharges"].astype(float)
#df

In [None]:
df.info()

In [None]:
# check the class distribution of target column
print(df["Churn"].value_counts())

#### Insights so far
* customerID has been removed because it is not needed for modelling
* No null (NaN) values were reported in the dataset
* Blank-string entries in the TotalCharges column were replaced with 0s
* Class imbalance in the Churn column

### Exploratory data analysis

In [None]:
df.columns

In [None]:
df.head(2)

### Check basic statistics of the numerical columns of the dataset

In [None]:

df.describe()

### Analyse numerical features

In [None]:
def plot_histogram(df, column_name):
    """Plot the distribution of one column with mean and median markers.

    Draws a 5x3 inch histogram with a KDE overlay for ``df[column_name]``,
    adds a red dashed vertical line at the mean and a green solid one at
    the median, then shows the figure.
    """
    series = df[column_name]

    plt.figure(figsize=(5, 3))
    sns.histplot(series, kde=True)
    plt.title(f"Distribution of {column_name}")

    # (position, colour, line style, legend label) for each reference line
    reference_lines = (
        (series.mean(), "red", "--", "Mean"),
        (series.median(), "green", "-", "Median"),
    )
    for position, colour, style, label in reference_lines:
        plt.axvline(position, color=colour, linestyle=style, label=label)

    plt.legend()
    plt.show()
    
plot_histogram(df,"tenure")
    

In [None]:
plot_histogram(df, "TotalCharges")

In [None]:
def plot_boxplot(df, column_name):
    """Show a vertical box plot of ``df[column_name]`` in a 5x3 inch figure.

    Args:
        df: DataFrame that contains the column to plot.
        column_name: Name of the column; also used for the title and y label.
    """
    # plt.figure (lowercase) opens a new *current* figure for the calls below.
    # plt.Figure only constructs a detached Figure object, so the requested
    # size was never applied to the plot.
    plt.figure(figsize=(5, 3))
    sns.boxplot(y=df[column_name])
    plt.title(f"Boxplot of {column_name}")
    plt.ylabel(column_name)
    plt.show()

In [None]:
plot_boxplot(df, "tenure")

In [None]:
plot_boxplot(df,"TotalCharges")

In [None]:
plot_boxplot(df, "MonthlyCharges")

### Correlation heatmap for numerical columns

In [None]:
# pairwise correlation of the three numeric columns, annotated to two decimals
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
corr_matrix = df[numeric_cols].corr()

plt.figure(figsize=(8, 4))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

### Categorical Features Analysis

In [None]:
df.columns

In [None]:
df.info()

#### count plot for categorical data

In [None]:
# SeniorCitizen is added by hand ahead of the object-typed columns
# (presumably because it is stored as a 0/1 number - confirm with df.info()).
obj_cols = ["SeniorCitizen", *df.select_dtypes(include="object").columns.to_list()]

# one small count plot per categorical column
for col in obj_cols:
    plt.figure(figsize=(5, 3))
    sns.countplot(x=df[col])
    plt.title(f"Count Plot of {col}")
    plt.show()


obj_cols

### preprocess the data

In [None]:
df.head(3)

##### label encoding of target column

In [None]:
# Map the target to 1/0 and cast explicitly to int. Relying on replace() to
# silently downcast the object column is deprecated in pandas; without the
# cast the column can stay `object` and would then be picked up by the
# select_dtypes(include="object") feature-encoding step further down.
# The statement is safe to re-run: an already-encoded column is left unchanged.
df["Churn"] = df["Churn"].replace({"Yes": 1, "No": 0}).astype(int)

In [None]:
df.head(3)

In [None]:
print(df["Churn"].value_counts())

### label Encoding of categorical Features

In [None]:
# identify columns with object data types
object_columns = df.select_dtypes(include="object").columns
print(object_columns)

In [None]:
# initialize a dictionary to save encoders (column name -> fitted LabelEncoder)
encoders = {}

# fit one LabelEncoder per object column, overwrite the column with its
# integer codes and keep the fitted encoder for later reuse
for column in object_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    encoders[column] = label_encoder

# save the encoders to a pickle file once, after every column has been
# fitted (previously the file was rewritten on every loop iteration)
with open("encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)
        
    

In [None]:
encoders

In [None]:
df.head(3)

### Training and test data split

In [None]:
# split features and target
x = df.drop(columns="Churn")
y = df["Churn"] # target feature



In [None]:
print(x)


In [None]:
print(y)

### Encode target labels

In [None]:
# Fit the dedicated target encoder. The previous code called fit_transform on
# `label_encoder` (the leftover loop variable holding the last feature
# column's encoder), which left `label_encode` unfitted and refitted that
# feature encoder on the target values.
label_encode = LabelEncoder()
y_encoded = label_encode.fit_transform(y)

### split training and test data

In [None]:
# 80% of the data for training and 20% for testing;
# stratify on the encoded target so both splits keep the same class ratio
x_train, x_test, y_train, y_test = train_test_split(x,y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
print(y_train.shape)

### Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
smote = SMOTE(random_state=42)

In [None]:
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

#### Check new class distribution

In [None]:

print(pd.Series(y_train_smote).value_counts())

#### visualize class distribution before and after SMOTE 

In [None]:
# same count plot twice: original training labels, then the resampled ones
for labels, title in ((y_train, "Before SMOTE"), (y_train_smote, "After SMOTE")):
    sns.countplot(x=labels)
    plt.title(title)
    plt.show()


### Train the model

##### Training Models

##### Define  Models and Parameter Grids



In [None]:

# Candidate classifiers, keyed by display name. All share random_state=42 so
# repeated runs are comparable.
models = {
    "Decision Trees": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` was dropped: it is a deprecated XGBoost argument that
    # newer releases only warn about as unused. Labels are already 0/1 integers.
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss'),
}

# Hyper-parameter search space per model; keys must match `models`.
param_grids = {
    "Decision Trees": {
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10],
    },
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
    },
    "XGBoost": {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}


#### Perform RandomizedSearchCV for Each Model

In [None]:
import warnings

from sklearn.metrics import classification_report
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

# model name -> {"best_estimator", "best_params", "best_score"} of its search
cv_scores = {}

for model_name, model in models.items():
    print(f"Tuning {model_name}...")

    param_grid = param_grids[model_name]
    # Never request more candidates than the grid contains; asking for 10 on a
    # smaller grid makes scikit-learn warn on every fit.
    n_candidates = min(10, len(ParameterGrid(param_grid)))

    search = RandomizedSearchCV(
        model,
        param_grid,
        cv=5,
        n_iter=n_candidates,
        scoring="accuracy",
        n_jobs=-1,
        random_state=42
    )

    # NOTE(review): the training data was SMOTE-resampled before
    # cross-validation, so validation folds contain synthetic samples and the
    # reported CV accuracy is likely optimistic - consider resampling inside
    # each fold instead.
    search.fit(x_train_smote, y_train_smote)  # Using SMOTE-resampled data

    print(f"Best parameters for {model_name}: {search.best_params_}")
    print(f"Best cross-validation accuracy: {search.best_score_:.2f}")

    cv_scores[model_name] = {
        "best_estimator": search.best_estimator_,
        "best_params": search.best_params_,
        "best_score": search.best_score_
    }
    print("-" * 100)


# Suppress all warnings. Kept at the end of the cell as before, so it only
# affects cells executed after this one.
# NOTE(review): a blanket filter also hides genuine deprecation and
# convergence warnings - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")



#### Evaluate Best Model on Test Set

In [None]:
# pull the tuned Random Forest out of the search results and score the
# held-out test split with it
rf_result = cv_scores["Random Forest"]
best_rf = rf_result["best_estimator"]
y_pred_rf = best_rf.predict(x_test)

print("Evaluation on Test Set:", classification_report(y_test, y_pred_rf), sep="\n")


##### random forest gives the highest accuracy compared to other models with default parameters

In [None]:
# persist the tuned Random Forest together with the feature order it was
# trained on and the target encoder object
best_model = cv_scores["Random Forest"]["best_estimator"]

model_data = dict(
    model=best_model,
    feature_names=list(x.columns),
    encoder=label_encode,  # saving the encoder
)

with open("customer_churn_model.pkl", "wb") as f:
    pickle.dump(model_data, f)



#### load the saved model and build a predictive system

In [None]:
# Read the saved bundle back and unpack its three entries
with open("customer_churn_model.pkl", "rb") as f:
    model_data = pickle.load(f)

loaded_model, feature_names, label_encode = (
    model_data[key] for key in ("model", "feature_names", "encoder")
)


#### Visualizing Feature Importance

In [None]:
# Random forest: importances labelled by feature name, smallest first so the
# horizontal bar chart shows the most important feature at the top
feature_importances = pd.Series(
    loaded_model.feature_importances_, index=feature_names
).sort_values(ascending=True)

plt.figure(figsize=(5, 3))
feature_importances.plot(kind='barh')
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance Score")
plt.show()

# save to csv as a two-column table
feature_importances_df = feature_importances.rename_axis('Feature').reset_index(name='Importance')

feature_importances_df.to_csv(r"\\wsl.localhost\Ubuntu-24.04\home\matthew\repos\Churn_Prediction_System\Jupyter_Notebook\feature_importance.csv", index=False)


In [None]:
# XGBoost: a fresh default-parameter classifier fitted on the original
# (non-resampled) training split, used only to draw its importance plot.
# NOTE(review): this is not the tuned XGBoost estimator stored in cv_scores.
xgb = XGBClassifier().fit(x_train, y_train)

plot_importance(xgb)
plt.show()


#### Visualizing Model Evaluation Results

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# predictions of the reloaded model on the held-out test split
y_pred = loaded_model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

# build the display and draw it in one step (plot() returns the display)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()


### prepare and encode new input before prediction

In [None]:
with open("encoders.pkl", "rb") as f:
    encoders = pickle.load(f)


In [None]:
def apply_encodings(input_df, encoders):
    """Return a copy of ``input_df`` with the saved encoders applied.

    Args:
        input_df: DataFrame holding raw (un-encoded) feature values.
        encoders: Mapping of column name to a fitted encoder object that
            exposes ``transform``.

    Returns:
        A new DataFrame in which every column that appears in both
        ``encoders`` and ``input_df`` is replaced by the output of
        ``encoder.transform``. Other columns are carried over unchanged and
        the caller's frame is not modified.

    Raises:
        Whatever ``encoder.transform`` raises is propagated (presumably a
        ValueError for labels not seen during fitting - confirm against the
        encoder in use).
    """
    # Work on a copy: assigning into the caller's frame mutates shared data
    # and triggers chained-assignment warnings when it is a slice.
    encoded_df = input_df.copy()
    for column, encoder in encoders.items():
        if column in encoded_df.columns:
            encoded_df[column] = encoder.transform(encoded_df[column])
    return encoded_df


In [None]:
# Raw attributes of one new customer, one single-element list per feature
input_data = {
    'gender': ['Male'],
    'SeniorCitizen': [0],
    'Partner': ['Yes'],
    'Dependents': ['No'],
    'tenure': [5],
    'PhoneService': ['Yes'],
    'MultipleLines': ['No'],
    'InternetService': ['Fiber optic'],
    'OnlineSecurity': ['No'],
    'OnlineBackup': ['Yes'],
    'DeviceProtection': ['No'],
    'TechSupport': ['No'],
    'StreamingTV': ['No'],
    'StreamingMovies': ['No'],
    'Contract': ['Month-to-month'],
    'PaperlessBilling': ['Yes'],
    'PaymentMethod': ['Electronic check'],
    'MonthlyCharges': [70.35],
    'TotalCharges': [350.5],
}

# one-row frame, reordered to the training feature order, then encoded with
# the saved per-column encoders
raw_input = pd.DataFrame(input_data)
input_df = apply_encodings(raw_input[feature_names], encoders)


In [None]:
# check data types of columns to be sure all are encoded successfully

print(input_df.dtypes)
print(input_df.head())


#### make a prediction

In [None]:
# Predict
prediction = loaded_model.predict(input_df)
print("Churn Prediction:", prediction[0])  # 0 = No, 1 = Yes

#### Code to generate CSV for Power BI

In [None]:

# Score every customer in the dataset with the saved model and export a flat
# CSV for Power BI.

# load the trained model and feature names
# NOTE(review): pickle.load can execute code from the file; only load
# artefacts written by this notebook.
with open("customer_churn_model.pkl", "rb") as f:
    model_data = pickle.load(f)

model = model_data["model"]
feature_names = model_data["feature_names"]

with open("encoders.pkl", "rb") as f:
    encoders = pickle.load(f)

# NOTE(review): this appears to be the same file the model was trained on, so
# the predictions include rows seen during training.
batch_data = pd.read_csv(r"\\wsl.localhost\Ubuntu-24.04\home\matthew\repos\Churn_Prediction_System\Data\customer_churn_dataset.csv")

# clean whitespace and convert columns: blank cells become NA, the charge
# columns are coerced to numbers, and any remaining NA is filled with 0
batch_data.replace(r'^\s*$', pd.NA, regex=True, inplace=True)
batch_data['TotalCharges'] = pd.to_numeric(batch_data['TotalCharges'], errors='coerce')
batch_data['MonthlyCharges'] = pd.to_numeric(batch_data['MonthlyCharges'], errors='coerce')
batch_data.fillna(0, inplace=True)

# select only the feature columns for prediction (copied so the encoding step
# cannot touch the human-readable values kept for export)
batch_data_features = batch_data[feature_names].copy()

batch_data_encoded = apply_encodings(batch_data_features, encoders)

batch_data['Churn Prediction'] = model.predict(batch_data_encoded[feature_names])

# select columns to export for Power BI; 'Churn Prediction' is now included -
# it was computed above but previously left out of the exported file
powerbi_data = batch_data[['customerID','gender', 'SeniorCitizen', 'Partner',
                           'tenure', 'Contract', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', "Churn",
                           'Churn Prediction']]

powerbi_data.to_csv("/home/matthew/repos/Churn_Prediction_System/churn_predictions_for_powerbi.csv", index=False)

print("Churn predictions CSV for Power BI generated successfully.")
