# Customer Churn Predictions 
(End-to-end customer churn prediction project) 
### Using Telco Customer Churn dataset 

In [1]:
# Check to make sure dependencies are installed

In [3]:
# Importing libraries 
import os 
import warnings 
warnings.filterwarnings('ignore') 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from pathlib import Path 


In [4]:
# One seed value for the whole notebook; NumPy's global RNG is seeded from it
random_state = 42
np.random.seed(seed=random_state)


In [None]:
# Project locations, written relative to the current directory (Notebooks/)
data_path = Path('../Data') / 'telco_churn_data.csv'
visuals_path = Path('../Visuals')
models_path = Path('../Models')

# Create the output folders up front; an already existing folder is left alone
for output_dir in (visuals_path, models_path):
    output_dir.mkdir(parents=True, exist_ok=True)

print("Data Path --> ", data_path)

# Function to save data file(s) to Data/ folder 
def save_data(data: pd.DataFrame, file_name: str, save_path: str = "../Data") -> None:
    """Write ``data`` to ``<save_path>/<file_name>.csv`` without the index column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to write.
    file_name : str
        Name of the output file, without the ``.csv`` extension.
    save_path : str or path-like, default "../Data"
        Destination folder. It is created if it does not exist yet.
    """
    os.makedirs(save_path, exist_ok=True)
    # The full path gets its own name so the caller's file_name is not
    # overwritten and the path used for writing is the one that was built.
    file_path = os.path.join(save_path, f"{file_name}.csv")
    data.to_csv(file_path, index=False)
    print(f"Saved -> {file_path}")


Data Path -->  Data/telco_churn_data.csv


In [None]:
# Plot defaults: seaborn's 'whitegrid' style and a 10x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
# Loading data

# Fail early with a clear message if the CSV is not where the notebook expects it
if not data_path.exists():
    raise FileNotFoundError(f"Expected data at {data_path}.")

telco_data = pd.read_csv(data_path)

print("\nData Info: ")
# info() prints its summary itself and returns None, so it is not wrapped in print()
telco_data.info()

# Kept as the last expression of the cell so the notebook actually renders the preview
telco_data.head()



In [None]:
# Work on a copy so the frame loaded from disk (telco_data) stays unmodified
data = telco_data.copy()

# Data checks
print("\nMissing values: ")
print(data.isna().sum())

# Checking if there are duplicate customerID values
print("\nAll unique customerID --> ", data['customerID'].nunique() == len(data))

# Making sure TotalCharges is numeric.
# The guard asks "is this any numeric dtype?" instead of comparing against the
# single dtype name 'int', which a float column would never match.
if not pd.api.types.is_numeric_dtype(data['TotalCharges']):
    # errors='coerce' turns values that cannot be parsed as numbers into NaN
    data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
    print("\nTotalCharges NA count (After numeric conversion): ", data['TotalCharges'].isna().sum())

# Checking if any rows have tenure == 0 & TotalCharges == NaN
# Possible for new customers
new_cust = (data['tenure'] == 0) & (data['TotalCharges'].isna())
print("\nRows with tenure == 0 & TotalCharges == NaN : ", new_cust.sum())



In [None]:
# Saving cleaned data to Data/ folder
# `data` is the working copy from the data-check cell; save_data appends ".csv"
# to the name given here, so the file written is cleaned_raw_data.csv.
save_data(data, "cleaned_raw_data") 


### Exploratory Data Analysis (EDA) 
Looking at group churn rates, population comparisons, visualizations, and interpretations.

In [None]:
# Baseline (overall) churn rate: share of rows per Churn value across the whole frame
print("Baseline churn:", data['Churn'].value_counts(normalize=True), sep="\n")


In [None]:
# Churn distribution: number of rows for each Churn value
ax = sns.countplot(data=data, x='Churn')
ax.set(title="Churn Distribution (count)")
plt.show()


In [None]:
# Saving the churn distribution figure to the Visuals/ folder
ax = sns.countplot(data=data, x='Churn')
ax.set(title="Churn Distribution (count)")
# The axes' parent figure is the one just drawn on, so save through it directly
ax.figure.savefig(visuals_path / 'churn_distribution.png', dpi=150, bbox_inches='tight')


In [None]:
# Creating helper function to calculate cross-tabulations and group churn rates 
def group_churn(data, group_column, target='Churn'):
    """Summarise churn for every category of ``group_column``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both ``group_column`` and ``target``.
    group_column : str
        Column whose distinct values define the groups (one output row each).
    target : str, default 'Churn'
        Outcome column. Its 'No' / 'Yes' labels are the ones turned into the
        ``churn_no_pct`` / ``churn_yes_pct`` columns.

    Returns
    -------
    pd.DataFrame
        Indexed by group value, with columns:
        ``churn_no_pct`` / ``churn_yes_pct`` - row-normalised share of each label,
        ``population_share`` - the group's share of all rows,
        ``group_churn`` - copy of ``churn_yes_pct``,
        ``population_churn_share`` - ``population_share * churn_yes_pct``; with no
        missing values in either column this is the group's 'Yes' rows as a
        fraction of all rows.
    """
    cross_tab = pd.crosstab(data[group_column], data[target], normalize='index')
    cross_tab = cross_tab.rename(columns={'No': 'churn_no_pct', 'Yes': 'churn_yes_pct'})

    # A label that never occurs (e.g. a subset with no churners) produces no
    # crosstab column; add it as an explicit 0.0 rate so the lookups below
    # do not raise KeyError.
    for rate_column in ('churn_no_pct', 'churn_yes_pct'):
        if rate_column not in cross_tab.columns:
            cross_tab[rate_column] = 0.0

    # Population comparison
    population_share = data[group_column].value_counts(normalize=True).rename('population_share')
    result = cross_tab.join(population_share)
    result['group_churn'] = result['churn_yes_pct']
    result['population_churn_share'] = result['population_share'] * result['churn_yes_pct']
    return result



In [None]:
# 



In [None]:
# Loading data
# data_path stays a pathlib.Path here: the earlier file-path check calls
# data_path.exists(), which a plain string would not support on a re-run.
data_path = Path("../Data/telco_churn_data.csv")
telco_data = pd.read_csv(data_path)
# info() prints its summary itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
telco_data.info()
print("\n", telco_data.head())


# Saving cleaned data to data folder 
def save_data(data: pd.DataFrame, file_name: str, save_path: str = "../Data") -> None:
    """Write ``data`` to ``<save_path>/<file_name>.csv`` without the index column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to write.
    file_name : str
        Name of the output file, without the ``.csv`` extension.
    save_path : str or path-like, default "../Data"
        Destination folder. It is created if it does not exist yet. Taking it
        as a parameter means the function no longer depends on a module-level
        ``save_path`` variable being defined before the call.
    """
    os.makedirs(save_path, exist_ok=True)
    file_path = os.path.join(save_path, f"{file_name}.csv")
    data.to_csv(file_path, index=False)
    print(f"Saved -> {file_path}")

#save_path = "../Data/" 
#save_data(cleaned_data, "cleaned_churn_data") 

In [None]:
# Machine Learning Libraries 
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV  
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import (
    roc_auc_score, 
    average_precision_score, 
    roc_curve, 
    precision_recall_curve, 
    classification_report, 
    confusion_matrix
)
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline as ImbPipeline 

import joblib 

# Heavy Libraries 
try: 
    import xgboost as xgb 
except Exception: 
    xgb = None 

try: 
    import shap 
except Exception: 
    shap = None 
