In [None]:
#Import libraries
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
df= pd.read_csv('TCC.csv')

In [None]:
df = df.drop(columns=["customerID"])

In [None]:
df.isnull().sum()

In [None]:
print(df.apply(lambda x: x.unique()))

In [None]:
df[df["MonthlyCharges"]==" "]

In [None]:
df[df["tenure"]==" "]

In [None]:
df[df["TotalCharges"]==" "]

In [None]:
df["TotalCharges"] = df["TotalCharges"].replace(" ", 0.0)

In [None]:
df[df["TotalCharges"]==" "]

In [None]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [None]:
print(df["TotalCharges"].dtype)
print(df["TotalCharges"].isna().sum())  

In [None]:
print(df["Churn"].value_counts())

In [None]:
df.describe()

In [None]:
# Numerical Features Analysis

In [None]:
# Apply seaborn's "whitegrid" theme to the figures drawn after this cell.
sns.set_theme(style="whitegrid")

In [None]:
#histograms
def histplot(df, column_name):
    """Display a histogram with a KDE overlay for one column of ``df``.

    Two vertical lines mark the column's mean and median, and a legend
    labels them. The figure is shown immediately; nothing is returned.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
        Source of the values to plot.
    column_name : str
        Column to plot; also interpolated into the figure title.
    """
    values = df[column_name]
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(values, color="#FF9999", edgecolor="PINK", kde=True, ax=ax)
    ax.set_title(f'Distribution of {column_name}')
    # Draw the two reference lines in the same order as the legend entries.
    for position, line_color, line_label in (
        (values.mean(), "#F7DC84", "Mean"),
        (values.median(), "LIGHTBLUE", "Median"),
    ):
        ax.axvline(position, color=line_color, label=line_label)
    ax.legend()
    plt.show()

In [None]:
histplot(df,'MonthlyCharges')

In [None]:
histplot(df,'TotalCharges')

In [None]:
histplot(df,'tenure')

In [None]:
#boxplots
def boxplot(df, column_name):
    """Display a vertical box plot for one column of ``df``.

    The top and right spines are removed with ``sns.despine`` and the layout
    is tightened before the figure is shown. Nothing is returned.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
        Source of the values to plot.
    column_name : str
        Column to plot; also interpolated into the figure title.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(
        y=df[column_name],
        width=0.1,
        color="#FF9999",
        fliersize=5,
        linewidth=2,
        ax=ax,
    )
    ax.set_title(f'Box Plot of {column_name}')
    sns.despine(fig=fig)
    fig.tight_layout()
    plt.show()


In [None]:
boxplot(df,'MonthlyCharges')

In [None]:
boxplot(df,'TotalCharges')

In [None]:
boxplot(df,'tenure')

In [None]:
#correlation matrix  / heatmap
# Pairwise correlation (DataFrame.corr defaults) of the three selected columns,
# rendered as a heatmap with each cell annotated to three decimals.
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(df.loc[:, numeric_columns].corr(), annot=True, fmt='.3f')
heatmap_ax.set_title('Correlation Heatmap for Numerical Features')
plt.show()

In [None]:
# Categorical Features Analysis

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# NOTE(review): these two imports repeat the ones in the first cell and could be dropped.
import seaborn as sns
import matplotlib.pyplot as plt

# Categorical columns: SeniorCitizen first, then every object-dtype column.
object_columns = df.select_dtypes(include=['object']).columns.tolist()
cat_cols = ['SeniorCitizen', *object_columns]

# Three plots per row; ceiling division gives the number of rows needed.
n_cols = 3
n_rows = -(-len(cat_cols) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows))
axes = axes.flatten()

# One count plot per categorical column, paired with its axes in order.
for ax, col in zip(axes, cat_cols):
    sns.countplot(data=df, x=col, ax=ax, alpha=0.7)
    ax.tick_params(axis='x', rotation=45)
    ax.set_title(col)

# Remove the axes left over when the grid has more slots than columns.
for unused_ax in axes[len(cat_cols):]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()

In [None]:
df["Churn"] = df["Churn"].replace({"Yes": 1, "No": 0})

In [None]:
df.head(1)

In [None]:
# Fit one LabelEncoder per column listed in cat_cols, keyed by column name.
encoded = {column: LabelEncoder().fit(df[column]) for column in cat_cols}

# Overwrite each of those columns with the integer codes from its fitted encoder.
# NOTE(review): re-running this cell refits the encoders on already-encoded
# integers and overwrites encoded.pkl - run it once per fresh kernel.
for column, label_encoder in encoded.items():
    df[column] = label_encoder.transform(df[column])

# Save the fitted encoders to disk.
with open("encoded.pkl", "wb") as encoder_file:
    pickle.dump(encoded, encoder_file)

In [None]:
# Train/test split

In [None]:
# Separate the feature matrix from the Churn target.
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the class proportions of the target approximately equal in the
# train and test partitions; with an imbalanced target a purely random split can
# skew the test set's class mix and make the evaluation unrepresentative.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Oversampling the training set with SMOTE

In [None]:
smote=SMOTE(random_state=42)
X_train_smote,y_train_smote=smote.fit_resample(X_train,y_train)

In [None]:
print(y_train_smote.value_counts()) #churn values are now equal (not unbalanced like before)

In [None]:
# Model training

In [None]:
# Empty container for cross-validation scores.
crossval_scores = dict()

In [None]:
# Candidate classifiers keyed by display name, each seeded with random_state=42.
# `xgb` is the xgboost module (`import xgboost as xgb`), so calling `xgb(...)`
# raises "TypeError: 'module' object is not callable"; the estimator class is
# xgb.XGBClassifier.
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    # max_iter is raised to 1000 from the library default.
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
}

In [None]:
# 5-fold cross-validated accuracy of every candidate model on the SMOTE-resampled
# training data. The per-fold scores are stored in crossval_scores under the model's
# name and the mean is printed.
# NOTE(review): the data was resampled before cross-validation, so synthetic samples
# can land in validation folds and inflate these scores; wrapping SMOTE and the model
# in an imblearn Pipeline would resample inside each fold instead.
for model_name, model in models.items():
    scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring="accuracy")
    # Store under the dict defined earlier in the notebook (was the undefined `cv_scores`).
    crossval_scores[model_name] = scores
    # Average the scores just computed (was the undefined name `scores`).
    print(f"{model_name} cross validation accuracy: {np.mean(scores)}")
    print()