In [None]:
# ======== Basic Imports ========
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# ======== Load Dataset ========
# Path is relative to the notebook's working directory.
csv_path = "../data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

# Quick sanity check on what was read: (rows, columns).
print("Initial shape of dataset:", df.shape)

# Kept as the cell's last expression so the notebook renders the first rows.
df.head()

In [None]:
# ======== Basic Cleaning ========

# TotalCharges has some whitespace-only entries, so convert it to numeric;
# errors="coerce" turns anything that cannot be parsed into NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Drop incomplete rows. Note that dropna() removes a row with a missing value
# in ANY column, which includes the TotalCharges entries coerced to NaN above.
df = df.dropna()

# customerID is just an identifier, not useful for prediction.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a
# second run is a no-op instead of raising KeyError.
df = df.drop(columns=["customerID"], errors="ignore")

print("Shape after cleaning:", df.shape)

# Class balance of the target, as raw counts and then as percentages.
print("Churn distribution:\n", df["Churn"].value_counts())
print("\nPercentage distribution:")
print(df["Churn"].value_counts(normalize=True) * 100)

In [None]:
# ======== Encoding Categorical Columns ========

# Every object-dtype column is replaced by integer codes from its own
# LabelEncoder.
cat_cols = df.select_dtypes(include=["object"]).columns

# Fit one encoder per column and keep them keyed by column name, so the
# integer codes can be mapped back to the original labels later.
encoders = {name: LabelEncoder().fit(df[name]) for name in cat_cols}

# Overwrite each column with its integer codes.
# NOTE(review): LabelEncoder numbers the labels in sorted order, so features
# with more than two categories receive an arbitrary ordering that a linear
# model will treat as numeric -- consider one-hot encoding for those.
# NOTE: re-running this cell on its own rebuilds `encoders` as {} because no
# object columns are left after the first run.
for col, le in encoders.items():
    df[col] = le.transform(df[col])

In [None]:
# ======== Train-Validation Split ========

# Target is the Churn column; every other column is used as a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Hold out 20% of the rows for validation. stratify=y keeps the class
# proportions of y the same in both parts, and random_state fixes the shuffle
# so the split is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))

In [None]:
# ======== Feature Scaling ========

# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits, so nothing from the validation
# rows influences the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled = (scaler.transform(part) for part in (X_train, X_val))

In [None]:
# ======== Baseline Model: Logistic Regression ========

# Default hyperparameters. fit() returns the estimator itself, so the model
# is built and trained in one expression.
log_model = LogisticRegression().fit(X_train_scaled, y_train)

# Predicted labels for the held-out validation rows.
y_pred_log = log_model.predict(X_val_scaled)

# Compute the validation metrics first, then print them together.
log_accuracy = accuracy_score(y_val, y_pred_log)
log_report = classification_report(y_val, y_pred_log)
log_confusion = confusion_matrix(y_val, y_pred_log)

print("\n--- Logistic Regression Results ---")
print("Accuracy:", log_accuracy)
print(log_report)
print("Confusion Matrix:\n", log_confusion)

In [None]:
# ======== Improved Model: Random Forest ========

# 100 trees with a fixed random_state for reproducible results. The forest is
# trained on the same scaled features as the logistic-regression baseline.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predicted labels for the held-out validation rows.
y_pred_rf = rf_model.predict(X_val_scaled)

# Compute the validation metrics first, then print them together.
rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_report = classification_report(y_val, y_pred_rf)
rf_confusion = confusion_matrix(y_val, y_pred_rf)

print("\n--- Random Forest Results ---")
print("Accuracy:", rf_accuracy)
print(rf_report)
print("Confusion Matrix:\n", rf_confusion)