In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load dataset
df = pd.read_excel("../data/raw/Telecom_churn.xlsx")

# Drop columns that are not used as features. errors='ignore' keeps the cell
# runnable when some of these columns are absent from the sheet.
drop_cols = ["CustomerID", "Count", "Country", "City", "Zip Code",
             "Lat Long", "Latitude", "Longitude", "Churn Score", "CLTV", "Churn Reason"]
df = df.drop(columns=drop_cols, errors='ignore')

# Convert numeric columns and handle missing values.
# Only the columns actually present are touched, so that the median fill
# below cannot raise a KeyError for a column the conversion loop skipped.
num_cols = ['Tenure Months', 'Monthly Charges', 'Total Charges']
present_num_cols = [col for col in num_cols if col in df.columns]
for col in present_num_cols:
    # Unparseable entries (e.g. blank strings) become NaN.
    df[col] = pd.to_numeric(df[col], errors='coerce')
if present_num_cols:
    # Impute NaN with the per-column median.
    df[present_num_cols] = df[present_num_cols].fillna(df[present_num_cols].median())

# Binary encoding for yes/no and other binary columns.
# Series.map() turns every label that is missing from the mapping into NaN,
# so the mapping must cover all labels that occur in these columns.
# 'No phone service' is encoded as 0 (no multiple lines).
binary_cols = ['Partner', 'Dependents', 'Senior Citizen', 'Phone Service', 'Multiple Lines', 'Paperless Billing']
binary_map = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0, 1: 1, 0: 0,
              'No phone service': 0}
for col in binary_cols:
    if col not in df.columns:
        continue
    encoded = df[col].map(binary_map)
    # A value that was present before mapping but is NaN afterwards is an
    # unexpected label; fail loudly instead of feeding NaN to the scaler.
    unmapped = df[col].notna() & encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected values in binary column '{col}': "
            f"{sorted(df.loc[unmapped, col].astype(str).unique())}"
        )
    df[col] = encoded

# Multi-class categorical columns (binary columns are handled separately);
# keep only the ones present in the frame.
categorical_cols = ['Gender', 'Internet Service', 'Online Security', 'Online Backup',
                    'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies',
                    'Contract', 'Payment Method', 'State']
existing_cats = [name for name in categorical_cols if name in df.columns]

# One-hot encode the categorical features, dropping the first level of each.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
if existing_cats:
    X_cat = encoder.fit_transform(df[existing_cats])
else:
    # No categorical columns: use an empty (n_rows, 0) block.
    X_cat = np.empty((len(df), 0))

# Every remaining column except the categoricals and the target columns
# is treated as a numeric feature.
non_feature_cols = existing_cats + ['Churn Value', 'Churn Label']
numerical_cols = df.columns[~df.columns.isin(non_feature_cols)]
X_num = df[numerical_cols].to_numpy()

# Combine numeric + categorical features (numeric columns first).
X_combined = np.hstack([X_num, X_cat]) if X_cat.size else X_num

# Standardize the combined matrix. Note that the scaler is fit on ALL
# columns, including the one-hot encoded ones, not only the numeric block.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

# Target variable.
# 'Churn Value' is converted with astype(int) rather than a {True: 1, False: 0}
# map: pandas does not match boolean dict keys against integer labels, so the
# map could silently produce an all-NaN target for a 0/1 integer column.
# astype(int) handles both bool and 0/1 integer columns and raises on anything
# that cannot be converted.
if 'Churn Value' in df.columns:
    y = df['Churn Value'].astype(int)
else:
    y = df['Churn Label'].map({'Yes':1, 'No':0})

print("Processed feature matrix shape:", X_scaled.shape)
print("Target vector shape:", y.shape)

# Save scaler and encoder for future use
joblib.dump(scaler, "../models/scaler_demo.pkl")
joblib.dump(encoder, "../models/encoder_demo.pkl")
print("Scaler and encoder saved.")

Processed feature matrix shape: (7043, 29)
Target vector shape: (7043,)
Scaler and encoder saved.
