In [1]:
# 02_preprocessing.ipynb
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import joblib

# Load dataset
df = pd.read_excel("../data/raw/Telecom_churn.xlsx")

# Drop irrelevant columns
drop_cols = ["CustomerID", "Count", "Country", "City", "Zip Code",
             "Lat Long", "Latitude", "Longitude", "Churn Score", "CLTV", "Churn Reason"]
df = df.drop(columns=drop_cols, errors='ignore')

# Binary encoding
binary_cols = ['Partner', 'Dependents', 'Senior Citizen', 'Phone Service', 'Multiple Lines', 'Paperless Billing']
for col in binary_cols:
    if col in df.columns:
        df[col] = df[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0, 1:1, 0:0})

# One-hot encode categorical features
categorical_cols = ['Gender', 'Internet Service', 'Online Security', 'Online Backup',
                    'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies',
                    'Contract', 'Payment Method']
existing_cats = [col for col in categorical_cols if col in df.columns]
encoder = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(df[existing_cats]) if existing_cats else np.array([]).reshape(len(df),0)

# Numerical columns
numerical_cols = df.drop(existing_cats + ['Churn Value', 'Churn Label'], axis=1, errors='ignore').columns
X_num = df[numerical_cols].values

# Combine numerical + categorical
X_combined = np.hstack([X_num, X_cat]) if X_cat.size else X_num

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

# Target
if 'Churn Value' in df.columns:
    y = df['Churn Value'].map({True:1, False:0})
else:
    y = df['Churn Label'].map({'Yes':1, 'No':0})

print("Processed feature matrix shape:", X_scaled.shape)
print("Target vector shape:", y.shape)

# Save scaler and encoder for future use
joblib.dump(scaler, "../models/scaler_demo.pkl")
joblib.dump(encoder, "../models/encoder_demo.pkl")
print("Scaler and encoder saved.")


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'