In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Preprocess the raw churn export: coerce TotalCharges to numeric, drop the
# rows where that fails, encode the target and the remaining string columns
# as integers, then write the result to the processed-data folder.

# Load raw data
df = pd.read_csv("../data/raw/churn.csv")

# Fix TotalCharges column: values that cannot be parsed as numbers become NaN
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Drop rows where TotalCharges is NaN. The explicit copy gives an independent
# frame, so the column assignments below cannot raise pandas'
# SettingWithCopyWarning.
df = df.dropna(subset=["TotalCharges"]).copy()

# Encode target. Series.map turns any label missing from the dict into NaN,
# so fail loudly instead of saving a silently corrupted target column.
churn_encoded = df["Churn"].map({"Yes": 1, "No": 0})
if churn_encoded.isna().any():
    unexpected = df.loc[churn_encoded.isna(), "Churn"].unique().tolist()
    raise ValueError(f"Unexpected Churn labels: {unexpected}")
df["Churn"] = churn_encoded

# Encode categorical features (every remaining object column except the ID).
# A fresh LabelEncoder is fitted per column and kept in `encoders`, so the
# integer codes can be mapped back later with
# encoders[col].inverse_transform(...). Refitting one shared encoder would
# retain only the last column's classes.
categorical_cols = [
    col
    for col in df.select_dtypes(include="object").columns
    if col != "customerID"
]
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Final checks
print("Shape after preprocessing:", df.shape)
print(df.isnull().sum())

# Save processed dataset
df.to_csv("../data/processed/churn_processed.csv", index=False)


Shape after preprocessing: (7032, 21)
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
