In [1]:
import numpy as np
import pandas as pd

In [2]:
import sys
from pathlib import Path

# Resolve the project root as the parent of the current working directory.
# NOTE(review): assumes the kernel's working directory is a direct subfolder of
# the repository — confirm; otherwise the "src exists" check below prints False.
PROJECT_ROOT = Path.cwd().resolve().parent

# Register the root on the import path only once, so re-running this cell in a
# live kernel does not keep stacking duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("PROJECT_ROOT:", PROJECT_ROOT)
print("src exists:", (PROJECT_ROOT / "src").exists())

PROJECT_ROOT: C:\Users\User\Predicting-Customer-Churn-in-Telecom_worksheet
src exists: True


In [3]:
from src.config import DATA_RAW, DATA_PROCESSED
from src.data_preprocessing import load_raw_telco, clean_telco
from src.feature_engineering import build_processed_frame

In [4]:
# Step 1: read the raw Telco data from DATA_RAW, then pass it through the
# project's cleaning routine.
df_raw = load_raw_telco(DATA_RAW)
df_clean = clean_telco(df_raw)

# Show the cleaned frame's dimensions together with its column labels.
(df_clean.shape, df_clean.columns)

((7043, 21),
 Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
        'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
       dtype='object'))

In [5]:
# Step 2: derive the schema-locked processed frame from the cleaned data.
df_processed = build_processed_frame(df_clean)

# Show the frame's dimensions next to its first ten column labels.
(df_processed.shape, df_processed.columns[:10])

((7043, 28),
 Index(['customerID', 'Churn_Yes', 'SeniorCitizen', 'tenure', 'MonthlyCharges',
        'TotalCharges', 'charges_ratio', 'gender_Male', 'Partner_Yes',
        'Dependents_Yes'],
       dtype='object'))

In [6]:
# Display every column label of the processed frame (the previous cell showed
# only the first ten).
df_processed.columns

Index(['customerID', 'Churn_Yes', 'SeniorCitizen', 'tenure', 'MonthlyCharges',
       'TotalCharges', 'charges_ratio', 'gender_Male', 'Partner_Yes',
       'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes',
       'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'HighSpender', 'HighChurnRisk'],
      dtype='object')

In [7]:
# 3) Quick sanity checks
# Verify the processed frame's column layout against the schema constants
# exported by src.config.
from src.config import FEATURE_COLUMNS, LABEL_COL, ID_COL

# The first two columns must be the identifier followed by the label.
assert list(df_processed.columns[:2]) == [ID_COL, LABEL_COL], (
    f"Expected leading columns {[ID_COL, LABEL_COL]}, "
    f"got {list(df_processed.columns[:2])}"
)

# Every remaining column must match FEATURE_COLUMNS, in the same order.
assert list(df_processed.columns[2:]) == FEATURE_COLUMNS, (
    "Columns after the ID/label pair differ from FEATURE_COLUMNS: "
    f"got {list(df_processed.columns[2:])}"
)

# Leakage guard: the label must never be listed as a feature. Checking the
# configured LABEL_COL rather than a hard-coded literal keeps this assertion
# in sync with src.config.
assert LABEL_COL not in FEATURE_COLUMNS, (
    f"Label column {LABEL_COL!r} must not appear in FEATURE_COLUMNS"
)
print("✅ Schema checks passed.")

✅ Schema checks passed.


In [8]:
# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(df_processed["TotalCharges"], errors="coerce")

# Replace the NaN entries with the median of the parsed values and write the
# column back to the frame.
total_charges_median = total_charges_numeric.median()
df_processed["TotalCharges"] = total_charges_numeric.fillna(total_charges_median)

In [9]:
# Step 4: persist the processed frame as CSV at DATA_PROCESSED.
output_dir = DATA_PROCESSED.parent
output_dir.mkdir(parents=True, exist_ok=True)  # create missing folders; no error if present
df_processed.to_csv(DATA_PROCESSED, index=False)  # leave the row index out of the file
print(f"✅ Saved: {DATA_PROCESSED}")

✅ Saved: C:\Users\User\Predicting-Customer-Churn-in-Telecom_worksheet\data\processed\telco_churn_processed.csv
