In [19]:
import numpy as np 
import pandas as pd 
import os

In [9]:
def load_data(filepath: str) -> pd.DataFrame:
    """
    Load telecom churn dataset.
    """
    return pd.read_csv(filepath)

df = load_data("../data/telco_churn.csv")


In [13]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean raw telecom data:
    - Remove whitespace from column names
    - Convert 'TotalCharges' to numeric
    - Drop missing or invalid rows
    """
    df.columns = df.columns.str.strip()

    # Handle TotalCharges column
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # Drop rows with missing TotalCharges (after coercion)
    df = df.dropna(subset=['TotalCharges'])
    
    #removing customer IDs
    df.drop(columns=['customerID'],inplace=True)

    #converting the target to binary
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


    #one hot encoding the categorical variables
    df = pd.get_dummies(df)
    
    return df

In [17]:
clean_data(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['customerID'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,True,False,False,True,True,...,False,True,False,False,False,True,False,False,True,False
1,0,34,56.95,1889.50,0,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,False,True,True,False,True,...,False,True,False,False,False,True,False,False,False,True
3,0,45,42.30,1840.75,0,False,True,True,False,True,...,False,False,True,False,True,False,True,False,False,False
4,0,2,70.70,151.65,1,True,False,True,False,True,...,False,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,0,False,True,False,True,False,...,True,False,True,False,False,True,False,False,False,True
7039,0,72,103.20,7362.90,0,True,False,False,True,False,...,True,False,True,False,False,True,False,True,False,False
7040,0,11,29.60,346.45,0,True,False,False,True,False,...,False,True,False,False,False,True,False,False,True,False
7041,1,4,74.40,306.60,1,False,True,False,True,True,...,False,True,False,False,False,True,False,False,False,True


In [21]:
def save_processed_data(df: pd.DataFrame, output_path: str):
    """
    Save the cleaned and processed dataset to a CSV file.

    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)


In [25]:
def run_etl(input_path: str, output_path: str):
    """
    ETL pipeline runner.
    """
    try:
        df = load_data(input_path)
        df_cleaned = clean_data(df)
        save_processed_data(df_cleaned, output_path)
        print(f"ETL pipeline complete. Processed file saved to: {output_path}")
    except Exception as e:
        print(f"ETL pipeline failed: {e}")
