In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from pathlib import Path
import joblib
import argparse



In [6]:
# Colab-only step: `google.colab` is available only inside a Google Colab
# runtime, so this cell will not run in a plain Jupyter kernel.
from google.colab import files
# Prompts for a file upload. The result is used further down via
# `uploaded.keys()` to recover the name of the uploaded CSV.
uploaded = files.upload()


Saving customer_churn_dataset-testing-master.csv to customer_churn_dataset-testing-master.csv


In [8]:

# Numeric feature columns (routed to the scaling branch of the preprocessor).
num_cols = [
    "Age",
    "Tenure",
    "Usage Frequency",
    "Support Calls",
    "Payment Delay",
    "Total Spend",
    "Last Interaction",
]

# Categorical feature columns (routed to the one-hot encoding branch).
cat_cols = [
    "Gender",
    "Subscription Type",
    "Contract Length",
]

# Step 4: Build preprocessing pipeline
def build_pipeline(num_cols, cat_cols):
    """Assemble the (unfitted) preprocessing transformer.

    Parameters
    ----------
    num_cols : list of str
        Columns sent through the ``"num"`` branch, which applies a
        ``StandardScaler`` (step name ``"scaler"``).
    cat_cols : list of str
        Columns sent through the ``"cat"`` branch, which applies a
        ``OneHotEncoder(handle_unknown="ignore")`` (step name ``"encoder"``).

    Returns
    -------
    ColumnTransformer
        Transformer combining both branches. Columns not named in either list
        are not passed to any branch.
    """
    scaling_branch = Pipeline(steps=[("scaler", StandardScaler())])
    encoding_branch = Pipeline(
        steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
    )

    # Branch names ("num"/"cat") become prefixes of the output feature names,
    # so they are kept exactly as they were.
    branches = [
        ("num", scaling_branch, num_cols),
        ("cat", encoding_branch, cat_cols),
    ]
    return ColumnTransformer(transformers=branches)

# Step 5: ETL function
def main(args, num_cols, cat_cols):
    """Run the churn ETL: load the CSV, preprocess features, persist results.

    Parameters
    ----------
    args : object
        Any object exposing three attributes:
        ``input`` (path of the CSV to read), ``output`` (path of the parquet
        file to write) and ``model_out`` (path where the fitted preprocessing
        pipeline is dumped with joblib).
    num_cols : list of str
        Numeric feature columns, forwarded to ``build_pipeline``.
    cat_cols : list of str
        Categorical feature columns, forwarded to ``build_pipeline``.

    Returns
    -------
    None
        Results are written to ``args.output`` and ``args.model_out``.

    Raises
    ------
    KeyError
        If the CSV lacks the ``"Churn"`` or ``"CustomerID"`` column.
    ValueError
        If ``"Churn"`` cannot be cast to ``int`` (e.g. it contains missing
        values).
    """
    print(f" Loading dataset from: {args.input}")
    df = pd.read_csv(args.input)

    print(" Preparing features and target...")
    # The identifier and the target are excluded from the feature matrix.
    X = df.drop(columns=["Churn", "CustomerID"])
    y = df["Churn"].astype(int)

    print("Building preprocessing pipeline...")
    pipe = build_pipeline(num_cols, cat_cols)

    print("Transforming the dataset...")
    X_clean = pipe.fit_transform(X)

    print("Creating cleaned DataFrame...")
    # The transformer may hand back a sparse matrix (one-hot output); densify
    # it in that case so it can be wrapped in a regular DataFrame.
    df_clean = pd.DataFrame(X_clean.toarray() if hasattr(X_clean, "toarray") else X_clean,
                            columns=pipe.get_feature_names_out())
    # `.values` drops the original index so the target aligns positionally
    # with the freshly built frame.
    df_clean["Churn"] = y.values

    print(f"Saving cleaned data to: {args.output}")
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    df_clean.to_parquet(args.output)

    print(f" Saving preprocessing pipeline to: {args.model_out}")
    # The pipeline may be saved to a different directory than the parquet
    # file; make sure that directory exists too, otherwise the dump fails.
    Path(args.model_out).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, args.model_out)

    print(" ✅ ETL process completed successfully.")

# Step 6: Simulate argparse for Colab
class Args:
    """Stand-in for an ``argparse`` namespace when running inside the notebook."""

    # First key of the upload result, i.e. the uploaded file's name.
    input = [*uploaded.keys()][0]
    # Destination of the cleaned dataset.
    output = "outputs/churn_clean.parquet"
    # Destination of the serialised preprocessing pipeline.
    model_out = "outputs/preprocess_pipeline.joblib"

# Instantiate the stand-in namespace; `main` only reads its attributes.
args = Args()

# Step 7: Run ETL
# Reads the uploaded CSV, then writes the cleaned parquet file and the
# fitted preprocessing pipeline to the paths configured on `args`.
main(args, num_cols, cat_cols)


 Loading dataset from: customer_churn_dataset-testing-master.csv
 Preparing features and target...
Building preprocessing pipeline...
Transforming the dataset...
Creating cleaned DataFrame...
Saving cleaned data to: outputs/churn_clean.parquet
 Saving preprocessing pipeline to: outputs/preprocess_pipeline.joblib
 ✅ ETL process completed successfully.
