In [3]:
# California Housing ETL Pipeline - Google Colab Version

# ✅ STEP 1: Import libraries
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from google.colab import files

# ✅ STEP 2: Define ETL Functions
def extract_data():
    """Fetch the California Housing dataset as a single DataFrame.

    Returns:
        The ``frame`` attribute of the scikit-learn dataset bunch, i.e. the
        table later passed to ``transform_data``.
    """
    print("\n📦 Extracting data...")
    # as_frame=True makes scikit-learn populate the bunch's ``frame`` attribute.
    return fetch_california_housing(as_frame=True).frame

def transform_data(df, target_col="MedHouseVal"):
    """Drop incomplete rows and standardize every feature column.

    Args:
        df: Input DataFrame containing the feature columns plus the target
            column. It is not modified.
        target_col: Name of the column to carry through unscaled.
            Defaults to ``"MedHouseVal"``.

    Returns:
        A new DataFrame with a default integer index: the scaled feature
        columns in their original order, followed by the untouched target
        column.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    print("\n🔧 Transforming data...")

    # Fail early with a readable message instead of pandas' generic one.
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in DataFrame")

    # dropna() leaves complete data unchanged, so a separate null scan
    # beforehand is unnecessary.
    clean = df.dropna()

    # Scale everything except the target.
    features = clean.drop(columns=target_col)
    scaled = StandardScaler().fit_transform(features)

    # Rebuild a frame from the scaled array; the target is attached as a raw
    # array so it lines up positionally with the new default index.
    scaled_df = pd.DataFrame(scaled, columns=features.columns)
    scaled_df[target_col] = clean[target_col].to_numpy()
    return scaled_df

def load_data(df, output_path="processed_data.csv"):
    """Write ``df`` to a CSV file and report where it went.

    Args:
        df: DataFrame to persist; its index is not written.
        output_path: Destination file path. Defaults to
            ``"processed_data.csv"``.

    Returns:
        ``output_path``, unchanged, so callers can pass it on.
    """
    print("\n💾 Saving data to disk...")
    df.to_csv(output_path, index=False)
    confirmation = f"Data saved as: {output_path}"
    print(confirmation)
    return output_path

# ✅ STEP 3: Run the ETL Pipeline
def run_etl_pipeline():
    """Run extract, transform and load in sequence.

    Returns:
        The path of the CSV file written by ``load_data``.
    """
    # Each stage feeds directly into the next: extract -> transform -> load.
    return load_data(transform_data(extract_data()))

# ✅ STEP 4: Execute the pipeline
# Runs extract -> transform -> load and keeps the path of the written CSV.
csv_path = run_etl_pipeline()

# ✅ STEP 5: Download the file
# Passes the saved CSV to google.colab's files.download helper.
print("\n⬇️ Downloading the processed CSV file...")
files.download(csv_path)



📦 Extracting data...

🔧 Transforming data...

💾 Saving data to disk...
Data saved as: processed_data.csv

⬇️ Downloading the processed CSV file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>