# In [27]:
# Import necessary libraries
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Function to load data
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Function to preprocess data
def preprocess_data(data, target_column):
    """Split ``data`` into features and target and build an unfitted preprocessor.

    Feature columns of dtype int64/float64 are routed to a numerical
    pipeline (mean imputation, then standard scaling). Feature columns of
    dtype object/category are routed to a categorical pipeline
    (most-frequent imputation, then one-hot encoding that ignores unknown
    categories). Columns of any other dtype are assigned to neither
    pipeline.

    Args:
        data: DataFrame holding both the features and the target column.
        target_column: Name of the column to separate out as the target.

    Returns:
        A ``(preprocessor, X, y)`` tuple: the unfitted ColumnTransformer,
        the feature frame without ``target_column``, and the target column.
    """
    # Features are everything except the target column
    X = data.drop(columns=target_column)
    y = data[target_column]

    # Group the feature columns by dtype family
    cat_columns = X.select_dtypes(include=['object', 'category']).columns
    num_columns = X.select_dtypes(include=['int64', 'float64']).columns

    # Steps applied to the numerical columns
    num_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]

    # Steps applied to the categorical columns
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # One sub-pipeline per column group, numerical first
    column_transformers = [
        ('num', Pipeline(steps=num_steps), num_columns),
        ('cat', Pipeline(steps=cat_steps), cat_columns),
    ]

    return ColumnTransformer(transformers=column_transformers), X, y

# Function to create a pipeline
def create_pipeline(preprocessor):
    """Wrap ``preprocessor`` in a single-step Pipeline for the ETL process.

    The only step is registered under the name 'preprocessor'.
    """
    etl_steps = [('preprocessor', preprocessor)]
    return Pipeline(steps=etl_steps)

# Function to save transformed data
def save_transformed_data(transformed_data, output_path):
    """Save the transformed data to a CSV file.

    Args:
        transformed_data: DataFrame to write; the index is not written.
        output_path: Destination file path. Missing parent directories
            are created. A bare filename (no directory part) is written
            relative to the current working directory.
    """
    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    transformed_data.to_csv(output_path, index=False)

# Main function
def main(
    input_file_path='/content/California_data.csv',
    output_file_path='/content/transformed_data.csv',
    target_column='AveOccup',
):
    """Run the ETL flow: load a CSV, transform its features, save the result.

    Args:
        input_file_path: CSV file to read.
        output_file_path: Where the transformed feature table is written.
        target_column: Column separated from the features before
            transforming; it is not written to the output file.
    """
    # Load the data
    data = load_data(input_file_path)

    # Build the preprocessor; the returned target series is not used here
    preprocessor, X, _ = preprocess_data(data, target_column)

    # Create the pipeline, then fit it and transform the features
    pipeline = create_pipeline(preprocessor)
    X_transformed = pipeline.fit_transform(X)

    # ColumnTransformer can hand back a scipy sparse matrix when the
    # one-hot encoded output is sparse enough, and pd.DataFrame does not
    # reliably build a table from one, so densify before wrapping it.
    if hasattr(X_transformed, 'toarray'):
        X_transformed = X_transformed.toarray()

    # Convert transformed data back to a DataFrame with named columns
    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

    # Save the transformed data
    save_transformed_data(X_transformed_df, output_file_path)
    print(f"Transformed data saved to {output_file_path}")

# Run the ETL flow only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()


# Output: Transformed data saved to /content/transformed_data.csv
