### DATA PIPELINE DEVELOPMENT

In [2]:
# etl_pipeline.py

import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# 1. Extract - Load the data
def extract_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv`` with default parsing options.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    raw_frame = pd.read_csv(filepath_or_buffer=file_path)
    return raw_frame


# 2. Transform - Preprocess the data
def transform_data(df):
    """Impute, scale and one-hot encode a raw passenger DataFrame.

    Steps:
      1. Drop the identifier-like columns ``PassengerId``, ``Name``,
         ``Ticket`` and ``Cabin`` -- each only if it is present.
      2. Set the ``Survived`` column aside when it exists so that it is
         not scaled or encoded.
      3. ``int64``/``float64`` columns: fill missing values with the column
         mean, then standardize. ``object`` columns: fill missing values
         with the most frequent value, then one-hot encode.
      4. Rebuild a DataFrame using the feature names reported by the fitted
         transformer and append ``Survived`` as the last column.

    The preprocessor is fitted on the very frame it transforms, so the
    imputation values and scaling statistics come from ``df`` itself.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified; ``drop``
            returns a new frame.

    Returns:
        A new ``pandas.DataFrame`` with a fresh 0..n-1 index containing the
        processed features and, when it was present in the input, the
        untouched ``Survived`` column.
    """
    # errors='ignore' makes the drop tolerant of inputs that lack some of
    # these columns; without it pandas raises KeyError on the first miss.
    df = df.drop(
        columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
        errors='ignore',
    )

    # Separate the target (if any) so it passes through unchanged.
    if 'Survived' in df.columns:
        target = df['Survived']
        df = df.drop(columns=['Survived'])
    else:
        target = None

    # NOTE: columns of any other dtype (e.g. bool, datetime) match neither
    # selector and are dropped by ColumnTransformer's default remainder.
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    preprocessor = _build_preprocessor(numeric_features, categorical_features)

    processed_data = preprocessor.fit_transform(df)
    feature_names = preprocessor.get_feature_names_out()

    # Rebuild a DataFrame with column names (default RangeIndex).
    processed_df = pd.DataFrame(processed_data, columns=feature_names)

    # Re-attach the target; reset_index aligns it with the new RangeIndex
    # even when the input frame carried a non-default index.
    if target is not None:
        processed_df['Survived'] = target.reset_index(drop=True)

    return processed_df


def _build_preprocessor(numeric_features, categorical_features):
    """Return an unfitted ColumnTransformer for the given column groups."""
    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    return ColumnTransformer([
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ])


# 3. Load - Save the transformed data
def load_data(data, output_path):
    """Write ``data`` to ``output_path`` as CSV and report the destination.

    Args:
        data: Object exposing ``to_csv`` (a ``pandas.DataFrame`` in this
            pipeline). The index is not written to the file.
        output_path: Destination path for the CSV file.
    """
    data.to_csv(path_or_buf=output_path, index=False)
    print(f"Data saved to {output_path}")


# Main Function
def main(input_path='Titanic-Dataset.csv', output_path='transformed_data.csv'):
    """Run the extract -> transform -> load pipeline once.

    Args:
        input_path: CSV file to read. Defaults to 'Titanic-Dataset.csv',
            the path that was previously hard-coded.
        output_path: Where the transformed CSV is written. Defaults to
            'transformed_data.csv', the path that was previously hard-coded.
    """
    df = extract_data(input_path)
    transformed_df = transform_data(df)
    load_data(transformed_df, output_path)


# Run the pipeline only when this file is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()


Data saved to transformed_data.csv
