# In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    ``file_path`` is handed to ``pandas.read_csv`` unchanged, so anything
    that function accepts (a path string, a file-like object, ...) works.
    """
    frame = pd.read_csv(file_path)
    return frame

def preprocess_data(df, target_column):
    """Build an (unfitted) preprocessing transformer for ``df``.

    Args:
        df: Input DataFrame containing both the features and the target.
        target_column: Name of the column to separate out as the target.

    Returns:
        A ``(preprocessor, X, y)`` tuple where ``preprocessor`` is an unfitted
        ``ColumnTransformer``, ``X`` is ``df`` without the target column, and
        ``y`` is the target column.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Separate features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Identify numerical and categorical columns.
    # 'number' matches every numeric width (int32, float32, uint8, ...), not
    # only int64/float64; 'category' is handled alongside 'object'. Columns
    # matching neither selector are not listed in the ColumnTransformer below
    # and are therefore left out of the transformed output.
    numerical_cols = X.select_dtypes(include='number').columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Preprocessing for numerical data: fill gaps with the column mean,
    # then standardize.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical data: fill gaps with the most frequent
    # value, then one-hot encode. Categories not seen while fitting are
    # ignored at transform time instead of raising.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    return preprocessor, X, y

def transform_data(preprocessor, X):
    """Fit ``preprocessor`` on ``X`` and return the transformed ``X``.

    Note that this calls ``fit_transform``, so the preprocessor is (re)fitted
    on whatever data is passed in.
    """
    transformed = preprocessor.fit_transform(X)
    return transformed

def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into training and testing sets.

    Args:
        X: Feature data.
        y: Target data, aligned row-for-row with ``X``.
        test_size: Portion of the data held out for testing; forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        The result of ``train_test_split``: ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def main():
    """Run the interactive ETL: load a CSV, preprocess, split, and save.

    Prompts for the dataset path, the target column name, and an output
    directory, then writes ``X_train.csv``, ``X_test.csv``, ``y_train.csv``
    and ``y_test.csv`` into that directory (creating it if needed).
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    def _as_frame(matrix):
        """Return ``matrix`` as a DataFrame, densifying sparse output first."""
        # The preprocessor may return a SciPy sparse matrix when one-hot
        # columns dominate; pd.DataFrame expects array-like input.
        if hasattr(matrix, "toarray"):
            matrix = matrix.toarray()
        return pd.DataFrame(matrix)

    # Request user input for file paths and columns
    file_path = input("Enter the path to the dataset file (e.g., data/dataset.csv): ").strip()
    target_column = input("Enter the name of the target column: ")
    output_path = input("Enter the output path to save the processed data (e.g., data/): ").strip()

    # Treat the output path as a directory so that "out" and "out/" both
    # produce "out/X_train.csv" (plain string concatenation would produce
    # "outX_train.csv" for the former).
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load the dataset
    df = load_data(file_path)

    # Build the (unfitted) preprocessor and separate features from target
    preprocessor, X, y = preprocess_data(df, target_column)

    # Split BEFORE fitting so that imputation and scaling statistics are
    # learned from the training rows only (no test-set leakage).
    X_train_raw, X_test_raw, y_train, y_test = split_data(X, y)

    # transform_data fits the preprocessor on the data it is given; the test
    # set is then transformed with those already-fitted parameters.
    X_train = transform_data(preprocessor, X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    # Save the preprocessed and transformed data
    _as_frame(X_train).to_csv(output_dir / 'X_train.csv', index=False)
    _as_frame(X_test).to_csv(output_dir / 'X_test.csv', index=False)
    pd.DataFrame(y_train).to_csv(output_dir / 'y_train.csv', index=False)
    pd.DataFrame(y_test).to_csv(output_dir / 'y_test.csv', index=False)

    print("ETL process completed successfully!")

# Run the interactive ETL only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# Recorded console session from a sample run:
# Enter the path to the dataset file (e.g., data/dataset.csv):  diabetes.csv
# Enter the name of the target column:  Outcome
# Enter the output path to save the processed data (e.g., data/):  ETL_Finished
#
#
# ETL process completed successfully!
