In [4]:
# Import necessary libraries
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [12]:
# Function to load data
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV to read; passed unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame

# Note: load_data is referenced here without being called, so this prints the
# function object's repr rather than any loaded data.
print(load_data)

<function load_data at 0x7b72f3d95ee0>


In [18]:
# Function to preprocess data
def preprocess_data(data, target_column):
    """Split ``data`` into features and target and build an unfitted preprocessor.

    Args:
        data: DataFrame containing the feature columns and the target column.
        target_column: Label of the column to separate out as the target.

    Returns:
        A ``(preprocessor, X, y)`` tuple: ``preprocessor`` is an unfitted
        ``ColumnTransformer``, ``X`` is ``data`` without ``target_column`` and
        ``y`` is ``data[target_column]``.
    """
    # Feature matrix and target vector.
    X = data.drop(columns=target_column)
    y = data[target_column]

    # Columns are grouped purely by dtype; a column whose dtype matches
    # neither list is not referenced by either transformer below.
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

    # Numerical branch: fill gaps with the column mean, then standardise.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]

    # Categorical branch: fill gaps with the most frequent value, then
    # one-hot encode (categories unseen at fit time are ignored on transform).
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Route each column group through its own sub-pipeline.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numerical_cols),
            ('cat', Pipeline(steps=categorical_steps), categorical_cols),
        ]
    )

    return preprocessor, X, y

# Small sample frame defined in this cell so that it runs on a fresh kernel
# instead of relying on a `data` variable left over from another cell.
# A missing age is included on purpose to exercise the imputer.
data = pd.DataFrame({
    'age': [25.0, 30.0, 22.0, 35.0, None],
    'gender': ['Male', 'Female', 'Male', 'Female', 'Female'],
    'income': [50000, 60000, 45000, 70000, 80000],
    'target': [0, 1, 0, 1, 0],
})

target_column = 'target'

# Preprocess the data (the returned preprocessor is built but not fitted here)
preprocessor, X, y = preprocess_data(data, target_column)

# Print the results
print("Features (X):")
print(X)

print("\nTarget (y):")
print(y)

print("\nPreprocessor:")
print(preprocessor)

Features (X):
    age  gender  income
0  25.0    Male   50000
1  30.0  Female   60000
2  22.0    Male   45000
3  35.0  Female   70000
4   NaN  Female   80000

Target (y):
0    0
1    1
2    0
3    1
4    0
Name: target, dtype: int64

Preprocessor:
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 Index(['age', 'income'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['gender'], dtype='object'))])


In [19]:
# Function to create a pipeline
def create_pipeline(preprocessor):
    """Wrap ``preprocessor`` in a one-step ``Pipeline`` for the ETL process.

    Args:
        preprocessor: Transformer to install as the step named
            ``'preprocessor'``.

    Returns:
        A ``Pipeline`` whose only step is ``('preprocessor', preprocessor)``.
    """
    etl_steps = [('preprocessor', preprocessor)]
    return Pipeline(steps=etl_steps)

# Function to save transformed data
def save_transformed_data(transformed_data, output_path):
    """Save the transformed data to a CSV file.

    Args:
        transformed_data: Object exposing ``to_csv`` (a DataFrame in this
            notebook).
        output_path: Destination file path. Missing parent directories are
            created; a bare filename is written to the current working
            directory.
    """
    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    transformed_data.to_csv(output_path, index=False)

# Main function
def main(input_file_path='/content/California_data.csv',
         output_file_path='/content/transformed_data.csv',
         target_column='AveOccup'):
    """Run the ETL flow: load a CSV, transform its features, save the result.

    Args:
        input_file_path: CSV file to load.
        output_file_path: Where the transformed feature table is written.
        target_column: Column removed from the features before transforming.
    """
    # Load the data
    data = load_data(input_file_path)

    # Preprocess the data; the target series is not written out, so it is
    # discarded here.
    preprocessor, X, _ = preprocess_data(data, target_column)

    # Create pipeline
    pipeline = create_pipeline(preprocessor)

    # Fit and transform the data
    X_transformed = pipeline.fit_transform(X)

    # ColumnTransformer can return a SciPy sparse matrix when the one-hot
    # output is sparse enough; pd.DataFrame cannot label such a matrix with
    # `columns=`, so convert it to a dense array first.
    if hasattr(X_transformed, 'toarray'):
        X_transformed = X_transformed.toarray()

    # Convert transformed data back to DataFrame
    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

    # Save the transformed data
    save_transformed_data(X_transformed_df, output_file_path)
    print(f"Transformed data saved to {output_file_path}")

# Run the ETL flow. The guard keeps main() from running if this code is
# imported as a module; when the cell itself is executed the condition held
# (see the "Transformed data saved" output), so main() runs on every execution.
if __name__ == "__main__":
    main()

Transformed data saved to /content/transformed_data.csv


In [21]:

# Function to create a pipeline
def create_pipeline(preprocessor):
    """Build the ETL ``Pipeline`` consisting of a single preprocessing step.

    NOTE(review): this repeats the ``create_pipeline`` definition from the
    previous cell; the later definition replaces the earlier one in the kernel.

    Args:
        preprocessor: Transformer registered under the step name
            ``'preprocessor'``.

    Returns:
        The newly constructed ``Pipeline``.
    """
    only_step = ('preprocessor', preprocessor)
    etl_pipeline = Pipeline(steps=[only_step])
    return etl_pipeline

# Function to save transformed data
def save_transformed_data(transformed_data, output_path):
    """Save the transformed data to a CSV file.

    NOTE(review): this repeats the ``save_transformed_data`` definition from
    the previous cell; the later definition replaces the earlier one.

    Args:
        transformed_data: Object exposing ``to_csv`` (a DataFrame in this
            notebook).
        output_path: Destination file path. Missing parent directories are
            created; a bare filename is written to the current working
            directory.
    """
    # A bare filename has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so skip directory creation in that case.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    transformed_data.to_csv(output_path, index=False)

# Main function
def main(input_file_path='/content/California_data.csv',
         output_file_path='/content/transformed_data.csv',
         target_column='AveOccup'):
    """Run the ETL flow with progress printouts at each stage.

    Loads a CSV, separates the target column, fits and applies the
    preprocessing pipeline to the features, and writes the transformed
    features to a CSV file, printing a preview after each stage.

    NOTE(review): this repeats ``main`` from the previous cell with extra
    printouts; the later definition replaces the earlier one in the kernel.

    Args:
        input_file_path: CSV file to load.
        output_file_path: Where the transformed feature table is written.
        target_column: Column removed from the features before transforming.
    """
    # Load the data
    data = load_data(input_file_path)
    print("Loaded Data:")
    print(data.head())  # Print the first few rows of the loaded data

    # Preprocess the data; the target series is not written out, so it is
    # discarded here.
    preprocessor, X, _ = preprocess_data(data, target_column)

    print("\nFeatures (X) before transformation:")
    print(X.head())  # Print the first few rows of the features

    # Create pipeline
    pipeline = create_pipeline(preprocessor)

    # Fit and transform the data
    X_transformed = pipeline.fit_transform(X)

    # ColumnTransformer can return a SciPy sparse matrix when the one-hot
    # output is sparse enough; pd.DataFrame cannot label such a matrix with
    # `columns=`, so convert it to a dense array first.
    if hasattr(X_transformed, 'toarray'):
        X_transformed = X_transformed.toarray()

    # Convert transformed data back to DataFrame
    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

    print("\nTransformed Features:")
    print(X_transformed_df.head())  # Print the first few rows of the transformed features

    # Save the transformed data
    save_transformed_data(X_transformed_df, output_file_path)
    print(f"\nTransformed data saved to {output_file_path}")

# Run the verbose ETL flow. The guard keeps main() from running if this code
# is imported as a module; when the cell itself is executed the condition
# held (see the printed output), so main() runs on every execution.
if __name__ == "__main__":
    main()

Loaded Data:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  

Features (X) before transformation:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  Latitude  Longitude  \
0  8.3252      41.0  6.984127   1.023810       322.0     37.88    -122.23   
1  8.3014      21.0  6.238137   0.971880      2401.0     37.86    -122.22   
2  7.2574      52.0  8.288136   1.073446       496.0     37.85    -122.24   
3  5.6431      52