In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from pandas.api.types import is_categorical_dtype

def _is_categorical_like(dtype):
    """Return True for dtypes treated as categorical: category, object or string."""
    # isinstance check replaces the deprecated pandas `is_categorical_dtype`.
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or dtype == object
        or pd.api.types.is_string_dtype(dtype)
    )


def one_hot_encode(df, columns):
    """
    One-hot encode categorical columns of a DataFrame using OneHotEncoder.

    The encoder is created with ``drop='first'``, so the first category of
    each column is dropped from the output. The input DataFrame is not
    modified; a new DataFrame is returned.

    Parameters:
        df (pandas.DataFrame): The DataFrame to preprocess.
        columns (str or iterable of str): The name(s) of the categorical
            columns to encode. A single name may be passed as a plain string.

    Returns:
        pandas.DataFrame: A new DataFrame in which the given columns are
            replaced by their one-hot encoded columns (appended at the end,
            named by ``encoder.get_feature_names_out``). The row index of
            ``df`` is preserved. If ``columns`` is empty, a copy of ``df``
            is returned unchanged.

    Raises:
        ValueError: If any of the specified columns do not exist in the DataFrame.
        ValueError: If any of the specified columns do not contain categorical data.
    """
    # A bare string would otherwise be split into characters by set()/list().
    columns = [columns] if isinstance(columns, str) else list(columns)

    # Check that all specified columns exist in the DataFrame
    missing_columns = set(columns) - set(df.columns)
    if missing_columns:
        raise ValueError(f"Columns {missing_columns} not found in DataFrame")

    # Check that all specified columns contain categorical data
    non_categorical_columns = [
        col for col in columns if not _is_categorical_like(df[col].dtype)
    ]
    if non_categorical_columns:
        raise ValueError(f"Columns {non_categorical_columns} do not contain categorical data")

    # Nothing to encode: avoid fitting the encoder on a zero-column frame.
    if not columns:
        return df.copy()

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; try the new keyword first and fall back for older releases.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)
    encoded_array = encoder.fit_transform(df[columns])

    # Reuse df's index so the axis=1 concat lines rows up positionally; with a
    # default RangeIndex, a filtered/re-indexed df would be misaligned and
    # padded with NaN.
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(columns),
        index=df.index,
    )

    # Drop the original categorical columns and append the encoded ones.
    return pd.concat([df.drop(columns=columns), encoded_df], axis=1)



In [None]:
# One-hot encode the 'Sex', 'Housing' and 'Purpose' columns and rebind `df`
# to the returned DataFrame. `df` is created in an earlier cell that is not
# visible here — presumably these three columns are categorical/object; verify.
df = one_hot_encode(df, ['Sex', 'Housing', 'Purpose'])
# Display the first rows to inspect the encoded columns.
df.head()