In [18]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [44]:
def impute_missing_values(df):
    """
    Impute missing values in the DataFrame.

    Numeric columns are filled with their column mean and object-dtype
    columns with the constant string "NA", both through scikit-learn's
    SimpleImputer. Columns of any other dtype are not imputed.

    Parameters:
    df (pd.DataFrame): The DataFrame with missing values. The caller's
        object is not modified.

    Returns:
    pd.DataFrame: A new DataFrame with the same index and column labels as
        ``df`` and the missing values imputed. Numeric columns that contain
        no observed value at all have no mean to fall back on and are left
        as NaN.
    """
    # replace None with np.nan for consistency
    df = df.replace({None: np.nan})

    # Work on a copy so the frame handed in by the caller is never touched
    df_imputed = df.copy(deep=True)

    # Separate columns by type
    numeric_cols = df.select_dtypes(include=["number"]).columns
    categorical_cols = df.select_dtypes(include=["object"]).columns

    # Numerical columns: mean imputation
    if numeric_cols.empty:
        print("No numeric columns to impute.")
    else:
        print(f"Numeric columns to impute: {numeric_cols.tolist()}")

        # With strategy="mean", SimpleImputer discards columns that have no
        # observed value. Its output would then have fewer columns than
        # numeric_cols and the DataFrame wrapper below would raise a shape
        # mismatch, so only columns holding at least one value are fitted.
        has_values = df[numeric_cols].notna().any().to_numpy()
        fillable_cols = numeric_cols[has_values]
        empty_cols = numeric_cols[~has_values]

        if not empty_cols.empty:
            print(
                "Numeric columns with no observed values (left as NaN): "
                f"{empty_cols.tolist()}"
            )

        if not fillable_cols.empty:
            numeric_imputer = SimpleImputer(strategy="mean")
            # fit_transform returns a bare array; re-wrap it so the values
            # line up with the original index and column labels on assignment
            df_imputed[fillable_cols] = pd.DataFrame(
                numeric_imputer.fit_transform(df[fillable_cols]),
                columns=fillable_cols,
                index=df.index
            )

    # Categorical columns: constant "NA" placeholder
    if categorical_cols.empty:
        print("No categorical columns to impute.")
    else:
        print(f"Categorical columns to impute: {categorical_cols.tolist()}")
        categorical_imputer = SimpleImputer(strategy="constant", fill_value="NA")
        df_imputed[categorical_cols] = pd.DataFrame(
            categorical_imputer.fit_transform(df[categorical_cols]),
            columns=categorical_cols,
            index=df.index
        )

    return df_imputed


def one_hot_encoding(df):
    """
    One-hot encode the categorical columns of a DataFrame.

    Delegates to ``pd.get_dummies`` with ``drop_first=True``, so each encoded
    column yields one indicator column per category except its first one.

    Parameters:
    df (pd.DataFrame): The DataFrame to encode.

    Returns:
    pd.DataFrame: The encoded DataFrame, as returned by ``pd.get_dummies``.
    """
    encoded = pd.get_dummies(data=df, drop_first=True)
    return encoded


In [46]:
def test_impute_missing_values():
    """Check that impute_missing_values fills every gap with the expected value."""
    # Sample frame mixing None and np.nan as missing markers
    data = {
        "A": [1, 2, None, 4],
        "B": ["cat", None, "dog", np.nan],
        "C": [None, 2.5, 3.5, np.nan],
    }
    df = pd.DataFrame(data)

    # Impute missing values
    df_imputed = impute_missing_values(df)

    # No missing values may remain in any column
    assert df_imputed["A"].isnull().sum() == 0
    assert df_imputed["B"].isnull().sum() == 0
    assert df_imputed["C"].isnull().sum() == 0

    # Numeric gaps take the mean of the observed values in their column
    assert np.allclose(df_imputed["A"].astype(float), [1, 2, 7 / 3, 4])
    assert np.allclose(df_imputed["C"].astype(float), [3.0, 2.5, 3.5, 3.0])

    # Text gaps receive the "NA" placeholder; observed values are untouched
    assert df_imputed["B"].tolist() == ["cat", "NA", "dog", "NA"]

    # Column labels and order are preserved
    assert list(df_imputed.columns) == ["A", "B", "C"]

    # The input frame must still contain its original gaps
    assert df["A"].isnull().sum() == 1
    assert df["B"].isnull().sum() == 2
    assert df["C"].isnull().sum() == 2

test_impute_missing_values()

Numeric columns to impute: ['A', 'C']
Categorical columns to impute: ['B']


In [63]:
def test_one_hot_encoding():
    """Check that one_hot_encoding expands column A and drops its first category."""
    # Sample frame with one text column (4 distinct values) and one numeric column
    data = {
        "A": ["cat", "mouse", "dog", "horse"],
        "B": [1, 2, 3, 4]
    }
    df = pd.DataFrame(data)

    # Perform one-hot encoding
    df_encoded = one_hot_encoding(df)

    # "cat" sorts first among the categories, so drop_first removes its indicator
    assert "A_cat" not in df_encoded.columns
    assert "A_dog" in df_encoded.columns
    assert "A_horse" in df_encoded.columns
    assert "A_mouse" in df_encoded.columns

    # The source text column is replaced by its indicators; B passes through as-is
    assert "A" not in df_encoded.columns
    assert df_encoded["B"].tolist() == [1, 2, 3, 4]

    # 1 pass-through column (B) + 3 indicator columns (4 categories minus the dropped first)
    assert df_encoded.shape[1] == 4

test_one_hot_encoding()