# Mean / median imputation in Scikit-learn 
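We will use the training data (`train.csv`) from the Kaggle
[House Prices dataset](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv).
The loading cell below looks the uploaded file up under the name `houseprice.csv`.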

In [None]:
import io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Prompt for a file upload in Google Colab.
# The result is indexed by file name further down (uploaded['houseprice.csv']).
from google.colab import files
uploaded = files.upload()

In [None]:
# Restrict the dataset to these columns.
# "SalePrice" is the target; the remaining columns are the predictors.
cols_to_use = [
    "TotalBsmtSF",
    "GrLivArea",
    "BsmtUnfSF",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

In [None]:
# Load the House Prices dataset from the file uploaded above.
# `uploaded` is keyed by the uploaded file's name. Prefer "houseprice.csv", but
# fall back to whatever was uploaded (e.g. Kaggle's original "train.csv") so the
# cell does not fail with a KeyError when the file was not renamed.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select the CSV.")
csv_name = "houseprice.csv" if "houseprice.csv" in uploaded else next(iter(uploaded))

# Only the columns listed in cols_to_use are read.
data = pd.read_csv(io.BytesIO(uploaded[csv_name]), usecols=cols_to_use)
data.head()

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),  # predictors
    data["SalePrice"],  # target
    test_size=0.3,
    random_state=42,
)

# Show the (rows, columns) of each split.
X_train.shape, X_test.shape

In [None]:
# Fraction of missing values in each column of the training set.

X_train.isnull().mean()

In [None]:
# List the training-set columns that contain at least one missing value.
vars_to_impute = X_train.columns[X_train.isnull().any()].tolist()
vars_to_impute

# SimpleImputer - default

In [None]:
# Median imputation: missing values are replaced with each column's median.
imputer = SimpleImputer(strategy="median")
# Fit on the train set only, so the medians are learned from the training rows
# and never from the test rows.
imputer.fit(X_train)

**For Mean Imputation use:**
```python 
imputer = SimpleImputer(strategy="mean")
```

In [None]:
# Medians learned by the imputer during fit, one value per column of X_train.
imputer.statistics_

In [None]:
# Sanity check: the medians computed directly with pandas should match
# the values stored in imputer.statistics_.
X_train.median()

In [None]:
# Impute both sets with the medians learned from the train set.
# With its default output setting, SimpleImputer.transform returns NumPy arrays,
# so the column names and the row index are dropped here.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

X_train

In [None]:
# Convert the imputed train set back into a DataFrame.
# The imputed array carries no labels, so restore the column names from the
# imputer and the row index from y_train. Both came from the same
# train_test_split call, so they describe the same rows; without the index,
# X_train would be renumbered 0..n-1 and no longer line up with y_train.
X_train = pd.DataFrame(
    X_train,
    columns=imputer.get_feature_names_out(),
    index=y_train.index,
)

X_train.head()

In [None]:
# Plot a histogram of every variable after the median imputation
# to inspect the resulting distributions.

X_train.hist(bins=50, figsize=(10, 10))
plt.show()

# SimpleImputer - dataframe

In [None]:
# Start again from the raw data: hold out 30% of the rows as a test set,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),  # predictors
    data["SalePrice"],  # target
    test_size=0.3,
    random_state=42,
)

# Show the (rows, columns) of each split.
X_train.shape, X_test.shape

In [None]:
# Mean imputation; set_output(transform="pandas") makes transform() return a
# DataFrame, so no manual conversion from a NumPy array is needed afterwards.
imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")
# Learn the column means from the train set only.
imputer.fit(X_train)

In [None]:
# Means learned by the imputer during fit, one value per column of X_train.
imputer.statistics_

In [None]:
# Impute both sets with the means learned from the train set.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# The imputer was configured with set_output(transform="pandas"),
# so the result is already a DataFrame.
X_train.head()

# SimpleImputer - feature subsets

In [None]:
# Start again from the raw data: hold out 30% of the rows as a test set,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),  # predictors
    data["SalePrice"],  # target
    test_size=0.3,
    random_state=42,
)

# Show the (rows, columns) of each split.
X_train.shape, X_test.shape

### Impute Specific Columns with different strategies

- `Mean` -> **LotFrontage** 
- `Median` -> **MasVnrArea** & **GarageYrBlt** 

In [None]:
# Apply a different imputation strategy to each group of columns:
# the mean for LotFrontage, the median for MasVnrArea and GarageYrBlt.
imputer = ColumnTransformer(
    transformers=[
        ("mean_imputer", SimpleImputer(strategy="mean"), ["LotFrontage"]),
        (
            "median_imputer",
            SimpleImputer(strategy="median"),
            ["MasVnrArea", "GarageYrBlt"],
        ),
    ],
    remainder="passthrough",  # keep the columns not listed above, untransformed, in the output
    # verbose_feature_names_out=False  # uncomment to remove the prefix from the output column names
)

In [None]:
# Make transform() return a pandas DataFrame.
imputer.set_output(transform="pandas")

In [None]:
# Fit every imputer on its own columns, using the train set only.
imputer.fit(X_train)

In [None]:
# Inspect the (name, transformer, columns) tuples passed to the ColumnTransformer.
# This is the list given to the constructor; the fitted imputers are accessed
# through named_transformers_.
imputer.transformers

In [None]:
# Mean learned by the fitted "mean_imputer" for LotFrontage.
imputer.named_transformers_["mean_imputer"].statistics_

In [None]:
# Medians learned by the fitted "median_imputer" for MasVnrArea and GarageYrBlt.
imputer.named_transformers_["median_imputer"].statistics_

In [None]:
# Impute the train set with the statistics learned during fit.
X_train = imputer.transform(X_train)

# Inspect the resulting DataFrame. The column names carry a prefix unless
# verbose_feature_names_out=False is set on the ColumnTransformer.
X_train.head()

In [None]:
# Impute the test set with the statistics learned from the train set.
X_test = imputer.transform(X_test)
X_test.head()

In [None]:
# Count the missing values left in each column of the imputed test set.
# Columns kept via remainder="passthrough" are not imputed, so any NaNs
# they contain would still show up here.
X_test.isnull().sum()