## Arbitrary value imputation - Pandas
We will use the training data (`train.csv`) from the Kaggle
[Housing Dataset](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv).

In [None]:
import io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
# Google Colab helper: ask the user to upload the dataset file and keep the
# result in `uploaded`. A later cell indexes `uploaded` by file name and wraps
# the value in io.BytesIO, i.e. it is used as a {file name: file bytes} mapping.
from google.colab import files
uploaded = files.upload()

In [None]:
# Work with a small subset of the dataset: six predictors plus the target.
cols_to_use = [
    # predictors
    "TotalBsmtSF", "GrLivArea", "BsmtUnfSF",
    "LotFrontage", "MasVnrArea", "GarageYrBlt",
    # target
    "SalePrice",
]

In [None]:
# Load the House Prices dataset from the bytes uploaded in the previous cell,
# reading only the columns listed in `cols_to_use`.
# Prefer the expected file name; if it is absent and exactly one file was
# uploaded (e.g. the Kaggle download kept its original name, "train.csv"),
# read that file instead of failing with a bare KeyError.
csv_name = "houseprice.csv"
if csv_name not in uploaded:
    if len(uploaded) != 1:
        raise KeyError(
            f"Expected an upload named {csv_name!r}; got {sorted(uploaded)}"
        )
    csv_name = next(iter(uploaded))

data = pd.read_csv(io.BytesIO(uploaded[csv_name]), usecols=cols_to_use)
data.head()

In [None]:
# Hold out 30% of the rows as a test set. SalePrice is the target; every other
# loaded column is a predictor. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),
    data["SalePrice"],
    test_size=0.3,
    random_state=42,
)

# Show the (rows, columns) shape of both predictor sets.
X_train.shape, X_test.shape

In [None]:
# Fraction of missing values in each training column (0 means none missing).
X_train.isna().mean()

In [None]:
# Names of the training columns that contain at least one missing value.
vars_to_impute = X_train.columns[X_train.isna().any()].tolist()
vars_to_impute

In [None]:
# Median of each variable with missing data, as a {column: median} dictionary.
# NOTE: this is shown for reference only -- `imputation_dict` is reassigned
# with the arbitrary fill values further down, before any imputation happens.
imputation_dict = X_train.loc[:, vars_to_impute].median().to_dict()
imputation_dict

In [None]:
# Histograms of the variables with missing values, before imputation.
X_train[vars_to_impute].hist(figsize=(10, 8), bins=50)
plt.show()

In [None]:
# Observed minimum and maximum of each variable to impute -- presumably used to
# choose arbitrary fill values that lie outside the observed range.
X_train.loc[:, vars_to_impute].agg(["min", "max"])

In [None]:
# Arbitrary fill value for each variable; missing entries are replaced with
# these numbers in the imputation step.
imputation_dict = {"LotFrontage": 999, "MasVnrArea": 1999, "GarageYrBlt": 2999}

imputation_dict

In [None]:
# Replace missing data: each column named in `imputation_dict` has its NaNs
# filled with that column's arbitrary value.
# Rebind the result rather than using inplace=True: X_train / X_test were
# derived from `data` by train_test_split, and mutating such derived frames in
# place can raise pandas' SettingWithCopyWarning.
X_train = X_train.fillna(imputation_dict)
X_test = X_test.fillna(imputation_dict)

In [None]:
# Validate the replacement on the training data: count of remaining missing
# values per column.
X_train.isna().sum()

In [None]:
# Validate the replacement on the test data: count of remaining missing
# values per column.
X_test.isna().sum()

In [None]:
# Histograms of the same variables after imputation, for comparison with the
# earlier plot.
X_train[vars_to_impute].hist(figsize=(10, 8), bins=50)
plt.show()