## Mean / Median imputation per group - pandas

To download the House Prices dataset, please refer to the lecture **Datasets** in **Section 2** of this course.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Columns loaded for this demo, listed in three groups:
# the variables without NA in the train set, the 3 variables
# that contain NA, and the target.

cols_to_use = (
    ["LotShape", "OverallQual", "TotalBsmtSF"]  # no NA in the train set
    + ["LotFrontage", "MasVnrArea", "GarageYrBlt"]  # contain NA
    + ["SalePrice"]  # target
)

In [3]:
# Read only the selected columns of the House Prices dataset.

data = pd.read_csv(
    filepath_or_buffer="../../houseprice.csv",
    usecols=cols_to_use,
)

data.head()

Unnamed: 0,LotFrontage,LotShape,OverallQual,MasVnrArea,TotalBsmtSF,GarageYrBlt,SalePrice
0,65.0,Reg,7,196.0,856,2003.0,208500
1,80.0,Reg,6,0.0,1262,1976.0,181500
2,68.0,IR1,7,162.0,920,2001.0,223500
3,60.0,IR1,7,0.0,756,1998.0,140000
4,84.0,IR1,8,350.0,1145,2000.0,250000


**Remember that the mean or median used to replace the NA must be calculated using the train set only, and then applied to both the train and test sets.**

In [4]:
# Hold out 30% of the rows as a test set; the fixed seed keeps
# the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),
    data.loc[:, "SalePrice"],
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((1022, 6), (438, 6))

In [5]:
# Fraction of missing values in each column of the train set.

X_train.isna().mean()

LotFrontage    0.184932
LotShape       0.000000
OverallQual    0.000000
MasVnrArea     0.004892
TotalBsmtSF    0.000000
GarageYrBlt    0.052838
dtype: float64

In [6]:
# Names of the train-set columns that contain at least one NA.

vars_to_impute = X_train.columns[X_train.isna().any()].tolist()

vars_to_impute

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [7]:
# Let's find out if the mean of each variable to impute differs
# across the LotShape categories.
# `vars_to_impute` is reused here rather than repeating the column
# names, so this check always covers the columns that contain NA.

X_train.groupby("LotShape")[vars_to_impute].mean()

Unnamed: 0_level_0,LotFrontage,MasVnrArea,GarageYrBlt
LotShape,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IR1,74.158371,131.869565,1984.081325
IR2,80.55,84.827586,1987.689655
IR3,119.2,0.0,1984.666667
Reg,67.18569,90.045526,1974.126456


In [8]:
# How many train-set rows fall in each LotShape category.

X_train.loc[:, "LotShape"].value_counts()

Reg    642
IR1    345
IR2     29
IR3      6
Name: LotShape, dtype: int64

In [9]:
# Mean of every variable to impute, computed on the train set
# separately for each LotShape category:
# {category: {variable: mean}}.

imputation_dict = {
    shape: (
        X_train.loc[X_train["LotShape"] == shape, vars_to_impute]
        .mean()
        .to_dict()
    )
    for shape in X_train["LotShape"].unique()
}

imputation_dict

{'Reg': {'LotFrontage': 67.18568994889267,
  'MasVnrArea': 90.04552590266876,
  'GarageYrBlt': 1974.126455906822},
 'IR1': {'LotFrontage': 74.15837104072398,
  'MasVnrArea': 131.8695652173913,
  'GarageYrBlt': 1984.0813253012047},
 'IR2': {'LotFrontage': 80.55,
  'MasVnrArea': 84.82758620689656,
  'GarageYrBlt': 1987.6896551724137},
 'IR3': {'LotFrontage': 119.2,
  'MasVnrArea': 0.0,
  'GarageYrBlt': 1984.6666666666667}}

To perform median imputation instead of mean imputation, replace `.mean()` with `.median()` in the previous cell; the rest of the procedure stays the same.


In [10]:
# First 10 rows of the train set before imputation (note the NA).
X_train.iloc[:10]

Unnamed: 0,LotFrontage,LotShape,OverallQual,MasVnrArea,TotalBsmtSF,GarageYrBlt
64,,Reg,7,573.0,1057,1998.0
682,,Reg,6,0.0,1291,1996.0
960,50.0,IR1,5,0.0,858,
1384,60.0,Reg,6,0.0,560,1939.0
1100,60.0,Reg,2,0.0,290,1930.0
416,74.0,Reg,6,203.0,672,1978.0
1034,50.0,Reg,5,0.0,920,1938.0
853,,IR1,6,115.0,1127,1964.0
472,35.0,Reg,6,80.0,547,2005.0
1011,75.0,Reg,5,0.0,0,


In [11]:
# Replace missing data with the per-LotShape means learned from the train set.

# Overall train means, captured before the imputation below. They are the
# fallback for values the per-group step cannot fill: a LotShape category
# that has no entry in `imputation_dict`, or a group mean that is itself NA.
fallback_values = X_train[vars_to_impute].mean()

for shape, group_values in imputation_dict.items():
    # The same train-derived values are applied to both sets.
    for df in (X_train, X_test):
        in_group = df["LotShape"] == shape
        if not in_group.any():
            continue
        # Write back only the imputed columns instead of overwriting whole rows.
        df.loc[in_group, vars_to_impute] = df.loc[in_group, vars_to_impute].fillna(
            group_values
        )

# Fill whatever the per-group step left missing (nothing, when every row's
# category has a complete set of group means).
for df in (X_train, X_test):
    df[vars_to_impute] = df[vars_to_impute].fillna(fallback_values)

In [12]:
# Same 10 rows as before: the NA should now hold the group means.

X_train.iloc[:10]

Unnamed: 0,LotFrontage,LotShape,OverallQual,MasVnrArea,TotalBsmtSF,GarageYrBlt
64,67.18569,Reg,7,573.0,1057,1998.0
682,67.18569,Reg,6,0.0,1291,1996.0
960,50.0,IR1,5,0.0,858,1984.081325
1384,60.0,Reg,6,0.0,560,1939.0
1100,60.0,Reg,2,0.0,290,1930.0
416,74.0,Reg,6,203.0,672,1978.0
1034,50.0,Reg,5,0.0,920,1938.0
853,74.158371,IR1,6,115.0,1127,1964.0
472,35.0,Reg,6,80.0,547,2005.0
1011,75.0,Reg,5,0.0,0,1974.126456


In [13]:
# Count the NA left in each column of the test set after imputation.

X_test.isna().sum()

LotFrontage    0
LotShape       0
OverallQual    0
MasVnrArea     0
TotalBsmtSF    0
GarageYrBlt    0
dtype: int64