In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Load the training data; "train.csv" is a relative path, so it is resolved against the current working directory.
df = pd.read_csv("train.csv")
df.head()
# House Price Advanced Regression 

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [10]:
# Keep only the three categorical features used in this walkthrough, plus the SalePrice column.
df = df[["BsmtQual", "FireplaceQu", "GarageType", "SalePrice"]]
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


In [13]:
# Number of missing values per column, smallest first (ascending is the default sort order).
df.isna().sum().sort_values()

SalePrice        0
BsmtQual        37
GarageType      81
FireplaceQu    690
dtype: int64

## 1. Frequent Category Imputation
If a feature has very few missing values, we can replace them with its most frequently occurring value (the mode).

#### Advantages:
1. Easy and fast to implement

#### Disadvantages:
1. Performs poorly when the percentage of missing values in a feature is high
2. It distorts the relationship of the most frequent label with the other variables, because that label becomes over-represented

In [15]:
# Count the non-null rows in each BsmtQual category, most frequent first.
df.groupby("BsmtQual")["BsmtQual"].count().sort_values(ascending=False)

BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

In [16]:
# Same category counts as the groupby cell above, obtained directly; value_counts() excludes NaN by default.
df["BsmtQual"].value_counts()

TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

In [20]:
# Category frequencies for GarageType, most frequent first (NaN is excluded by default).
df["GarageType"].value_counts()

Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64

In [18]:
# Category frequencies for FireplaceQu, most frequent first (NaN is excluded by default).
df["FireplaceQu"].value_counts()

Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

In [21]:
def impute_nan(df,variable):
    """Fill missing values of a column with its most frequent category.

    The imputed values are written to a new column named
    ``variable + "_cfe"``; the original column is left untouched so the two
    can be compared side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place (one column is added).
    variable : str
        Name of the column to impute.

    Returns
    -------
    None

    Notes
    -----
    If the column has no non-null values there is no "most frequent"
    category, so the new column is a plain copy of the original (still all
    NaN) rather than failing with an ``IndexError``.
    """
    # value_counts() drops NaN and sorts by frequency, so the first index
    # entry (when there is one) is the most frequent category.
    counts = df[variable].value_counts()
    if counts.empty:
        # Nothing to impute with: keep the column as-is under the new name.
        df[variable + "_cfe"] = df[variable].copy()
        return
    most_freq = counts.index[0]
    df[variable + "_cfe"] = df[variable].fillna(most_freq)
    # A new column is created here for demonstration; in a real pipeline you
    # would typically overwrite the original column instead.

In [22]:
# Add a "<column>_cfe" column (most-frequent-category imputation) for each of the three features.
for features in ["BsmtQual","FireplaceQu","GarageType"]:
    impute_nan(df,features)

In [23]:
# Preview: the "_cfe" columns sit alongside the untouched original columns.
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_cfe,FireplaceQu_cfe,GarageType_cfe
0,Gd,,Attchd,208500,Gd,Gd,Attchd
1,Gd,TA,Attchd,181500,Gd,TA,Attchd
2,Gd,TA,Attchd,223500,Gd,TA,Attchd
3,TA,Gd,Detchd,140000,TA,Gd,Detchd
4,Gd,TA,Attchd,250000,Gd,TA,Attchd


In [28]:
# Alternative way to get the most frequent category: mode() returns a Series, so take its first entry.
df["BsmtQual"].mode()[0]

'TA'

## 2. Adding a Variable to Capture NaN

#### Advantages
1. Captures the importance of missing values
2. Can be used even if the percentage of missing values in a feature is large

#### Disadvantages
1. Creates an additional feature

In [29]:
# Binary indicator column: 1 where BsmtQual is missing, 0 otherwise.
df["BsmtQual_var"] = np.where(df["BsmtQual"].isnull(),1,0)
# The missing values in BsmtQual itself can now be replaced with the most frequent value,
# while this indicator column preserves the information that the value was originally missing.

In [30]:
# Preview: BsmtQual_var is 0 in these rows because none of them has a missing BsmtQual.
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_cfe,FireplaceQu_cfe,GarageType_cfe,BsmtQual_var
0,Gd,,Attchd,208500,Gd,Gd,Attchd,0
1,Gd,TA,Attchd,181500,Gd,TA,Attchd,0
2,Gd,TA,Attchd,223500,Gd,TA,Attchd,0
3,TA,Gd,Detchd,140000,TA,Gd,Detchd,0
4,Gd,TA,Attchd,250000,Gd,TA,Attchd,0


### 3. Also, if a feature has several frequent categories, we can replace the NaN values with a new category (e.g. "Missing")

In [34]:
def impute_nan_cat(df,variable):
    """Add ``variable + "_nancat"``: the column with NaN replaced by the label "Missing".

    The original column is not modified; the frame gains one new column.
    Returns None.
    """
    source = df[variable]
    missing_mask = source.isnull()
    new_column = variable + "_nancat"
    # Where the value is missing use the literal label, otherwise keep the original value.
    df[new_column] = np.where(missing_mask, "Missing", source)

In [35]:
# FireplaceQu has by far the most missing values of the three features, so treat "missing" as its own category.
impute_nan_cat(df,"FireplaceQu")

In [36]:
# Preview: FireplaceQu_nancat shows "Missing" where FireplaceQu was NaN (row 0).
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_cfe,FireplaceQu_cfe,GarageType_cfe,BsmtQual_var,FireplaceQu_nancat
0,Gd,,Attchd,208500,Gd,Gd,Attchd,0,Missing
1,Gd,TA,Attchd,181500,Gd,TA,Attchd,0,TA
2,Gd,TA,Attchd,223500,Gd,TA,Attchd,0,TA
3,TA,Gd,Detchd,140000,TA,Gd,Detchd,0,Gd
4,Gd,TA,Attchd,250000,Gd,TA,Attchd,0,TA
