# Data Preprocessing Pipeline

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
def data_preprocessing_pipeline(data):
    """Impute, de-outlier and standardise a DataFrame, returning a new frame.

    Steps, in order:

    1. Numeric columns (``float``/``int`` dtypes): fill missing values with
       the column mean.
    2. Numeric columns: replace values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
       with the column mean.
    3. Numeric columns: standardise with ``StandardScaler``.
    4. Categorical columns (``object`` dtype): fill missing values with the
       most frequent value of the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A processed copy with the same columns as ``data``.
    """
    # Work on a copy so the caller's frame keeps its raw values and the
    # calling cell gives the same result when it is re-run.
    data = data.copy()

    # Identify categorical and numeric features.
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    # Skip the numeric stage entirely when there is nothing to process:
    # the scaler cannot be fitted on a selection with zero columns.
    if len(numeric_features) > 0:
        # Handle missing values in numeric features.
        data[numeric_features] = data[numeric_features].fillna(
            data[numeric_features].mean()
        )

        # Detect and handle outliers in numeric features using the IQR rule.
        for feature in numeric_features:
            column = data[feature]
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - (1.5 * iqr)
            upper_bound = q3 + (1.5 * iqr)
            is_outlier = (column < lower_bound) | (column > upper_bound)
            # The replacement mean is taken over the whole column,
            # i.e. it still includes the outliers being replaced.
            data[feature] = np.where(is_outlier, column.mean(), column)

        # Normalize numeric features; a single fit_transform both fits the
        # scaler and produces the scaled values.
        scaler = StandardScaler()
        data[numeric_features] = scaler.fit_transform(data[numeric_features])

    # Handle missing values in categorical features.
    if len(categorical_features) > 0:
        modes = data[categorical_features].mode()
        # mode() has no rows when every categorical column is entirely
        # missing; there is then nothing to fill with.
        if not modes.empty:
            data[categorical_features] = data[categorical_features].fillna(
                modes.iloc[0]
            )

    return data

In [3]:
# Read the raw table; the path is relative to the notebook's working directory.
data = pd.read_csv('melb_data.csv')

In [4]:
# Remove columns that are not used further on.
# The labels are spelled exactly as they appear in the CSV header.
data.drop(
    columns=['Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
    inplace=True,
)

In [5]:
# Caption followed by a preview of the first rows before preprocessing.
print('Residency Chart')
data.head(n=20)

Residency Chart


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra
5,Abbotsford,129 Charles St,2,h,941000.0,S,Jellis,7/05/2016,2.5,3067.0,2.0,1.0,0.0,181.0,,,Yarra
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,7/05/2016,2.5,3067.0,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra
7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,8/10/2016,2.5,3067.0,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra
8,Abbotsford,6/241 Nicholson St,1,u,300000.0,S,Biggin,8/10/2016,2.5,3067.0,1.0,1.0,1.0,0.0,,,Yarra
9,Abbotsford,10 Valiant St,2,h,1097000.0,S,Biggin,8/10/2016,2.5,3067.0,3.0,1.0,2.0,220.0,75.0,1900.0,Yarra


In [6]:
# Run the preprocessing pipeline on the loaded frame.
cleaned_data = data.pipe(data_preprocessing_pipeline)

In [10]:
# Caption followed by a preview of the first rows after preprocessing.
print('After Preprocessing Data')
cleaned_data.head(n=20)

After Preprocessing Data


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,85 Turner St,-1.03477,h,1.109108,S,Biggin,3/12/2016,-1.569191,-0.514306,-1.017322,-0.810449,-0.677116,-0.760039,0.383399,-0.082431,Yarra
1,Abbotsford,25 Bloomburg St,-1.03477,h,0.108352,S,Biggin,4/02/2016,-1.569191,-0.514306,-1.017322,-0.810449,-2.093574,-0.921361,-3.000815,-0.082431,Yarra
2,Abbotsford,5 Charles St,0.218131,h,1.075374,SP,Biggin,4/03/2017,-1.569191,-0.514306,0.237907,0.796158,-2.093574,-0.998515,0.29214,-0.082431,Yarra
3,Abbotsford,40 Federation La,0.218131,h,-0.307692,PI,Biggin,4/03/2017,-1.569191,-0.514306,0.237907,0.796158,-0.677116,-1.138795,0.383399,-0.082431,Yarra
4,Abbotsford,55a Park St,1.471032,h,1.378974,VB,Nelson,4/06/2016,-1.569191,-0.514306,0.237907,-0.810449,0.739342,-1.047613,-0.078897,-0.082431,Yarra
5,Abbotsford,129 Charles St,-1.03477,h,-0.103043,S,Jellis,7/05/2016,-1.569191,-0.514306,-1.017322,-0.810449,-2.093574,-0.833686,0.383399,-0.082431,Yarra
6,Abbotsford,124 Yarra St,0.218131,h,1.999667,S,Nelson,7/05/2016,-1.569191,-0.514306,1.493136,0.796158,-2.093574,-0.609238,0.383399,-0.082431,Yarra
7,Abbotsford,98 Charles St,-1.03477,h,1.459934,S,Nelson,8/10/2016,-1.569191,-0.514306,-1.017322,-0.810449,0.739342,-0.570661,-1.702185,-0.082431,Yarra
8,Abbotsford,6/241 Nicholson St,-2.287671,u,-1.54458,S,Biggin,8/10/2016,-1.569191,-0.514306,-2.272552,-0.810449,-0.677116,-1.468453,0.383399,-0.082431,Yarra
9,Abbotsford,10 Valiant St,-1.03477,h,0.247783,S,Biggin,8/10/2016,-1.569191,-0.514306,0.237907,-0.810449,0.739342,-0.696913,0.383399,-0.082431,Yarra
