In [1]:
import pandas as pd

import warnings
# Silence every warning for the rest of the notebook session.
# NOTE(review): a blanket "ignore" would also hide pandas chained-assignment
# warnings and any scikit-learn convergence warnings raised by later cells —
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read the full Melbourne housing CSV from the working directory and preview it.
DATA_PATH = "Melbourne_housing_FULL.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [3]:
# (rows, columns) of the raw dataset
df.shape

(34857, 21)

### DATA PREPROCESSING

In [4]:
# Keep only the columns used for modelling: the predictors plus the target 'Price'.
selected_columns = [
    'Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname',
    'Propertycount', 'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom',
    'Car', 'Landsize', 'BuildingArea', 'Price',
]
# .copy() gives df2 its own data, so the column assignments made in later
# cells modify df2 itself rather than a slice of df (this is what triggers
# pandas' SettingWithCopyWarning when the copy is omitted).
df2 = df[selected_columns].copy()

df2.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


Looks pretty good now!

In [5]:
# Count the missing (NaN) values in each column of df2
df2.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

Let's fill 'Propertycount', 'Distance', 'Bedroom2', 'Bathroom' and 'Car' with zeros.

In [6]:
# Columns whose missing entries are replaced with 0.
# NOTE(review): 0 is not a natural value for 'Distance' or 'Propertycount';
# confirm that zero-filling them is intended.
fill_values = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
# Fill from df2 itself rather than from the raw df, so the result reflects
# df2's current contents and does not rely on df staying aligned with it.
df2[fill_values] = df2[fill_values].fillna(0)

# Confirm the filled columns no longer contain nulls
df2.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        0
Distance             0
CouncilArea          3
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

Let's fill 'Landsize' and 'BuildingArea' with either the **mean** or the **median**.

In [7]:
# Average land size over the non-missing rows
df2['Landsize'].mean()

593.598993361392

In [8]:
# Median land size over the non-missing rows
df2['Landsize'].median()

521.0

In [9]:
# Average building area over the non-missing rows
df2['BuildingArea'].mean()

160.2564003565711

In [10]:
# Median building area over the non-missing rows
df2['BuildingArea'].median()

136.0

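Let's pick the mean because it is the larger of the two values. (Note: a mean well above the median suggests a right-skewed distribution, in which case the median is often the more robust choice.)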

In [11]:
# Replace the missing areas with each column's own mean, then re-check the null counts.
for area_col in ('Landsize', 'BuildingArea'):
    df2[area_col] = df2[area_col].fillna(df2[area_col].mean())

df2.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          3
Propertycount       0
Distance            0
CouncilArea         3
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            7610
dtype: int64

In [12]:
# (rows, columns) after imputation — no rows have been removed yet
df2.shape

(34857, 15)

Now let's drop the rows that still contain null values (mostly rows missing `Price`, the target); we won't need them.

In [13]:
# Discard every row that still has at least one missing value, then verify none remain.
final_df = df2.dropna(axis=0, how='any')

final_df.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

uffff finally! data cleaning is such a mess...

In [14]:
# (rows, columns) after dropping the rows with nulls
final_df.shape

(27244, 15)

- df -> (34857, 21) Original dataset with all columns and rows.
- df2 -> (34857, 15) dataset with only important columns
- final_df -> (27244, 15) dataset after removing null value rows


Now, let's check which columns contain text.

In [15]:
# Names of the text (object-dtype) columns that still need encoding
final_df.select_dtypes(include='object').columns

Index(['Suburb', 'Type', 'Method', 'SellerG', 'Regionname', 'CouncilArea'], dtype='object')

### OneHotEncoding

In [16]:
# One-hot encode every text column, dropping the first level of each one.
# dtype is pinned to uint8 so the dummies are numeric 0/1 on every pandas
# version (newer releases default to bool, older ones to uint8).
final_df = pd.get_dummies(final_df, drop_first=True, dtype='uint8')

final_df.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,0,...,0,0,0,0,0,0,0,0,1,0
5,3,4019.0,2.5,3.0,2.0,1.0,94.0,160.2564,850000.0,0,...,0,0,0,0,0,0,0,0,1,0
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,0,...,0,0,0,0,0,0,0,0,1,0


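Important: **drop_first** is used because one dummy column per categorical feature should be dropped after one-hot encoding; otherwise the dummy columns are perfectly collinear (the "dummy variable trap").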

### TRAINING

In [17]:
# Target: the sale price. Features: every remaining column.
y = final_df['Price']
X = final_df.drop(columns=['Price'])

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [19]:
from sklearn.linear_model import LinearRegression

# Baseline model: plain linear regression (no regularization) fitted on the training split.
lr = LinearRegression()
lr.fit(X_train,y_train)

In [20]:
# Score on the training data (for scikit-learn regressors, .score is R^2, not accuracy)
lr.score(X_train, y_train)

0.6827792395792723

In [21]:
# Score on the held-out testing data (R^2); compare with the training score above
lr.score(X_test, y_test)

0.13853683161455688

#### Here we can see the problem of "overfitting".
- The training score is **68%** but the test score is only **13.85%**, which is very low.
- The model performs well on the training data but not on the testing/unseen data.

### L1 Regularization (Lasso Regression Model)

In [24]:
from sklearn.linear_model import Lasso

# L1-regularized linear regression fitted on the same training split.
# NOTE(review): alpha=50 is applied to unscaled features, and max_iter=100 with
# tol=0.1 is a loose stopping rule — confirm these values were chosen deliberately.
lasso_reg = Lasso(alpha=50, max_iter=100, tol=0.1)
lasso_reg.fit(X_train, y_train)

In [25]:
# Lasso score on the training data (R^2)
lasso_reg.score(X_train, y_train)

0.6766985624766824

In [26]:
# Lasso score on the held-out testing data (R^2)
lasso_reg.score(X_test, y_test)

0.6636111369404488

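As we can see, L1 regularization sharply improved the score on the testing data (0.14 → 0.66), while the training score stayed almost the same (0.683 → 0.677, slightly lower). The gap between the two scores has nearly closed, which means the overfitting has been reduced.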

### L2 Regularization (Ridge Regression Model)

In [28]:
from sklearn.linear_model import Ridge

# L2-regularized linear regression fitted on the same training split.
# NOTE(review): max_iter and tol only matter for Ridge's iterative solvers; with the
# default solver they may have no effect here — confirm against the scikit-learn docs.
ridge_reg = Ridge(alpha=50, max_iter=100, tol=0.1)
ridge_reg.fit(X_train, y_train)

In [29]:
# Ridge score on the training data (R^2)
ridge_reg.score(X_train, y_train)

0.6622376739684328

In [30]:
# Ridge score on the held-out testing data (R^2)
ridge_reg.score(X_test, y_test)

0.6670848945194959

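**As we can see, both L1 and L2 regularization greatly improved the score on the testing data (from 0.14 to about 0.66–0.67) while leaving the training score roughly unchanged (slightly lower), i.e. they reduced overfitting. L1 and L2 regularization are also used in neural networks.**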