In [1]:
# import libraries
import os
from pathlib import Path

import matplotlib.pyplot as ply
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split


In [2]:
# Suppress Warnings for clean notebook
import warnings
warnings.filterwarnings('ignore')

In [3]:
# read dataset
# The CSV location is configurable so the notebook is not tied to one machine:
# set the MELBOURNE_HOUSING_CSV environment variable, or place
# Melbourne_housing_FULL.csv in the current working directory.
DATA_PATH = Path(os.environ.get("MELBOURNE_HOUSING_CSV", "Melbourne_housing_FULL.csv"))
if not DATA_PATH.is_file():
    # Fail early with an actionable message instead of a bare path error.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Set the MELBOURNE_HOUSING_CSV "
        "environment variable to the location of Melbourne_housing_FULL.csv."
    )
dataset = pd.read_csv(DATA_PATH)
# Preview the first five rows to confirm the file parsed as expected.
print(dataset.head())

       Suburb             Address  Rooms Type      Price Method SellerG  \
0  Abbotsford       68 Studley St      2    h        NaN     SS  Jellis   
1  Abbotsford        85 Turner St      2    h  1480000.0      S  Biggin   
2  Abbotsford     25 Bloomburg St      2    h  1035000.0      S  Biggin   
3  Abbotsford  18/659 Victoria St      3    u        NaN     VB  Rounds   
4  Abbotsford        5 Charles St      3    h  1465000.0     SP  Biggin   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/09/2016       2.5    3067.0  ...       1.0  1.0     126.0           NaN   
1  3/12/2016       2.5    3067.0  ...       1.0  1.0     202.0           NaN   
2  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
3  4/02/2016       2.5    3067.0  ...       2.0  1.0       0.0           NaN   
4  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   

   YearBuilt         CouncilArea Lattitude  Longtitude             R

In [4]:
# DataFrame.nunique() reports how many distinct values each column holds
# (NaN entries are not counted by default).
print(dataset.nunique(axis=0))

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64


In [5]:
# Discard the columns that are not used as model features.
# errors='ignore' skips any listed name that is already absent, so the cell can be re-run safely.
columns_to_drop = ['Address', 'Date', 'Postcode', 'YearBuilt', 'Longtitude', 'Lattitude']
dataset = dataset.drop(columns=columns_to_drop, errors='ignore')


In [6]:
# List the column labels that remain after the drop in the previous cell
print(dataset.columns)

Index(['Suburb', 'Rooms', 'Type', 'Price', 'Method', 'SellerG', 'Distance',
       'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
       'CouncilArea', 'Regionname', 'Propertycount'],
      dtype='object')


In [7]:
# Count the missing (NaN) entries in every column
dataset.isnull().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        3
dtype: int64

In [8]:
# Handle missing values: replace NaN with 0 in the listed numeric columns
cols_to_fill_zero = [
    'Propertycount',
    'Distance',
    'Bedroom2',
    'Bathroom',
    'Car',
]
dataset[cols_to_fill_zero] = dataset[cols_to_fill_zero].fillna(value=0)
# Re-count the NaNs to confirm those columns are now complete
dataset.isnull().sum()


Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             0
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        0
dtype: int64

In [9]:
# Mean-impute the two continuous area columns (Series.mean() skips NaN entries).
# NOTE(review): the means are taken over the full dataset, i.e. before the
# train/test split further down, so test rows influence the imputed values.
dataset['Landsize'] = dataset['Landsize'].fillna(value=dataset['Landsize'].mean())
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(value=dataset['BuildingArea'].mean())
dataset.isnull().sum()


Suburb              0
Rooms               0
Type                0
Price            7610
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
CouncilArea         3
Regionname          3
Propertycount       0
dtype: int64

In [10]:
# Price is the prediction target, so rows without it are dropped rather than imputed.
# dropna() with no subset removes rows missing ANY column, so the few rows still
# lacking CouncilArea/Regionname are removed here as well.
dataset = dataset.dropna(axis=0, how='any')
dataset.isnull().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      0
Regionname       0
Propertycount    0
dtype: int64

In [11]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each feature so the dummy columns are not perfectly collinear.
dataset = pd.get_dummies(data=dataset, drop_first=True)
dataset.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,1480000.0,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
5,3,850000.0,2.5,3.0,2.0,1.0,94.0,160.2564,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,4019.0,False,...,False,True,False,False,True,False,False,False,False,False


In [12]:
# Separate the target (Price) from the feature matrix
y = dataset["Price"]
X = dataset.drop(columns=["Price"])

In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Baseline: ordinary least-squares linear regression fitted on the training split
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg = reg.fit(X=train_X, y=train_y)  # fit() returns the estimator itself

In [15]:
# R² of the baseline model on the held-out test split
reg.score(X=test_X, y=test_y)

0.6710114344346452

In [16]:
# R² of the baseline model on the data it was trained on
reg.score(X=train_X, y=train_y)

0.6746078592844227

In [17]:
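# Training R² (~0.675) and test R² (~0.671) are nearly identical, so the plain linear model is not overfitting; both scores are modest, though, which points to underfitting rather than high variance.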

In [18]:
# Lasso (L1-regularized) regression model
# NOTE(review): max_iter=100 with tol=0.1 is a loose stopping rule, and the global
# warnings filter set earlier would hide any warning this fit emits.
from sklearn.linear_model import Lasso
lasso_reg = Lasso(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
lasso_reg.fit(X=train_X, y=train_y)

In [19]:
# R² of the Lasso model on the held-out test split
lasso_reg.score(X=test_X, y=test_y)

0.6803288342232796

In [20]:
# R² of the Lasso model on the data it was trained on
lasso_reg.score(X=train_X, y=train_y)

0.6702768387033115

In [21]:
# Ridge (L2-regularized) regression model, using the same alpha as the Lasso cell
from sklearn.linear_model import Ridge
ridge_reg = Ridge(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
ridge_reg.fit(X=train_X, y=train_y)

In [22]:
# R² of the Ridge model on the held-out test split
ridge_reg.score(X=test_X, y=test_y)

0.6718534090692831

In [23]:
# R² of the Ridge model on the data it was trained on
ridge_reg.score(X=train_X, y=train_y)

0.6568699146148917

In [24]:
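# Lasso and Ridge regularization are most beneficial when a plain Linear Regression model overfits.
# Here the baseline did not overfit (train R² 0.675 vs. test R² 0.671), so the change in test score is small
# (Lasso 0.680, Ridge 0.672 vs. 0.671 for the baseline); the difference can be far more pronounced when the baseline overfits.
# L1 and L2 regularization are also used in neural networks.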