In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import train_test_split


In [2]:
# load data
data = pd.read_csv(r"E:\AI\LibrariesForAI\Supervised_Learning\10_Lasso_Ridge\Melbourne_housing_FULL.csv")
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [3]:
data.shape

(34857, 21)

In [4]:
data.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [5]:
# let's use limited columns which makes more sense for serving our purpose
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 
               'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']
data = data[cols_to_use]
data.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [6]:
data.shape

(34857, 15)

In [7]:
data.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [8]:
data['Regionname'] = data['Regionname'].fillna(data['Regionname'].mode()[0])
data['CouncilArea'] = data['CouncilArea'].fillna(data['CouncilArea'].mode()[0])
data['Propertycount'] = data['Propertycount'].fillna(data['Propertycount'].median())
data['Distance'] = data['Distance'].fillna(data['Distance'].median())
data['Bedroom2'] = data['Bedroom2'].fillna(data['Bedroom2'].mode()[0])
data['Bathroom'] = data['Bathroom'].fillna(data['Bathroom'].mode()[0])
data['Car'] = data['Car'].fillna(data['Car'].mode()[0])
data['Landsize'] = data['Landsize'].fillna(data['Landsize'].median())
data['BuildingArea'] = data['BuildingArea'].fillna(data['BuildingArea'].median())


In [9]:
data.isna().sum()


Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          0
Propertycount       0
Distance            0
CouncilArea         0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            7610
dtype: int64

In [10]:
data.dropna(inplace=True)                                                                         

In [11]:
data.shape

(27247, 15)

In [12]:
data.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,136.0,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0
5,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,94.0,136.0,850000.0
6,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,1.0,2.0,120.0,142.0,1600000.0


In [13]:
data = pd.get_dummies(data,drop_first=True)
data.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,136.0,1480000.0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,0,...,0,0,0,0,0,0,0,0,1,0
5,3,4019.0,2.5,3.0,2.0,1.0,94.0,136.0,850000.0,0,...,0,0,0,0,0,0,0,0,1,0
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
X=data.drop('Price',axis=1)
y=data['Price']

In [15]:
# Split the X and Y

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=2)


In [16]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [17]:
# Linear Regression

lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

LinearRegression()

In [18]:
lin_reg.predict(X_test)

array([1677268.17753601, 1051939.94999695,  909473.57939148, ...,
        765347.93437195, 1161992.73905945,  471363.90751648])

In [19]:
trainAccuracy = lin_reg.score(X_train,y_train)
testAccuracy =lin_reg.score(X_test,y_test)

In [20]:
print(f"Train Accuarcy : {trainAccuracy}")
print(f"Test Accuarcy : {testAccuracy}")

Train Accuarcy : 0.6834667479492478
Test Accuarcy : -261222485097.0673


In [21]:
X_train.shape

(19072, 745)

In [22]:
# alpha=[0.1,0.01,0.001,1,5,10,50,100]

# gridsearchcv

# output -- best Alpha value


In [23]:
#### Using Ridge (L2 Regularized) Regression Model
ridge_reg=Ridge(alpha=0.1)
ridge_reg.fit(X_train,y_train)


Ridge(alpha=0.1)

In [24]:
# Alpha 50
trainAccuracy = ridge_reg.score(X_train,y_train)
testAccuracy =ridge_reg.score(X_test,y_test)
print(f"Train Accuarcy : {trainAccuracy}")
print(f"Test Accuarcy : {testAccuracy}")

Train Accuarcy : 0.683369069118993
Test Accuarcy : 0.6618733144481624


In [25]:
# Alpha 10
trainAccuracy = ridge_reg.score(X_train,y_train)
testAccuracy =ridge_reg.score(X_test,y_test)
print(f"Train Accuarcy : {trainAccuracy}")
print(f"Test Accuarcy : {testAccuracy}")

Train Accuarcy : 0.683369069118993
Test Accuarcy : 0.6618733144481624


In [26]:
# Alpha 0.1
trainAccuracy = ridge_reg.score(X_train,y_train)
testAccuracy =ridge_reg.score(X_test,y_test)
print(f"Train Accuarcy : {trainAccuracy}")
print(f"Test Accuarcy : {testAccuracy}")

Train Accuarcy : 0.683369069118993
Test Accuarcy : 0.6618733144481624


In [27]:
#### Using Lasso (L1 Regularized) Regression Model

lasso_reg = Lasso(alpha=10)
lasso_reg.fit(X_train, y_train)

trainAccuracy = lasso_reg.score(X_train,y_train)
testAccuracy =lasso_reg.score(X_test,y_test)
print(f"Train Accuarcy : {trainAccuracy}")
print(f"Test Accuarcy : {testAccuracy}")

KeyboardInterrupt: 