In [16]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import timeit
def load_housing_data():
    return pd.read_csv("dataset/housing.csv")

In [17]:
#load data from dataset
housing=load_housing_data()


In [18]:
#set index
housing=housing.reset_index()
#Data 
housing.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [19]:
# Split data into training/test set using StratifiedShuffleSplit
housing["income_cat"]=np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)

from sklearn.model_selection import StratifiedShuffleSplit

split=StratifiedShuffleSplit(n_splits=3,test_size=0.2,random_state=42)

for train_index,test_index in split.split(housing,housing["income_cat"]):
        strat_train_set=housing.loc[train_index]
        strat_test_set=housing.loc[test_index]


In [20]:
for set in strat_train_set,strat_test_set :
    set.drop(columns=['income_cat'])

# Data Preprocessing

In [21]:
#Extract Label attributes from Training Data

housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()
housing_num=housing.drop("ocean_proximity",axis=1)

In [22]:
from sklearn.preprocessing import Imputer

#Fill Missing value with median of data

imputer = Imputer(strategy="median")
housing_num=housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns)


In [23]:
from sklearn.preprocessing import LabelEncoder

#Convert categorical attribute(ocean_proximity) into numerical values using label encoding

encoder=LabelEncoder()
housing_cat=housing["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_tr_1=pd.DataFrame(housing_cat_encoded,columns=["ocean_proximity"])

In [24]:
#Perform scaling on Numerical Attribute

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(housing_tr)
housing_tr=scaler.transform(housing_tr)
housing_tr=pd.DataFrame(housing_tr,columns=housing_num.columns)


In [25]:

housing_tr=housing_tr.join(housing_tr_1,lsuffix='_housing_tr', rsuffix='_housing_tr_1')


# Lasso Regression

In [27]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Apply Lasso Regression using Liberary 

#store current time
start_time=timeit.default_timer()

lassoreg = Lasso(alpha=2,normalize=True, max_iter=1e5)
lassoreg.fit(housing_tr,housing_labels)

#store end time
end_time=timeit.default_timer()

y_pred = lassoreg.predict(housing_tr)
lin_mse=mean_squared_error(housing_labels,y_pred)
lin_rmse=np.sqrt(lin_mse)

print("Training Time",end_time-start_time)
print("Mean Squared Error: ",lin_mse)
print("Root mean squared Error: ",lin_rmse)


('Training Time', 0.08852410316467285)
('Mean Squared Error: ', 4819819886.332079)
('Root mean squared Error: ', 69424.92265989268)
