In [23]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import timeit

In [24]:
#load data from dataset
def load_housing_data():
    return pd.read_csv("dataset/housing.csv")

In [25]:
housing=load_housing_data()
#set Index
housing=housing.reset_index()
#Data
housing.head()


Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [26]:
# Split data into training/test set using StratifiedShuffleSplit

housing["income_cat"]=np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)

from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=3,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing,housing["income_cat"]):
        strat_train_set=housing.loc[train_index]
        strat_test_set=housing.loc[test_index]


In [27]:
for set in strat_train_set,strat_test_set :
    set.drop(columns=['income_cat'])

# Data Preprocessing

In [28]:
#Extract Label attributes from Training Data

housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()
housing_num=housing.drop("ocean_proximity",axis=1)

In [29]:
from sklearn.preprocessing import Imputer

#Fill Missing value with median of data

imputer = Imputer(strategy="median")
housing_num=housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns)

In [30]:
from sklearn.preprocessing import LabelEncoder

#Convert categorical attribute(ocean_proximity) into numerical values using label encoding

encoder=LabelEncoder()
housing_cat=housing["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_tr_1=pd.DataFrame(housing_cat_encoded,columns=["ocean_proximity"])

In [31]:
#Merge Numerical attributes and Categorical Attributes

housing_tr.join(housing_tr_1,lsuffix='_housing_tr', rsuffix='_housing_tr_1')
size1=housing_tr['latitude'].size
ones_matrix=np.ones(size1)
ones_matrix_1=ones_matrix.reshape((size1,1))


In [32]:
#Append 1's matrix in housing_matrix for increasing dimension

housing_matrix=housing_tr.as_matrix()
housing_matrix_1=np.append(ones_matrix_1,housing_matrix,axis=1)
X_matrix=housing_matrix_1

# Linear Regression using Closed form

In [33]:
#hosung_matrix_1 is X_matrix which contains trianing data and Y_matrix contains label attributes

#Linear Regression Closed Form

#store current time
start_time=timeit.default_timer()

housing_matrix_1=X_matrix
Y_matrix=housing_labels.as_matrix()
housing_transpose=housing_matrix_1.transpose()
housing_product=np.dot(housing_transpose,housing_matrix_1)

#Calculate Sudo inverse

housing_inverse=np.linalg.pinv(housing_product)
into_XTranspose=np.dot(housing_inverse,housing_transpose)
into_Y=np.dot(into_XTranspose,Y_matrix)

#store end time
end_time=timeit.default_timer()


In [35]:
#Use Theta to calculate root_min_square_error

W=into_Y

print("Theta:",W)

productXW=np.dot(housing_matrix_1,W)
minusXWY=productXW-Y_matrix
cost=np.dot(minusXWY.transpose(),minusXWY)
cost=np.sqrt(cost/size1)

print("Root Mean Squared Error:",cost)
print("Training time ",end_time-start_time)

('Theta:', array([-3.46114066e+06,  5.60010740e-01, -4.10074814e+04, -4.08286771e+04,
        1.24324794e+03, -8.89131442e+00,  8.95779857e+01, -3.75089782e+01,
        7.74822946e+01,  3.46304082e+04,  1.14404436e+04]))
('Root Mean Squared Error:', 69390.76219781501)
('Training time ', 0.002814054489135742)
