In [40]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import timeit

In [41]:
def load_housing_data():
    return pd.read_csv("dataset/housing.csv")

In [42]:
#load data from dataset
housing=load_housing_data()
#set Index
housing=housing.reset_index()
#Data
housing.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [43]:
# Split data into training/test set using StratifiedShuffleSplit
housing["income_cat"]=np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=3,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing,housing["income_cat"]):
        strat_train_set=housing.loc[train_index]
        strat_test_set=housing.loc[test_index]



In [44]:
for set in strat_train_set,strat_test_set :
    set.drop(columns=['income_cat'])

# Data Preprocessing

In [45]:
#Extract Label attributes from Training Data

housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()
housing_num=housing.drop("ocean_proximity",axis=1)

In [46]:
#Fill Missing value with median of data

from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
housing_num=housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns)

In [47]:
#Convert categorical attribute(ocean_proximity) into numerical values using label encoding

from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
housing_cat=housing["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_tr_1=pd.DataFrame(housing_cat_encoded,columns=["ocean_proximity"])

In [48]:
#Perform scaling on Numerical Attribute

from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
scalar.fit(housing_tr)
housing_tr=scalar.transform(housing_tr)
housing_tr=pd.DataFrame(housing_tr,columns=housing_num.columns)


In [49]:
#Merge Numerical attributes and Categorical Attributes

housing_tr=housing_tr.join(housing_tr_1,lsuffix='_housing_tr', rsuffix='_housing_tr_1')
size1=housing_tr['latitude'].size
ones_matrix=np.ones(size1)
ones_matrix_1=ones_matrix.reshape((size1,1))


In [50]:
#Append 1's matrix in housing_matrix for increasing dimension

housing_matrix=housing_tr.as_matrix()
housing_matrix_1=np.append(ones_matrix_1,housing_matrix,axis=1)
X_matrix=housing_matrix_1

# Linear Regression Using Ridge Regression

In [51]:
#Linear Regression Using gradient descent

#store current time
start_time=timeit.default_timer()

Y_matrix=housing_labels.as_matrix()
theta=np.ones(X_matrix.shape[1])
print(Y_matrix)
m=len(Y_matrix)
alpha=.02
regC=0.2

#Calculate theta using gradient descent

for i in range(3000):
    cost_elem=np.dot(X_matrix,theta)-Y_matrix
    derivative=np.dot(cost_elem,X_matrix)  
    derivative=derivative+(regC*theta)
    theta=theta-2*((alpha/m)*derivative)

end_time=timeit.default_timer()

print("Theta",theta)

[122900. 243400. 245300. ... 238600. 456100. 114200.]
('Theta', array([ 2.06732415e+05,  3.33237617e+03, -8.18105560e+04, -8.68226312e+04,
        1.56142948e+04, -1.93670666e+04,  3.68937333e+04, -4.30326461e+04,
        3.04863137e+04,  6.57940884e+04,  1.20815240e+04,  8.60304606e+01]))


In [53]:
#Calculate Root_mean_square error

final_cost=np.dot(X_matrix,theta)-Y_matrix
final_cost=np.dot(final_cost,final_cost.transpose())
final_cost=np.sqrt((final_cost)/size1)
print("Training time",end_time-start_time)
print("Root Mean Squared error", final_cost)

('Training time', 0.49628710746765137)
('Root Mean Squared error', 69390.91940942075)
