In [35]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import timeit

In [36]:
#load data from dataset
def load_housing_data():
    return pd.read_csv("dataset/housing.csv")

In [37]:
housing=load_housing_data()
#set Index
housing=housing.reset_index()
#Data
housing.head()


Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [38]:
# Split data into training/test set using StratifiedShuffleSplit

housing["income_cat"]=np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)

from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=3,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing,housing["income_cat"]):
        strat_train_set=housing.loc[train_index]
        strat_test_set=housing.loc[test_index]


In [39]:
for set in strat_train_set,strat_test_set :
    set.drop(columns=['income_cat'])

# Data Processing

In [40]:
#Extract Label attributes from Training Data

housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()
housing_num=housing.drop("ocean_proximity",axis=1)

In [41]:
from sklearn.preprocessing import Imputer

#Fill Missing value with median of data

imputer = Imputer(strategy="median")
housing_num=housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns)

In [42]:
from sklearn.preprocessing import LabelEncoder

#Convert categorical attribute(ocean_proximity) into numerical values using label encoding

encoder=LabelEncoder()
housing_cat=housing["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_tr_1=pd.DataFrame(housing_cat_encoded,columns=["ocean_proximity"])

In [43]:
#Perform scaling on Numerical Attribute

from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
scalar.fit(housing_tr)
housing_tr=scalar.transform(housing_tr)
housing_tr=pd.DataFrame(housing_tr,columns=housing_num.columns)
print(housing_tr.head())

      index  longitude  latitude  housing_median_age  total_rooms  \
0 -0.527959   0.684384 -0.738029            1.464498    -0.626342   
1  0.739448   1.258295 -1.323678           -0.288400    -0.059509   
2 -0.098373  -1.087254  0.545714           -1.005495     0.556133   
3  0.160388   0.779204 -0.859844           -0.846141     0.481538   
4  0.071108   0.824119 -0.948863            0.667726    -0.251063   

   total_bedrooms  population  households  median_income  income_cat  
0       -0.650005   -0.204092   -0.625833      -0.753784   -0.954456  
1       -0.456875   -0.314209   -0.390613       0.985693    0.942051  
2        0.198812    0.325518    0.291527       0.269602   -0.006202  
3        0.375251    0.343871    0.398683      -0.113819   -0.006202  
4       -0.063463   -0.658543   -0.286070       0.650443    0.942051  


In [44]:
#Merge Numerical attributes and Categorical Attributes

housing_tr=housing_tr.join(housing_tr_1,lsuffix='_housing_tr', rsuffix='_housing_tr_1')
size1=housing_tr['latitude'].size
ones_matrix=np.ones(size1)
ones_matrix_1=ones_matrix.reshape((size1,1))


In [45]:
#Append 1's matrix in housing_matrix for increasing dimension

housing_matrix=housing_tr.as_matrix()
housing_matrix_1=np.append(ones_matrix_1,housing_matrix,axis=1)
X_matrix=housing_matrix_1

# Linear Regression Using Gradient Descent

In [46]:
#Linear Regression Using gradient descent

#store current time
start_time=timeit.default_timer()

Y_matrix=housing_labels.as_matrix()
theta=np.ones(X_matrix.shape[1])

m=len(Y_matrix)
alpha=.02

#Calculate theta using gradient descent

for i in range(3000):
    cost=np.dot(X_matrix,theta)-Y_matrix
    derivative=np.dot(cost,X_matrix)         
    theta=theta-2*((alpha/m)*derivative)
    
#end time
end_time=timeit.default_timer()
print("Theta",theta)

('Theta', array([ 2.06738167e+05,  3.33214674e+03, -8.18289003e+04, -8.68401239e+04,
        1.56126920e+04, -1.93680266e+04,  3.68992889e+04, -4.30378700e+04,
        3.04862395e+04,  6.57973110e+04,  1.20770370e+04,  8.32257926e+01]))


In [47]:
#Calculate Root_mean_square error

final_cost=np.dot(X_matrix,theta)-Y_matrix
final_cost=np.dot(final_cost,final_cost.transpose())
final_cost=np.sqrt((final_cost)/size1)
print("Training Time",end_time-start_time)
print("Root Mean Squared Error",final_cost)

('Training Time', 0.4388449192047119)
('Root Mean Squared Error', 69390.91295953286)
