In [17]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import timeit

In [18]:
#load data from dataset
def load_housing_data():
    return pd.read_csv("dataset/housing.csv")

In [19]:
housing=load_housing_data()
#set Index
housing=housing.reset_index()
#Data
housing.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [20]:
# Split data into training/test set using StratifiedShuffleSplit

housing["income_cat"]=np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=3,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing,housing["income_cat"]):
        strat_train_set=housing.loc[train_index]
        strat_test_set=housing.loc[test_index]



In [21]:
for set in strat_train_set,strat_test_set :
    set.drop(columns=['income_cat'])

# Data Processing

In [22]:
#Extract Label attributes from Training Data
housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()
housing_num=housing.drop("ocean_proximity",axis=1)

In [23]:
#Fill Missing value with median of data

from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
housing_num=housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns)


In [24]:
#Convert categorical attribute(ocean_proximity) into numerical values using label encoding

from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
housing_cat=housing["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_tr_1=pd.DataFrame(housing_cat_encoded,columns=["ocean_proximity"])

In [25]:
#Perform scaling on Numerical Attribute

from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
print(scalar.fit(housing_tr))
housing_tr=scalar.transform(housing_tr)
housing_tr=pd.DataFrame(housing_tr,columns=housing_num.columns)
print(housing_tr)

StandardScaler(copy=True, with_mean=True, with_std=True)
          index  longitude  latitude  housing_median_age  total_rooms  \
0     -0.527959   0.684384 -0.738029            1.464498    -0.626342   
1      0.739448   1.258295 -1.323678           -0.288400    -0.059509   
2     -0.098373  -1.087254  0.545714           -1.005495     0.556133   
3      0.160388   0.779204 -0.859844           -0.846141     0.481538   
4      0.071108   0.824119 -0.948863            0.667726    -0.251063   
5     -0.104425  -1.032358  0.494177           -0.368078    -0.019449   
6     -0.033136   0.819129 -0.789566           -0.129046     0.889971   
7      0.991147  -0.877651  1.084511            0.588049    -1.194096   
8     -0.438679   0.714328 -0.775511            0.428695    -0.413147   
9      1.721025  -1.107216  1.435901            0.189663    -0.039709   
10     0.231678   0.784195 -0.827048           -0.686787    -0.152063   
11     1.085808  -1.456553  0.967381            1.145790     0.0127

In [26]:
#Merge Numerical attributes and Categorical Attributes

housing_tr=housing_tr.join(housing_tr_1,lsuffix='_housing_tr', rsuffix='_housing_tr_1')
size1=housing_tr['latitude'].size
ones_matrix=np.ones(size1)
ones_matrix_1=ones_matrix.reshape((size1,1))




In [27]:
#Append 1's matrix in housing_matrix for increasing dimension

housing_matrix=housing_tr.as_matrix()
housing_matrix_1=np.append(ones_matrix_1,housing_matrix,axis=1)
X_matrix=housing_matrix_1

# Linear Regression using Newton's method

In [28]:
# Calculate Hassian Metrix

#store current time
start_time=timeit.default_timer()

Y_matrix=housing_labels.as_matrix()
theta=np.ones(X_matrix.shape[1])
print(Y_matrix)
m=len(Y_matrix)
alpha=.02
XSquare=np.dot(X_matrix.transpose(),X_matrix)
cost=np.dot(X_matrix,theta)-Y_matrix
derivative=np.dot(cost,X_matrix)          
Xinv=np.linalg.pinv(XSquare)

#Find Theta

theta=theta-(np.dot(Xinv,derivative))

print('Theta',theta)

#store end time
end_time=timeit.default_timer()


[122900. 243400. 245300. ... 238600. 456100. 114200.]
('Theta', array([ 2.06754720e+05,  3.32103286e+03, -8.20957767e+04, -8.70912409e+04,
        1.55981238e+04, -1.93356258e+04,  3.75649359e+04, -4.28945222e+04,
        2.96522829e+04,  6.57816938e+04,  1.20757907e+04,  6.95412246e+01]))


In [30]:
final_cost=np.dot(X_matrix,theta)-Y_matrix
final_cost=np.dot(final_cost,final_cost.transpose())
final_cost=np.sqrt((final_cost)/size1)
print("Root Mean Squared Error",final_cost)
print("Training Time",end_time-start_time)

('Root Mean Squared Error', 69390.70256827217)
('Training Time', 0.02007579803466797)
