In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# show up the USA housing dataset

# using pandas:
df = pd.read_csv('USA_Housing.csv')
print(df.head(5))

   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0      79545.458574             5.682861                   7.009188   
1      79248.642455             6.002900                   6.730821   
2      61287.067179             5.865890                   8.512727   
3      63345.240046             7.188236                   5.586729   
4      59982.197226             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population         Price  
0                          4.09     23086.800503  1.059034e+06  
1                          3.09     40173.072174  1.505891e+06  
2                          5.13     36882.159400  1.058988e+06  
3                          3.26     34310.242831  1.260617e+06  
4                          4.23     26354.109472  6.309435e+05  


In [15]:
# using numpy:
columns = [0, 1, 2, 3, 4, 5]
raw_data = np.loadtxt('USA_Housing.csv', dtype='str', delimiter=',', usecols=columns)
header = raw_data[0]
raw_data = raw_data[1:].astype(float)
print('features: ', ', '.join(header[:-1]))
print('labels: ', header[-1])
print('total: ', len(raw_data))

features:  Avg. Area Income, Avg. Area House Age, Avg. Area Number of Rooms, Avg. Area Number of Bedrooms, Area Population
labels:  Price
total:  5000


In [17]:
# data preparation

# 1. randomize the dataset order
data = np.random.permutation(raw_data)

# 2. standardize the dataset
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(data)
data = ss.transform(data)

# 3. split data into training and testing part
ratio = 0.8
split_index = int(len(data) * ratio)
train, test = data[:split_index, :], data[split_index:, :]

X_train, y_train = train[:, :-1], train[:, -1].flatten()
X_test, y_test = test[:, :-1], test[:, -1].flatten()

print(X_train[:3], y_train[:3])

[[ 0.59845204  0.49458479 -0.50102415 -0.576436   -0.02464793]
 [ 0.79181445  0.29469692  0.67398752 -0.5116069  -0.83711922]
 [-0.45037171 -0.2999339  -0.55461532 -1.60559787 -0.3570006 ]] [ 0.1021356   0.62659557 -1.22866089]


In [19]:
# a. Analytical solution
# 1. calculating theta
# 2. compute X @ theta
# 3. evaluation with RMSE

# adding a constant column
X = np.concatenate([X_train, np.ones((len(X_train), 1))], axis=-1)

# calculate theta
theta = np.linalg.inv(X.T @ X) @ X.T @ y_train

# predict using analytical solution
x_test = np.concatenate([X_test, np.ones((len(X_test), 1))], axis=-1)
pred = x_test @ theta

# evaluate with RMSE
rmse_loss = np.sqrt(np.square(y_test - pred).mean())
print('RMSE: ', rmse_loss)

RMSE:  0.2864875860127391


In [20]:
# b. linear regression with sklearn
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

rmse_loss = np.sqrt(np.square(y_test - y_pred).mean())
print('RMSE: ', rmse_loss)

RMSE:  0.2864875860127391
