In [1]:
import tensorflow as tf

In [2]:
import numpy as np
import pandas as pd

In [3]:
# overview of the data
data = pd.read_csv("./dataset/new.csv", encoding='ISO-8859-1', low_memory=False)
data.head()

Unnamed: 0,url,id,Lng,Lat,Cid,tradeTime,DOM,followers,totalPrice,price,...,buildingType,constructionTime,renovationCondition,buildingStructure,ladderRatio,elevator,fiveYearsProperty,subway,district,communityAverage
0,https://bj.lianjia.com/chengjiao/101084782030....,101084782030,116.475489,40.01952,1111027376244,2016-08-09,1464.0,106,415.0,31680,...,1.0,2005,3,6,0.217,1.0,0.0,1.0,7,56021.0
1,https://bj.lianjia.com/chengjiao/101086012217....,101086012217,116.453917,39.881534,1111027381879,2016-07-28,903.0,126,575.0,43436,...,1.0,2004,4,6,0.667,1.0,1.0,0.0,7,71539.0
2,https://bj.lianjia.com/chengjiao/101086041636....,101086041636,116.561978,39.877145,1111040862969,2016-12-11,1271.0,48,1030.0,52021,...,4.0,2005,3,6,0.5,1.0,0.0,0.0,7,48160.0
3,https://bj.lianjia.com/chengjiao/101086406841....,101086406841,116.43801,40.076114,1111043185817,2016-09-30,965.0,138,297.5,22202,...,1.0,2008,1,6,0.273,1.0,0.0,0.0,6,51238.0
4,https://bj.lianjia.com/chengjiao/101086920653....,101086920653,116.428392,39.886229,1111027381174,2016-08-28,927.0,286,392.0,48396,...,4.0,1960,2,2,0.333,0.0,1.0,1.0,1,62588.0


In [4]:
# Returns the feature matrix and label matrix
def get_data(file, headers):
    df = pd.read_csv(file, encoding='ISO-8859-1', low_memory=False)

    # pop columns that are not needed
    pop = [0, 1, 6, 8, 9, 25]
    for i in pop:
        df.pop(headers[i])

    # clean up data
    x = df.to_numpy()
    x[:, 10] = [str[-2:] for str in x[:, 10]] # extract data from "floor" column
    x[:, 3] = [str[:4] for str in x[:, 3]] # extract year from "tradeTime" column
    m, n = x.shape
    #print(x.shape[0])


    # load labels
    y = np.loadtxt(file, delimiter=',', usecols=8, skiprows=1, encoding='ISO-8859-1')

    # delete the training examples from x is its "livingRoom" data is not numeric
    index = []
    for i in range(m):
        if not type(x[i, 6]) == int and not x[i, 6].isnumeric():
            index.append(i)

    x = np.delete(x, index, 0)
    y = np.delete(y, index, 0)

    # delete the training examples from x is its "constructionTime" data is not numeric
    index1 = []
    m, n = x.shape
    for i in range(m):
        if not x[i, 12].isnumeric() or float(x[i, 12]) < 10:
            index1.append(i)
            
    x = np.delete(x, index1, 0)
    y = np.delete(y, index1, 0)

    # convert all elements of x to float type
    x = x.astype(float)

    import math 

    # delete the training examples from x is its "buildingType" data is not numeric
    index4 = []
    for i in range(x.shape[0]):
        if math.isnan(x[i, 11]):
            index4.append(i)
            
    x = np.delete(x, index4, 0)
    y = np.delete(y, index4, 0)

    return x, y

In [13]:
from util import load_data, add_theta_0
from sklearn.preprocessing import StandardScaler

# read from csv file
with open("./dataset/new.csv", 'r', encoding='ISO-8859-1') as f:
    headers = f.readline().strip().split(',')

x_train, y_train = get_data("./dataset/new.csv", headers)
print("headers", headers)


# Prepare data for training
y_train = y_train.reshape(-1, 1)

# feature scaling and adding intercept
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_train = add_theta_0(x_train)

X = tf.constant(x_train,tf.float32,name="X")
Y = tf.constant(y_train,tf.float32,name="Y")

print(x_train.shape)


headers ['url', 'id', 'Lng', 'Lat', 'Cid', 'tradeTime', 'DOM', 'followers', 'totalPrice', 'price', 'square', 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom', 'floor', 'buildingType', 'constructionTime', 'renovationCondition', 'buildingStructure', 'ladderRatio', 'elevator', 'fiveYearsProperty', 'subway', 'district', 'communityAverage']
(297990, 21)


In [6]:
import time
# Calculate parameter theta using the normal equation
def normal():
    start = time.perf_counter()
    X_T = tf.transpose(X)
    temp = tf.linalg.inv(tf.matmul(X_T,X))
    theta = tf.matmul(tf.matmul(temp,X_T),Y)
    #print(theta)
    elapsed = time.perf_counter() - start
    return theta

In [7]:
# Fitting the linear model using normal equation
theta = normal()

print("Parameter theta:", theta)
y_pred = tf.matmul(X, theta, name = "predictions")
error = y_pred - y_train
mse = tf.reduce_mean(tf.square(error), name = "mse")
print("Mse:", mse)

Parameter theta: tf.Tensor(
[[ 3.4918195e+02]
 [-2.4476831e+01]
 [-9.6867847e+00]
 [-1.0601074e+00]
 [ 9.6233177e+01]
 [ 6.7982512e+00]
 [ 1.3045802e+02]
 [-3.5065803e-01]
 [ 1.8096834e+00]
 [ 6.2409091e+00]
 [ 7.3082557e+00]
 [ 5.4321294e+00]
 [ 1.6037466e+01]
 [-3.7142811e+01]
 [-9.2561808e+00]
 [ 2.1460632e+01]
 [-4.7634259e-02]
 [ 2.2885075e+01]
 [-4.2841258e+00]
 [ 3.2152931e+01]
 [ 2.0267225e+01]], shape=(21, 1), dtype=float32)
Mse: tf.Tensor(22242.082, shape=(), dtype=float32)


In [8]:
# Prepare data for training through gradient descent
n_epoch = 100000
learning_rate = 0.5

X = tf.constant(x_train, dtype = tf.float32, name = "Scaled Features")
theta = tf.Variable((tf.random.uniform([21, 1], -1.0, 1.0)), name = "theta")
y_train = y_train.reshape(-1, 1)

# calculates the mean squared error (mse)
def mse(y_pred, X, theta):
    y_pred = tf.matmul(X, theta, name = "predictions")
    error = y_pred - y_train
    mse = tf.reduce_mean(tf.square(error), name = "mse")
    return mse

In [9]:
# Performing gradient descent
import time
start = time.perf_counter()

m = X.shape[0]

for epoch in range(n_epoch):
    theta_old = theta
    theta = theta + learning_rate / (epoch / 200 + 1) * tf.matmul(tf.transpose(X),(y_train - tf.matmul(X, theta))) / m
  
    if (np.linalg.norm(theta_old - theta) < 0.00001):
        print(theta)
        print("Epoch: ", epoch, "MSE: ", mse(y_pred, X, theta))
        break
    if epoch % 100 == 0:
        print("Epoch: ", epoch, "MSE: ", mse(y_pred, X, theta))


elapsed = time.perf_counter() - start
print('Converged in %.3f seconds.' % elapsed)

Epoch:  0 MSE:  tf.Tensor(61979.96, shape=(), dtype=float32)
Epoch:  100 MSE:  tf.Tensor(22242.088, shape=(), dtype=float32)
Epoch:  200 MSE:  tf.Tensor(22242.082, shape=(), dtype=float32)
tf.Tensor(
[[ 3.4917813e+02]
 [-2.4476881e+01]
 [-9.6868668e+00]
 [-1.0601144e+00]
 [ 9.6232864e+01]
 [ 6.7983642e+00]
 [ 1.3045776e+02]
 [-3.5029683e-01]
 [ 1.8097998e+00]
 [ 6.2409143e+00]
 [ 7.3082104e+00]
 [ 5.4320192e+00]
 [ 1.6037273e+01]
 [-3.7142662e+01]
 [-9.2557487e+00]
 [ 2.1460537e+01]
 [-4.7631659e-02]
 [ 2.2885010e+01]
 [-4.2841315e+00]
 [ 3.2152958e+01]
 [ 2.0267550e+01]], shape=(21, 1), dtype=float32)
Epoch:  256 MSE:  tf.Tensor(22242.082, shape=(), dtype=float32)
Converged in 4.115 seconds.


In [10]:
# Training using "LinearRegression" from sklearn library
from sklearn.linear_model import LinearRegression

x = x_train[:, 1:]
y = y_train
lin_reg = LinearRegression()
lin_reg.fit(x, y)

pred = lin_reg.predict(x)
mse = (pred - y).T.dot(pred - y) / y.shape[0]
print("mse:", mse[0][0])
print("Fitted Parameters:", lin_reg.coef_[0])
print("Intercept:", lin_reg.intercept_[0])

mse: 22242.0821052123
Fitted Parameters: [-2.44768881e+01 -9.68685755e+00 -1.06010890e+00  9.62328578e+01
  6.79837585e+00  1.30458135e+02 -3.50506079e-01  1.80973851e+00
  6.24092443e+00  7.30811050e+00  5.43196088e+00  1.60372575e+01
 -3.71427551e+01 -9.25573528e+00  2.14605608e+01 -4.76318403e-02
  2.28849956e+01 -4.28415295e+00  3.21529560e+01  2.02675482e+01]
Intercept: 349.17813584963136


In [11]:
# Performing 10-fold cross validation using sklearn library
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lin_reg, x, y, cv=10, scoring='neg_mean_squared_error')
print(scores)
print("mean:", -np.mean(scores))

[-33881.40091096 -42569.47319146 -44622.13078846 -36809.47728364
  -8102.61137508 -14618.32702782 -23925.96115811 -11775.84606723
 -19506.66689499 -22261.73458781]
mean: 25807.36292855777


In [12]:
# Performing stratified 10-fold cross validation using sklearn library
# the word "stratified" means each sampling is based on the shape of the overall distribution
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 10)

# Perform training and testing on 1 fold of the data
def training(x_train, y_train, x_test, y_test, fold_no):
    linReg = LinearRegression()
    linReg.fit(x_train, y_train)
    
    # calculate mse
    pred = linReg.predict(x_test)
    m = y_test.shape
    mse = (pred - y_test).T.dot(pred - y_test) / m[0]
    print("mse:", mse[0][0])
    return mse[0][0]
    
fold_no = 1
sum = 0
y = np.round(y)

# Split the data according to the sahpe of the data and perform training and testing on each sample
for train_index,test_index in skf.split(x, y):
    x_train = x[train_index, :]
    x_test = x[test_index, :]
    y_train = y[train_index]
    y_test = y[test_index]
    sum += training(x_train, y_train, x_test, y_test, fold_no)
    fold_no += 1

print("Mean mse:", sum / (fold_no - 1))

mse: 24732.411834301
mse: 22865.11756303002
mse: 23106.70534762041
mse: 23865.856226561853
mse: 21745.48862163844
mse: 22552.810575896394
mse: 21568.692861615436
mse: 21045.652095742305
mse: 23191.992408824488
mse: 25251.960686672708
Mean mse: 22992.668822190306
