## House price prediction without using the scikit-learn library ##

### Single feature linear regression ###

In [467]:
import numpy as np
import pandas as pd

In [810]:
# Load the "House Price Prediction" dataset by EMRE ARSLAN (Kaggle).
# The relative path means the CSV is read from the current working directory.
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [811]:
# Feature scaling (this is not regularization): dividing by 100 expresses the
# living area in hundreds of square feet, which keeps gradients small.
X = df['sqft_living'] / 100
X.head()

0    13.4
1    36.5
2    19.3
3    20.0
4    19.4
Name: sqft_living, dtype: float64

In [812]:
# Target scaling (this is not regularization): the price is expressed in units
# of 100,000 so that costs and gradients stay in a small numeric range.
y = df['price'] / 100000
y.head()

0     3.13
1    23.84
2     3.42
3     4.20
4     5.50
Name: price, dtype: float64

In [813]:
def split(x, y, rate, random_state=42):
    """Shuffle ``x`` and ``y`` together and cut them into train / test / CV parts.

    Parameters
    ----------
    x, y : pandas.Series or pandas.DataFrame
        Features and targets with one row per sample. Both are shuffled with
        the same seed; that yields the same row permutation only when they have
        the same number of rows, so the lengths are checked up front.
    rate : float
        Fraction of the rows (between 0 and 1) assigned to the training part.
        The remaining rows are divided between test and cross-validation; when
        the remainder is odd, cross-validation receives the extra row.
    random_state : int, optional
        Seed used for the shuffle. Defaults to 42.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, x_test, x_cv, y_train, y_test, y_cv)``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length or ``rate`` is outside ``[0, 1]``.
    """
    m = x.shape[0]
    if y.shape[0] != m:
        raise ValueError(
            f"x and y must have the same number of rows, got {m} and {y.shape[0]}"
        )
    if not 0 <= rate <= 1:
        raise ValueError(f"rate must be between 0 and 1, got {rate}")

    # Integer boundaries: the cross-validation part always runs to the last
    # row, so no sample is lost to float truncation.
    train_end = int(m * rate)
    test_end = train_end + (m - train_end) // 2

    # Shuffle the data (same seed and same length => same permutation for x and y)
    x = x.sample(frac=1, random_state=random_state).reset_index(drop=True)
    y = y.sample(frac=1, random_state=random_state).reset_index(drop=True)

    x_train, x_test, x_cv = x.iloc[:train_end], x.iloc[train_end:test_end], x.iloc[test_end:]
    y_train, y_test, y_cv = y.iloc[:train_end], y.iloc[train_end:test_end], y.iloc[test_end:]
    return (
        np.array(x_train), np.array(x_test), np.array(x_cv),
        np.array(y_train), np.array(y_test), np.array(y_cv),
    )

In [814]:
# rate=0.8 -> 80% of the rows for training; split() divides the rest between test and cross-validation.
x_train, x_test, x_cv, y_train, y_test, y_cv = split(X, y, 0.8)

In [815]:
def cost(x, y, w, b, lambd=0):
    """Regularized half mean squared error of the line ``w * x + b``.

    Computes ``(sum((w*x + b - y)**2) + lambd * w**2) / (2 * m)`` where ``m``
    is the number of samples.

    Parameters
    ----------
    x, y : array-like
        Feature values and targets, one entry per sample. ``y`` must hold
        exactly as many values as ``x``.
    w, b : float
        Slope and intercept of the model.
    lambd : float, optional
        L2 regularization strength applied to ``w`` only. Defaults to 0.

    Returns
    -------
    numpy.float64
        The cost value.

    Raises
    ------
    ValueError
        If ``x`` is empty or ``y`` does not have the same number of values.
    """
    x = np.asarray(x, dtype=float)
    # Forcing y into x's shape turns a silent (m, 1)-vs-(m,) broadcast into
    # either the intended element-wise difference or an explicit error.
    y = np.asarray(y, dtype=float).reshape(x.shape)
    m = x.shape[0]
    if m == 0:
        raise ValueError("cost() needs at least one sample")

    residuals = w * x + b - y
    squared_error = np.sum(residuals ** 2)
    penalty = lambd * np.sum(np.square(w))  # L2 regularization
    return (squared_error + penalty) / (2 * m)

In [816]:
# Sanity check at the starting point w=0, b=0. With w=0 the L2 term is zero, so lambd=0.1 does not change the value here.
cost(x_train, y_train, 0, 0, 0.1)

33.16027510626682

In [817]:
def gradient(x, y, w, b):
    """Gradient of the unregularized half mean squared error of ``w * x + b``.

    Parameters
    ----------
    x, y : array-like
        Feature values and targets, one entry per sample. ``y`` must hold
        exactly as many values as ``x``.
    w, b : float
        Slope and intercept at which the gradient is evaluated.

    Returns
    -------
    tuple
        ``(derivative_w, derivative_b)``: the partial derivatives of the cost
        with respect to ``w`` and ``b``, each averaged over the samples.

    Raises
    ------
    ValueError
        If ``x`` is empty or ``y`` does not have the same number of values.
    """
    x = np.asarray(x, dtype=float)
    # Same-shape targets avoid an accidental (m, m) broadcast.
    y = np.asarray(y, dtype=float).reshape(x.shape)
    m = x.shape[0]
    if m == 0:
        raise ValueError("gradient() needs at least one sample")

    residuals = w * x + b - y
    derivative_w = np.dot(residuals, x) / m
    derivative_b = np.sum(residuals) / m
    return derivative_w, derivative_b

In [818]:
# Sanity check: derivatives of the cost with respect to (w, b) at the starting point w=0, b=0.
gradient(x_train, y_train, 0, 0)

(-141.19872239293863, -5.528300827272399)

In [819]:
def gradient_descent(x, y, w, b, cost_function, gradient_function, alpha, num_iters, lambd=0):
    """Fit ``w`` and ``b`` with batch gradient descent, printing the cost as it goes.

    Parameters
    ----------
    x, y : sequence
        Training features and targets; ``len(x)`` is taken as the sample count.
    w, b : float
        Initial parameters.
    cost_function : callable
        Called as ``cost_function(x, y, w, b, lambd)``; used for reporting only.
    gradient_function : callable
        Called as ``gradient_function(x, y, w, b)``; must return the
        unregularized ``(derivative_w, derivative_b)``.
    alpha : float
        Learning rate.
    num_iters : int
        Number of update steps. Zero is allowed and returns the initial
        parameters after reporting their cost.
    lambd : float, optional
        L2 regularization strength. Its gradient ``(lambd / m) * w`` is added
        to the weight update here, matching a cost of the form
        ``(squared_error + lambd * w**2) / (2 * m)``. Defaults to 0.

    Returns
    -------
    tuple
        The final ``(w, b)``.
    """
    m = len(x)
    # About ten progress lines per run; at least 1 so short runs do not divide by zero.
    log_every = max(1, num_iters // 10)
    for i in range(num_iters):
        derivative_w, derivative_b = gradient_function(x, y, w, b)
        if lambd:
            # Without this term lambd would only change the printed cost, not the fit.
            derivative_w = derivative_w + (lambd / m) * w
        w = w - alpha * derivative_w
        b = b - alpha * derivative_b
        if i % log_every == 0:
            # The cost is only needed for the lines that are actually printed.
            cost = cost_function(x, y, w, b, lambd)
            print(f"Iteration {i:4}: Cost {float(cost):8.2f}   ")
    cost = cost_function(x, y, w, b, lambd)
    print(f"Iteration {num_iters:4}: Cost {float(cost):8.2f}   ")
    return w, b

In [836]:
# Re-create the split; split() shuffles with a fixed seed, so for the same X and y
# this reproduces the partition made earlier.
x_train, x_test, x_cv, y_train, y_test, y_cv = split(X, y, 0.8)
# Start from the zero model.
initial_w = 0.
initial_b = 0.
iterations = 100
alpha = 0.0004 # Learning rate. NOTE(review): described as optimal by the author, but no tuning is shown in this notebook.

In [837]:
# Fit the slope and intercept on the training split; gradient_descent prints the cost as it progresses.
w, b = gradient_descent(x_train, y_train, initial_w, initial_b, cost, gradient, alpha, iterations)

Iteration    0: Cost    26.06   
Iteration   10: Cost    15.22   
Iteration   20: Cost    15.15   
Iteration   30: Cost    15.15   
Iteration   40: Cost    15.15   
Iteration   50: Cost    15.15   
Iteration   60: Cost    15.15   
Iteration   70: Cost    15.15   
Iteration   80: Cost    15.15   
Iteration   90: Cost    15.15   
Iteration  100: Cost    15.15   


In [833]:
# Evaluate the fitted line on each split; lambd=0 reports the cost without the L2 term.
train_cost = cost(x_train, y_train, w, b, 0)
test_cost = cost(x_test, y_test, w, b, 0)
cv_cost = cost(x_cv, y_cv, w, b, 0)

print(f"Train data final cost: {train_cost}")
print(f"Test data final cost: {test_cost}")
print(f"Cross Validation data final cost: {cv_cost}")

Train data final cost: 15.151149263692789
Test data final cost: 5.062087877373735
Cross Validation data final cost: 3.227535594480098


### Multiple feature linear regression ###

In [735]:
# Show the raw columns again before choosing the inputs for the multi-feature model.
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [736]:
# Remove the columns that the multi-feature model below does not use.
df2 = df.drop(
    columns=[
        'date', 'waterfront', 'view', 'yr_built', 'yr_renovated',
        'street', 'city', 'statezip', 'country',
    ],
)
df2.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,sqft_above,sqft_basement
0,313000.0,3.0,1.5,1340,7912,1.5,3,1340,0
1,2384000.0,5.0,2.5,3650,9050,2.0,5,3370,280
2,342000.0,3.0,2.0,1930,11947,1.0,4,1930,0
3,420000.0,3.0,2.25,2000,8030,1.0,4,1000,1000
4,550000.0,4.0,2.5,1940,10500,1.0,4,1140,800


In [737]:
# Feature matrix: every remaining column except the target.
X = df2.drop(['price'], axis=1)
# Feature scaling (this is not regularization): three square-footage columns are
# expressed in thousands of square feet.
# NOTE(review): sqft_basement is not rescaled, so it stays in the hundreds/thousands
# while the other features are roughly 1-10. That is probably why the learning rate
# used later has to be so small; confirm whether leaving it out is intentional.
X[['sqft_living', 'sqft_lot', 'sqft_above']] /= 1000
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,sqft_above,sqft_basement
0,3.0,1.5,1.34,7.912,1.5,3,1.34,0
1,5.0,2.5,3.65,9.05,2.0,5,3.37,280
2,3.0,2.0,1.93,11.947,1.0,4,1.93,0
3,3.0,2.25,2.0,8.03,1.0,4,1.0,1000
4,4.0,2.5,1.94,10.5,1.0,4,1.14,800


In [738]:
# Target scaling (this is not regularization): price in units of 100,000,
# reshaped to a single column so that y is an (m, 1) DataFrame.
y = (df2['price'] / 100000).values.reshape(-1, 1)
y = pd.DataFrame(y, columns=['price'])
y.head()

Unnamed: 0,price
0,3.13
1,23.84
2,3.42
3,4.2
4,5.5


In [740]:
def cost(x, y, w, b, lambd=0):
    """Half mean squared error of the linear model ``x . w + b``.

    This definition replaces the single-feature ``cost`` defined earlier in
    the notebook, so it keeps that function's optional ``lambd`` argument and
    also accepts 1-D ``x`` with a scalar ``w``.

    Parameters
    ----------
    x : array-like, shape (m, n) or (m,)
        One row (or one value) per sample.
    y : array-like, shape (m,) or (m, 1)
        Targets.
    w : array-like of shape (n,), or float for 1-D ``x``
        Weights.
    b : float
        Bias.
    lambd : float, optional
        L2 regularization strength applied to ``w``. Defaults to 0.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar when ``y`` is 1-D; an array of shape ``(1,)`` when ``y`` is
        an ``(m, 1)`` column, which is what callers indexing ``[0]`` rely on.

    Raises
    ------
    ValueError
        If ``x`` is empty.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    m = x.shape[0]
    if m == 0:
        raise ValueError("cost() needs at least one sample")

    predictions = np.dot(x, w) + b
    if y.ndim > 1:
        # Column-vector target: compare column against column so the
        # subtraction cannot broadcast (m,) against (m, 1) into (m, m).
        residuals = predictions.reshape(m, -1) - y.reshape(m, -1)
        squared_error = np.sum(residuals ** 2, axis=0)
    else:
        residuals = predictions - y
        squared_error = np.sum(residuals ** 2)
    penalty = lambd * np.sum(np.square(w))  # L2 regularization
    return (squared_error + penalty) / (2 * m)

In [741]:
def gradient(x, y, w, b):
    """Gradient of the half mean squared error of the linear model ``x . w + b``.

    Parameters
    ----------
    x : array-like, shape (m, n) or (m,)
        One row (or one value) per sample.
    y : array-like, shape (m,) or (m, 1)
        Targets; must contain exactly ``m`` values.
    w : array-like of shape (n,), or float for 1-D ``x``
        Weights at which the gradient is evaluated.
    b : float
        Bias at which the gradient is evaluated.

    Returns
    -------
    tuple
        ``(derivative_w, derivative_b)``. ``derivative_w`` has the shape of
        ``w``. ``derivative_b`` is a scalar for 1-D ``y`` and an array of shape
        ``(1,)`` for an ``(m, 1)`` column target.

    Raises
    ------
    ValueError
        If ``x`` is empty or ``y`` does not contain ``m`` values.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    m = x.shape[0]
    if m == 0:
        raise ValueError("gradient() needs at least one sample")

    # Work on flat residuals: writing size-1 arrays into scalar slots of the
    # accumulator relies on a conversion NumPy has deprecated.
    residuals = (np.dot(x, w) + b).reshape(m) - y.reshape(m)
    derivative_w = np.dot(residuals, x) / m
    derivative_b = np.sum(residuals) / m
    if y.ndim > 1:
        # Keep the (1,) shape that a column-vector target produces.
        derivative_b = np.reshape(derivative_b, y.shape[1:])
    return derivative_w, derivative_b

In [800]:
def gradient_descent(x, y, w, b, cost_function, gradient_function, alpha, num_iters, lambd=0):
    """Fit ``w`` and ``b`` with batch gradient descent, printing the cost as it goes.

    This definition replaces the single-feature ``gradient_descent`` defined
    earlier in the notebook, so it keeps that function's optional ``lambd``
    argument and does not require ``x`` to be two-dimensional.

    Parameters
    ----------
    x, y : array-like
        Training features and targets; ``len(x)`` is taken as the sample count.
    w : numpy.ndarray or float
        Initial weights.
    b : float
        Initial bias.
    cost_function : callable
        Used for reporting only. Called as ``cost_function(x, y, w, b)``, or as
        ``cost_function(x, y, w, b, lambd)`` when ``lambd`` is non-zero.
    gradient_function : callable
        Called as ``gradient_function(x, y, w, b)``; must return the
        unregularized ``(derivative_w, derivative_b)``.
    alpha : float
        Learning rate.
    num_iters : int
        Number of update steps. Zero is allowed and returns the initial
        parameters after reporting their cost.
    lambd : float, optional
        L2 regularization strength; its gradient ``(lambd / m) * w`` is added
        to the weight update. Defaults to 0 (no regularization).

    Returns
    -------
    tuple
        The final ``(w, b)``.
    """
    m = len(x)
    # About ten progress lines per run, using an integer interval of at least 1.
    log_every = max(1, num_iters // 10)
    # lambd is only forwarded when it is in use, so four-argument cost
    # functions keep working for the default, unregularized case.
    penalty_args = (lambd,) if lambd else ()
    for i in range(num_iters):
        derivative_w, derivative_b = gradient_function(x, y, w, b)
        if lambd:
            derivative_w = derivative_w + (lambd / m) * w
        w = w - alpha * derivative_w
        b = b - alpha * derivative_b
        if i % log_every == 0:
            # np.sum collapses the one-element array returned for column targets.
            cost = np.sum(cost_function(x, y, w, b, *penalty_args))
            print(f"Iteration {i:4}: Cost {float(cost):8.2f}   ")
    cost = np.sum(cost_function(x, y, w, b, *penalty_args))
    print(f"Iteration {num_iters:4}: Cost {float(cost):8.2f}   ")
    return w, b

In [804]:
# rate=0.6 -> 60% of the rows for training; split() divides the rest between test and cross-validation.
x_train, x_test, x_cv, y_train, y_test, y_cv = split(X, y, 0.6)
initial_w = np.zeros([x_train.shape[1],])  # one weight per feature column, all starting at zero
initial_b = 0.
iterations = 1000
alpha = 0.000006 # Learning rate. NOTE(review): described as optimal by the author, but the recorded cost is still decreasing after 1000 iterations.

In [805]:
# Fit the weight vector and bias on the training split; gradient_descent prints the cost as it progresses.
w, b = gradient_descent(x_train, y_train, initial_w, initial_b, cost, gradient, alpha, iterations)

Iteration    0: Cost    35.47   
Iteration  100: Cost    27.43   
Iteration  200: Cost    27.10   
Iteration  300: Cost    26.87   
Iteration  400: Cost    26.66   
Iteration  500: Cost    26.46   
Iteration  600: Cost    26.26   
Iteration  700: Cost    26.07   
Iteration  800: Cost    25.89   
Iteration  900: Cost    25.71   
Iteration 1000: Cost    25.53   


In [808]:
# Evaluate the fitted model on each split. y is a column vector here, so cost()
# returns a one-element array and [0] extracts the number.
train_cost = cost(x_train, y_train, w, b)[0]
test_cost = cost(x_test, y_test, w, b)[0]
cv_cost = cost(x_cv, y_cv, w, b)[0]

print(f"Train data final cost: {train_cost}")
print(f"Test data final cost: {test_cost}")
print(f"Cross Validation data final cost: {cv_cost}")

Train data final cost: 25.534135254600848
Test data final cost: 11.227007536225937
Cross Validation data final cost: 11.488365751123974
