In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the training data; the first CSV column is used as the row index.
df_train = pd.read_csv(
    'train.csv',
    index_col=0,
)

In [3]:
# Load the test data; the first CSV column is used as the row index.
df_test = pd.read_csv(
    'test.csv',
    index_col=0,
)

In [4]:
# 2018 minus the 'YearBuilt' value of each row (rsub(2018) computes 2018 - series).
# The resulting Series keeps the name 'YearBuilt'.
years = df_train['YearBuilt'].rsub(2018)

In [5]:
# Wrap the values in a one-column DataFrame and rename that column
# from 'YearBuilt' to 'Years'.
years = pd.DataFrame(years).rename(columns={"YearBuilt": "Years"})

In [6]:
# Append the 'Years' column to the training frame (column-wise concat,
# rows are aligned on the index).
x_train = pd.concat(
    objs=[df_train, years],
    axis=1,
)

In [7]:
# Features not used (kept for reference only, nothing reads this list):
# notUsedFeatures = [
#     'MSSubClass', 'MSZoning', 'LotFrontage', 'Street', 'Alley', 'LotShape',
#     'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
#     'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
#     'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
# ]

# Keep only the seven columns that are fed to the model, in this order.
x_train = x_train[[
    'LotArea',
    '1stFlrSF',
    '2ndFlrSF',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'PoolArea',
    'Years',
]]

In [8]:
# Convert the selected features to an np.matrix (copying the data) so that
# `*` between operands means matrix multiplication further down.
x_train = np.matrix(data=x_train, copy=True)

In [9]:
# Preview the feature matrix (one row per training example, one column per selected feature).
x_train

matrix([[ 8450,   856,   854, ...,     1,     0,    15],
        [ 9600,  1262,     0, ...,     1,     0,    42],
        [11250,   920,   866, ...,     1,     0,    17],
        ...,
        [ 9042,  1188,  1152, ...,     1,     0,    77],
        [ 9717,  1078,     0, ...,     1,     0,    68],
        [ 9937,  1256,     0, ...,     1,     0,    53]])

In [64]:
# Target values as a single-column np.matrix; the double brackets select a
# one-column DataFrame so the result is 2-D rather than a flat vector.
y_train = np.matrix(df_train[['SalePrice']])

In [65]:
def initialize_parameters(x):
    """Create a random initial parameter row vector for `x`.

    Parameters
    ----------
    x : 2-D array-like exposing ``.shape``
        Only ``x.shape[1]`` (the number of columns) is read.

    Returns
    -------
    np.matrix of shape ``(1, x.shape[1])``
        Values drawn by ``np.random.rand``, i.e. uniformly from [0, 1).
        The global NumPy random state is used, so results vary between
        calls unless the caller seeds it.
    """
    n_columns = x.shape[1]
    return np.matrix(np.random.rand(1, n_columns))

In [66]:
def h(x, theta):
    """Linear hypothesis: multiply `x` by the transpose of `theta`.

    The product is written with ``*``, so it is a matrix product only when
    the operands are ``np.matrix`` objects (as they are in this notebook);
    for plain ndarrays ``*`` would be element-wise.

    Parameters
    ----------
    x : np.matrix, shape (m, n)
    theta : np.matrix, shape (1, n)

    Returns
    -------
    np.matrix, shape (m, 1)
        One prediction per row of `x`.
    """
    theta_column = theta.T
    return x * theta_column

In [67]:
def cost_function(h, y):
    """Half mean squared error between predictions and targets.

    Computes ``sum((h - y) ** 2) / (2 * m)`` where ``m = y.shape[0]``.

    Parameters
    ----------
    h : array-like
        Predicted values. Note that this parameter name shadows the
        module-level hypothesis function ``h`` inside this function.
    y : array-like
        Target values, same shape as `h`. The squared errors are summed
        along axis 0, so a single-column input yields a single number.

    Returns
    -------
    float
        The cost as a plain Python scalar.

    Raises
    ------
    ValueError
        If the column-wise sum does not reduce to exactly one element
        (for example when `y` has more than one column).
    """
    m = y.shape[0]
    residuals = np.subtract(h, y)
    total_loss = np.square(residuals).sum(axis=0)
    cost = (1. / (2. * m)) * total_loss
    # np.asscalar was deprecated in NumPy 1.16 and removed in 1.23;
    # ndarray.item() is the supported way to unwrap a one-element array.
    return np.asarray(cost).item()

In [80]:
def fit(x, y, num_iterations=100, learning_rate=0.001, print_every=1000):
    """Fit linear-regression parameters with batch gradient descent.

    Each iteration computes the predictions once, derives the gradient of
    the half-MSE cost for all parameters from those same predictions, and
    then updates every parameter together (a simultaneous update).

    Parameters
    ----------
    x : np.matrix, shape (m, n)
        Feature matrix, one row per example. Include a column of ones if an
        intercept is wanted; this function does not add one.
    y : np.matrix, shape (m, 1)
        Target values.
    num_iterations : int
        Number of gradient-descent steps.
    learning_rate : float
        Step size applied to the gradient.
    print_every : int
        Print the cost every `print_every` iterations (starting with
        iteration 0). A falsy value (0 or None) disables printing.

    Returns
    -------
    theta : np.matrix, shape (1, n)
        Parameters after the last step.
    costs : list of float
        Cost measured after each step, ``len(costs) == num_iterations``.
    """
    theta = initialize_parameters(x)
    m = x.shape[0]
    costs = []
    for i in range(num_iterations):
        # Residuals are computed once per iteration so that every component
        # of theta is updated from the same predictions.
        error = np.subtract(h(x, theta), y)
        # (1, m) . (m, n) -> (1, n): entry j is mean(error * x[:, j]).
        gradient = (1.0 / m) * np.dot(error.T, x)
        theta = theta - learning_rate * gradient
        costs.append(cost_function(h(x, theta), y))
        if print_every and i % print_every == 0:
            print("Cost after {} iterations: {}".format(i, costs[i]))
    return theta, costs

In [69]:
# Sanity check: features and targets must have the same number of rows.
x_train.shape, y_train.shape

((1460, 7), (1460, 1))

In [70]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with MinMaxScaler (default range [0, 1]).
# x_train is an np.matrix, which scikit-learn 1.2+ rejects with a TypeError,
# so it is handed over as a plain ndarray; older versions accept this too.
scaler = MinMaxScaler()
x_train_norm = scaler.fit_transform(np.asarray(x_train))

In [72]:
# Prepend a column of ones so that theta[0, 0] acts as the intercept term.
x_train_norm = np.matrix(x_train_norm)
# Guard on the column count: without it, re-running this cell would stack
# another column of ones in front of the one that is already there.
if x_train_norm.shape[1] == x_train.shape[1]:
    x_train_norm = np.insert(x_train_norm, 0, 1, axis=1)

In [74]:
# The column count should be one more than x_train's because of the prepended column of ones.
x_train_norm.shape

(1460, 8)

In [81]:
# Train on the scaled features (bias column included) against the sale prices.
theta, costs = fit(
    x=x_train_norm,
    y=y_train,
    num_iterations=8000,
    learning_rate=0.0001,
)

Cost after 0 iterations: 19514893843.4
Cost after 1000 iterations: 15538610751.1
Cost after 2000 iterations: 12531022664.6
Cost after 3000 iterations: 10255293018.1
Cost after 4000 iterations: 8532496976.47
Cost after 5000 iterations: 7227460676.62
Cost after 6000 iterations: 6238063103.68
Cost after 7000 iterations: 5487153906.03


In [76]:
# Predictions for the training rows. Run this after the training cell so
# that `theta` holds the parameters returned by fit().
y_predict = h(x=x_train_norm, theta=theta)

In [84]:
# Preview the scaled feature matrix; the first column is the prepended column of ones.
x_train_norm

matrix([[1.        , 0.0334198 , 0.11977972, ..., 0.33333333, 0.        ,
         0.05072464],
        [1.        , 0.03879502, 0.21294172, ..., 0.33333333, 0.        ,
         0.24637681],
        [1.        , 0.04650728, 0.13446535, ..., 0.33333333, 0.        ,
         0.06521739],
        ...,
        [1.        , 0.03618687, 0.19596145, ..., 0.33333333, 0.        ,
         0.5       ],
        [1.        , 0.03934189, 0.17072051, ..., 0.33333333, 0.        ,
         0.43478261],
        [1.        , 0.04037019, 0.21156494, ..., 0.33333333, 0.        ,
         0.32608696]])

In [79]:
# Show the parameter row vector; its first entry multiplies the column of ones in x_train_norm.
theta

matrix([[87421.45282849,  4520.96156582, 20014.38133703, 18457.07864576,
         32114.25241255, 29726.17165041,   627.132638  , 16847.15537853]])