In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [42]:
df_train = pd.read_csv('train.csv', index_col=0)

In [43]:
df_test = pd.read_csv('test.csv', index_col=0)

In [44]:
years = 2018 - df_train['YearBuilt']
years_test = 2018 - df_test['YearBuilt']

In [45]:
years = pd.DataFrame(years)
years_test = pd.DataFrame(years_test)
years = years.rename(columns={"YearBuilt":"Years"})
years_test = years_test.rename(columns={"YearBuilt":"Years"})

In [46]:
x_train = pd.concat([df_train, years], axis=1)
x_test = pd.concat([df_test, years_test], axis=1)

In [47]:
# notUsedFeatures = ['MSSubClass', 'MSZoning', 'LotFrontage', 'Stree', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl']
x_train = x_train[['LotArea', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'KitchenAbvGr', 'PoolArea', 'Years']]
x_test = x_test[['LotArea', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'KitchenAbvGr', 'PoolArea', 'Years']]

In [8]:
x_train = np.matrix(x_train)

In [10]:
y_train = df_train[['SalePrice']]
y_train = np.matrix(y_train)

In [11]:
def initialize_parameters(x):
    theta = np.random.rand(1, x.shape[1])
    theta = np.matrix(theta)
    return theta

In [12]:
def h(x, theta):
    y_predict = x * theta.T
    return y_predict

In [13]:
def cost_function(h, y):
    m = y.shape[0]
    total_loss = np.square(np.subtract(h, y)).sum(axis=0)
    cost = (1./(2. * m)) * total_loss
    cost = np.asscalar(cost)
    return cost

In [14]:
def fit(x, y, num_iterations=100, learning_rate=0.001):
    theta = initialize_parameters(x)
    m = x.shape[0]
    costs = []
    for i in range(num_iterations):
        for j in range(x.shape[1]):
            theta[0, j] = theta[0, j] - learning_rate * ((1.0/m)*((np.multiply(np.subtract(h(x, theta), y),(x[:, j]))).sum(axis = 0)))
        y_predict = h(x, theta)
        costs.append(cost_function(y_predict, y))
        if i % 1000 == 0:
            print("Cost after {} iterations: {}".format(i, costs[i]))
    return theta, costs

In [15]:
x_train.shape, y_train.shape

((1460, 7), (1460, 1))

In [16]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_norm = scaler.fit_transform(x_train)



In [17]:
x_train_norm = np.matrix(x_train_norm)
x_train_norm = np.insert(x_train_norm, 0, 1, axis = 1)

In [49]:
x_test_norm = scaler.fit_transform(x_test)
x_test_norm = np.matrix(x_test_norm)
x_test_norm = np.insert(x_test_norm, 0, 1, axis=1)

In [53]:
x_test_norm.shape

(1459, 8)

In [20]:
theta, costs = fit(x_train_norm, y_train, num_iterations=8000, learning_rate=0.0001)

Cost after 0 iterations: 19514915596.7
Cost after 1000 iterations: 15538626845.7
Cost after 2000 iterations: 12531034488.4
Cost after 3000 iterations: 10255301619.5
Cost after 4000 iterations: 8532503147.49
Cost after 5000 iterations: 7227465015.6
Cost after 6000 iterations: 6238066062.62
Cost after 7000 iterations: 5487155826.34


In [21]:
y_predict = h(x_train_norm, theta)

In [23]:
y_predict

matrix([[120408.91616069],
        [117961.02720783],
        [121113.42138165],
        ...,
        [136192.82692813],
        [116278.28388209],
        [119283.4694985 ]])

In [75]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=0.3)
clf.fit(x_train_norm, y_train)

Ridge(alpha=0.3, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [94]:
df_predict = pd.DataFrame(clf.predict(x_test_norm))

In [96]:
df_predict.insert(0, 'Id', range(1461, 2920))

In [98]:
df_predict = df_predict.rename({0: 'SalePrice'}, axis='columns')

In [99]:
df_predict

Unnamed: 0,Id,SalePrice
0,1461,84240.699029
1,1462,122146.481943
2,1463,178894.673523
3,1464,170184.788355
4,1465,145341.102917
5,1466,168551.347415
6,1467,122340.019763
7,1468,150472.139191
8,1469,160345.009726
9,1470,84371.231165


In [102]:
df_predict.to_csv('mysubmission_1.csv', index=False, header=True)

In [104]:
from sklearn.linear_model import Lasso
clf = Lasso(alpha=0.2)
clf.fit(x_train_norm, y_train)

Lasso(alpha=0.2, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [105]:
df_predict = pd.DataFrame(clf.predict(x_test_norm))

In [106]:
df_predict.insert(0, 'Id', range(1461, 2920))
df_predict = df_predict.rename({0: 'SalePrice'}, axis='columns')
df_predict.to_csv('mysubmission_2.csv', index=False, header=True)
mysubmission = pd.read_csv('mysubmission_2.csv')
mysubmission

Unnamed: 0,Id,SalePrice
0,1461,84240.699029
1,1462,122146.481943
2,1463,178894.673523
3,1464,170184.788355
4,1465,145341.102917
5,1466,168551.347415
6,1467,122340.019763
7,1468,150472.139191
8,1469,160345.009726
9,1470,84371.231165


In [None]:
from sklearn.svm import SVR
