In [20]:
def root_mean_squared_error(y, yp):
    """Return the root of the mean squared *relative* error between y and yp.

    Despite the name, each error is scaled by the actual value before
    squaring, i.e. this is a root-mean-squared-percentage-error:

        sqrt(mean(((y_i - yp_i) / y_i) ** 2))

    Args:
        y: Iterable of actual values. Every value must be non-zero because
            it is used as the divisor of its own error term.
        yp: Iterable of predicted values, same length as ``y``.

    Returns:
        The root mean squared relative error as a float.

    Raises:
        ValueError: If ``y`` and ``yp`` differ in length or are empty.
        ZeroDivisionError: If an actual value is a Python number equal to 0.
    """
    # Materialise both inputs so their lengths can be compared; a bare zip()
    # would silently drop the unmatched tail of the longer one.
    actual = list(y)
    predicted = list(yp)
    if len(actual) != len(predicted):
        raise ValueError(
            "y and yp must have the same length, got {} and {}".format(
                len(actual), len(predicted)))
    if not actual:
        raise ValueError("y and yp must not be empty")

    # Squared relative error for every (actual, predicted) pair.
    squared_error = [((yi - ypi) / yi) ** 2 for yi, ypi in zip(actual, predicted)]
    # Mean of all those errors.
    mean_squared_error = sum(squared_error) / len(squared_error)
    # Take the root to 'undo' the original squaring.
    return mean_squared_error ** 0.5

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Load the training and test sets from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Replace every missing value with 0 in both frames.
train = train.fillna(0)
test = test.fillna(0)

# Features are picked by column position; the target is the 'price' column.
# NOTE(review): this assumes train.columns[3:] and test.columns[2:] select the
# same feature columns in the same order, and that 'price' is not among them.
# Confirm against the CSV headers.
X_train = train[train.columns[3:]]
y_train = train['price']

X_test = test[test.columns[2:]]

# Training the Model
model = LinearRegression()
K = 10  # Number of random train/validation splits
SEED = 42  # Base seed so every run of this cell produces the same splits
total_score = 0.

# Repeated random hold-out validation: each round draws a fresh split (using
# train_test_split's default proportions), refits the model on the training
# part and scores it on the held-out part.
for i in range(K):
    # A distinct but deterministic seed per round keeps the rounds different
    # from each other while making the printed scores reproducible.
    X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
        X_train, y_train, random_state=SEED + i)
    model.fit(X_train_cv, y_train_cv)
    prediction = model.predict(X_test_cv)
    score = root_mean_squared_error(y_test_cv, prediction)
    print("Round {}: {}".format(i, score))
    total_score += score
# Average score over all K rounds.
print("Overall: {}".format(total_score / K))

Round 0: 0.3742323545581018
Round 1: 0.3618372759114513
Round 2: 0.3654510083801555
Round 3: 0.3530962146563929
Round 4: 0.35659340256377464
Round 5: 0.34491837097914446
Round 6: 0.3663324964159435
Round 7: 0.3416807539540106
Round 8: 0.3521028689970093
Round 9: 0.35224584142361254
Overall: 0.3568490587839596


In [26]:
# Refit on the complete training set, then predict prices for the test set.
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Build the output table: one 'price' column indexed by the test set's 'id',
# written to CSV in the working directory.
pred_df = pd.DataFrame({'price': preds, 'id': test['id']})
pred_df = pred_df.set_index('id')
pred_df.to_csv('linear-model.csv')