In [104]:
# Import libraries
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline


In [105]:
class LinearRegression(object):
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` has run, ``coefficients`` holds the intercept at index 0
    followed by one weight per feature column of the training matrix.
    """

    def __init__(self):
        # Empty until fit() is called; afterwards an ndarray with n + 1 rows.
        self.coefficients = []

    def fit(self, X, y):
        """Estimate the intercept and feature weights by least squares.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Feature matrix, one row per data point.
        y : array-like of shape (m,) or (m, 1)
            Target values, one per data point.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``y`` does not have one entry
            per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (m, n)")
        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError("y must contain one target per row of X")

        # Prepend a column of ones so the first coefficient acts as the bias.
        ones = np.ones((X.shape[0], 1))
        design = np.concatenate((ones, X), axis=1)

        # Solve the least-squares problem directly instead of forming
        # inv(X^T X): the explicit inverse fails on collinear features and is
        # numerically fragile, whereas lstsq returns the minimum-norm solution.
        self.coefficients = np.linalg.lstsq(design, y, rcond=None)[0]

    def predict(self, X):
        """Predict the target for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Feature matrix with the same number of columns used in ``fit``.

        Returns
        -------
        list
            One prediction per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or its column count differs from
            the number of fitted feature weights.
        """
        coefficients = np.asarray(self.coefficients, dtype=float)
        b0 = coefficients[0]
        betas = coefficients[1:]

        X = np.asarray(X, dtype=float)
        if X.ndim == 1 and X.size == 0:
            # predict([]) has no rows to score.
            return []
        if X.ndim != 2 or X.shape[1] != betas.shape[0]:
            raise ValueError(
                "X must have shape (m, %d) to match the fitted model"
                % betas.shape[0]
            )

        # Building a fresh array here (rather than accumulating into b0 in
        # place) keeps the stored coefficients untouched when y was fitted as
        # a column vector and b0 is therefore a view into them.
        return list(X.dot(betas) + b0)

In [106]:
def normalize(data, mean=None, std=None):
    """Standardize ``data`` by subtracting a mean and dividing by a std.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or numpy.ndarray
        Values to standardize.
    mean, std : optional
        Statistics to standardize with. When omitted they are taken from
        ``data.mean()`` and ``data.std()`` (per column for a DataFrame).
        Passing the statistics of another data set, e.g. the training split,
        applies that data set's scaling to ``data``.

    Returns
    -------
    Same type as ``data``
        ``(data - mean) / std``; entries whose std is exactly zero are
        divided by 1 instead, so constant columns become 0 rather than NaN.
    """
    if mean is None:
        mean = data.mean()
    if std is None:
        std = data.std()

    # A constant column has zero spread; dividing by it would yield NaN/inf
    # and contaminate every downstream computation.
    if np.ndim(std) == 0:
        if std == 0:
            std = 1.0
    else:
        std = std.copy()  # do not modify statistics owned by the caller
        std[std == 0] = 1.0

    return (data - mean) / std

In [107]:
# r2_score function
def r2_score(y_true, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - SS_res / SS_total``, where ``SS_res`` is the sum of
    squared residuals and ``SS_total`` is the sum of squared deviations of
    ``y_true`` from its mean.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, one per observed value. Both inputs are flattened,
        so a column vector and a flat sequence of the same length are
        compared element by element.

    Returns
    -------
    float
        The R^2 score. When ``y_true`` is constant the ratio is undefined;
        1.0 is returned for a perfect prediction and 0.0 otherwise.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Flatten first so that (m,) and (m, 1) inputs cannot broadcast to (m, m).
    y_true = np.ravel(np.asarray(y_true, dtype=float))
    y_pred = np.ravel(np.asarray(y_pred, dtype=float))
    if y_true.size == 0:
        raise ValueError("r2_score requires at least one sample")
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same length")

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_total = np.sum((y_true - y_true.mean()) ** 2)

    if ss_total == 0:
        return 1.0 if ss_res == 0 else 0.0
    return float(1.0 - ss_res / ss_total)

In [108]:
# Create the regression model; its coefficients start out empty until fit() runs.
model = LinearRegression()

In [109]:
# Load the training split and standardize it with normalize().
# NOTE(review): the whole frame is normalized, so the 'price_usd' target that is
# extracted from it afterwards is standardized too, not in its original units.
df_train = pd.read_csv('./expedia-personalized-sort/data/test3_data_train.csv')
df_train = normalize(df_train)

In [110]:
# Separate the target column from the features; .values converts each pandas
# object into a plain NumPy ndarray for the matrix computations.
y_train = df_train.loc[:, 'price_usd'].values
X_train = df_train.drop('price_usd', axis=1).values

In [111]:
# Load the test split and standardize it with normalize().
# NOTE(review): normalize() is called on the test frame alone, so the test data
# is scaled with its own mean/std rather than the training split's statistics;
# confirm this is intended, since the model is fit on training-scaled values.
df_test = pd.read_csv('./expedia-personalized-sort/data/test3_data_test.csv')
df_test = normalize(df_test)

In [112]:
# Separate the target column from the features; .values converts each pandas
# object into a plain NumPy ndarray for the matrix computations.
y_test = df_test.loc[:, 'price_usd'].values
X_test = df_test.drop('price_usd', axis=1).values

In [113]:
# Fit the regression on the training features and target.
model.fit(X_train, y_train)

In [114]:
# Inspect the fitted coefficients: the intercept first, then one weight per feature column.
model.coefficients

array([-3.80893234e-14,  5.19648515e-02, -1.93406806e-02,  3.87062820e-02,
        1.44404765e-02,  5.61138118e-01, -4.70851965e-01,  4.23016350e-02,
        9.64694814e-03])

In [115]:
# Predict the target for every row of the test features; predict() returns a list.
prediction = model.predict(X_test)

In [116]:
# Show the test targets next to the model's predictions, one row per sample.
pd.DataFrame(
    dict(y_true=y_test, y_pred=np.asarray(prediction).ravel())
)

Unnamed: 0,y_true,y_pred
0,-0.944395,-1.651917
1,0.685837,0.587923
2,0.412423,0.601989
3,-0.038711,-0.099773
4,0.494447,0.338677
...,...,...
9995,0.221033,0.048948
9996,0.275715,0.258437
9997,-0.742752,-0.339488
9998,-0.742752,-0.388686


In [117]:
# Score the predictions against the test targets with R^2.
r2_score(y_test, prediction)

0.5449102444363592
