In [1]:
import csv
import os
import matplotlib.pyplot as plt
import random 
import numpy as np
from sklearn import linear_model

Family + Freedom => Happiness Score

In [2]:
def loadData(fileName, inputVariabName1, inputVariabName2, outputVariabName):
    data = []
    dataNames = []
    with open(fileName) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                dataNames = row
            else:
                data.append(row)
            line_count += 1
    selectedInput1 = dataNames.index(inputVariabName1)
    selectedInput2 = dataNames.index(inputVariabName2)
    selectedOutput = dataNames.index(outputVariabName)
    inputs = []
    outputs = []
    for i in range(len(data)):
      # if data[i][selectedVariable] == '' or data[i][selectedOutput] == '':
      #   continue
      inputs.append([float(data[i][selectedInput1]), float(data[i][selectedInput2])])
      outputs.append(float(data[i][selectedOutput]))
    
    return inputs, outputs

version = 'v2'
crtDir =  os.getcwd()
filePath = os.path.join(crtDir, 'data', version + '_world-happiness-report-2017.csv')

inputs, outputs = loadData(filePath, 'Family', 'Freedom', 'Score')
# split data into training data (80%) and testing data (20%)
indexes = [i for i in range(len(inputs))]

np.random.seed(5)
trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace = False)
# trainSample = random.sample(indexes, int(0.8 * len(inputs)))
validationSample = [i for i in indexes  if not i in trainSample]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
validationInputs = [inputs[i] for i in validationSample]
validationOutputs = [outputs[i] for i in validationSample]

training with sklearn

In [12]:
xx = [[el[0], el[1]] for el in trainInputs]
regressor = linear_model.LinearRegression()
regressor.fit(xx, trainOutputs)
computedValidationOutputs = regressor.predict([[x[0], x[1]] for x in validationInputs])

training manually

In [9]:
class ManualLinearBivariateRegression:
    def __init__(self):
        self.w0 = 0.0
        self.w1 = 0.0
        self.intercept = 0.0
    
    @staticmethod
    def transpose(X):
        return [[X[j][i] for j in range(len(X))] for i in range(len(X[0]))]

    @staticmethod
    def inverse_2_2(matrix):
        a, b, c, d = matrix[0][0], matrix[0][1], matrix[1][0], matrix[1][1]
        det = a * d - b * c
        return [[d / det, -b / det], [-c / det, a / det]]

    @staticmethod
    def dot_product_1(xt, X):
        xtx = [[0, 0], [0, 0]]
        for i in range(len(xt)):
            for k in range(len(X[0])):
                xtx[i][k] = sum([xt[i][j] * X[j][k] for j in range(len(xt[i]))])

        return xtx
    
    @staticmethod
    def dot_product_2(xtx_inverse, xt):
        xtx_inverse_xt = [[0] * len(xt[0]), [0] * len(xt[0])]
        for i in range(len(xtx_inverse)):
            for k in range(len(xt[0])):
                xtx_inverse_xt[i][k] = sum([xtx_inverse[i][j] * xt[j][k] for j in range(len(xtx_inverse[i]))])
        
        return xtx_inverse_xt
    
    def dot_product_3(self, xtx_inverse_xt, Y):
        xtx_inverse_xt_y = [0, 0]
        for i in range(len(xtx_inverse_xt)):
            xtx_inverse_xt_y[i] = sum([xtx_inverse_xt[i][j] * Y[j] for j in range(len(xtx_inverse_xt[i]))])
        
        return xtx_inverse_xt_y

    def fit(self, X, Y):
        xt = self.transpose(X)
        xtx = self.dot_product_1(xt, X)
        xtx_inverse = self.inverse_2_2(xtx)
        xtx_inverse_xt = self.dot_product_2(xtx_inverse, xt)
        xtx_inverse_xt_y = self.dot_product_3(xtx_inverse_xt, Y)
        self.w0 = xtx_inverse_xt_y[0]
        self.w1 = xtx_inverse_xt_y[1]
        self.intercept = (sum(Y) - self.w0 * sum([row[0] for row in X]) - self.w1 * sum([row[1] for row in X])) / len(Y)

    def predict(self, X):
        return [self.w0 * row[0] + self.w1 * row[1] + self.intercept for row in X]
    
    def evaluate(self, X, Y):
        # mean squared error
        Y_pred = self.predict(X)
        return sum([(Y_pred[i] - Y[i]) ** 2 for i in range(len(Y))]) / len(Y)
        
xx = [[el[0], el[1]] for el in trainInputs]
regressor = ManualLinearBivariateRegression()
regressor.fit(xx, trainOutputs)
computedValidationOutputs = regressor.predict([[x[0], x[1]] for x in validationInputs])

Results

In [15]:

error = 0.0
for t1, t2 in zip(computedValidationOutputs, validationOutputs):
    if t1 - t2 < 0.000001:
        continue
    error += (t1 - t2) ** 2
error = error / len(validationOutputs)
print("Mean Squared Prediction error: ", error)

Mean Squared Prediction error:  0.13847214842479852
