## Importing libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

## 1. Reading input data

In [4]:
data = pd.read_csv("../datasets/fuel_consumption.csv")
data

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062,2014,VOLVO,XC60 AWD,SUV - SMALL,3.0,6,AS6,X,13.4,9.8,11.8,24,271
1063,2014,VOLVO,XC60 AWD,SUV - SMALL,3.2,6,AS6,,13.2,9.5,11.5,25,264
1064,2014,VOLVO,XC70 AWD,SUV - SMALL,3.0,6,AS6,X,13.4,9.8,11.8,24,271
1065,2014,VOLVO,XC70 AWD,SUV - SMALL,3.2,6,AS6,X,12.9,9.3,11.3,25,260


## 2. Filling null values

In [5]:
for column in data.columns:
    if data[column].isnull().any():
        data[column] = data[column].fillna(data[column].mode()[0])  # fill with most common value

## 3. Performing One-Hot Encoding on categorical data

In [6]:
data.drop(columns=['MODELYEAR', 'MAKE', 'MODEL', 'VEHICLECLASS', 'TRANSMISSION'], inplace=True)

categorical_input_parameters = [
    'FUELTYPE'
]

output_parameter = 'CO2EMISSIONS'

for column in categorical_input_parameters:
    ohe = OneHotEncoder(sparse_output=False)
    transformed = ohe.fit_transform(data[column].to_numpy().reshape(-1, 1))
    labels = ohe.get_feature_names_out([column])
    data.drop(columns=[column], inplace=True)
    data = data.join(pd.DataFrame(data=transformed, columns=labels))

data

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS,FUELTYPE_D,FUELTYPE_E,FUELTYPE_X,FUELTYPE_Z
0,2.0,4,9.9,6.7,8.5,33,196,0.0,0.0,0.0,1.0
1,2.4,4,11.2,7.7,9.6,29,221,0.0,0.0,0.0,1.0
2,1.5,4,6.0,5.8,5.9,48,136,0.0,0.0,0.0,1.0
3,3.5,6,12.7,9.1,11.1,25,255,0.0,0.0,0.0,1.0
4,3.5,6,12.1,8.7,10.6,27,244,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1062,3.0,6,13.4,9.8,11.8,24,271,0.0,0.0,1.0,0.0
1063,3.2,6,13.2,9.5,11.5,25,264,0.0,0.0,1.0,0.0
1064,3.0,6,13.4,9.8,11.8,24,271,0.0,0.0,1.0,0.0
1065,3.2,6,12.9,9.3,11.3,25,260,0.0,0.0,1.0,0.0


## 4. Scaling input data and removing not important features

In [7]:
data.drop(columns=['FUELCONSUMPTION_COMB', 'FUELCONSUMPTION_HWY'], inplace=True)

sc = StandardScaler()
data = pd.DataFrame(
    data=sc.fit_transform(data),
    columns=data.columns
)

data

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS,FUELTYPE_D,FUELTYPE_E,FUELTYPE_X,FUELTYPE_Z
0,-0.947967,-0.998969,-0.828558,0.878553,-0.950840,-0.161126,-0.307179,-0.964093,1.207694
1,-0.665489,-0.998969,-0.511433,0.342734,-0.556161,-0.161126,-0.307179,-0.964093,1.207694
2,-1.301063,-0.998969,-1.779932,2.887876,-1.898070,-0.161126,-0.307179,-0.964093,1.207694
3,0.111323,0.114242,-0.145519,-0.193086,-0.019397,-0.161126,-0.307179,-0.964093,1.207694
4,0.111323,0.114242,-0.291885,0.074824,-0.193056,-0.161126,-0.307179,-0.964093,1.207694
...,...,...,...,...,...,...,...,...,...
1062,-0.241774,0.114242,0.025240,-0.327041,0.233197,-0.161126,-0.307179,1.037244,-0.828024
1063,-0.100535,0.114242,-0.023548,-0.193086,0.122687,-0.161126,-0.307179,1.037244,-0.828024
1064,-0.241774,0.114242,0.025240,-0.327041,0.233197,-0.161126,-0.307179,1.037244,-0.828024
1065,-0.100535,0.114242,-0.096731,-0.193086,0.059538,-0.161126,-0.307179,1.037244,-0.828024


## 5. Train and test split

In [8]:
input_columns = list(set(data.columns).difference({output_parameter}))

x_train, x_test, y_train, y_test = train_test_split(
    data[input_columns],
    data[output_parameter],
    train_size=0.7,
    random_state=0
)

## 6. Developing regression model

* $ h(x) = w_0 + w_1*x_1 + w_2*x_2 + \cdots + w_n*x_n $

* **_Loss function_**: $ L(h(x), y) = \frac{1}{2} (h(x^{(i)})-y^{(i)})^2 $
* **_Mean squared error(MSE)_**: $ J(w) = \frac{1}{2m} \sum \limits_{i=0}^{m} (h(x^{(i)})-y^{(i)})^2 $

* Prediction:

$$
     \begin{bmatrix}
         1 & x_{11} & \cdots & x_{1n-1}\\
         1 & x_{21} & \cdots & x_{2n-1}\\
         \vdots & \vdots & \ddots & \vdots\\
         1 & x_{m1} & \cdots & x_{mn-1}
     \end{bmatrix}
     \times
     \begin{bmatrix}
         w_{0} \\
         w_{1} \\
         \vdots \\
         w_{n-1}
     \end{bmatrix}
      =
     \begin{bmatrix}
         w_0 + w_1 * x_{11} + w_2 * x_{12} + \cdots + w_{n-1} * x_{1n-1}\\
         h[1]\\
         \vdots \\
         h[m-1]
     \end{bmatrix}
  $$

* Updating weights: $ w_i = w_i - \alpha \frac{\partial J(w)}{\partial w_i} $

_Reference:_ http://ri4es.etf.rs/materijali/vezbe/IS_Linearna_regresija_(Python).pdf}


In [9]:
class LinearRegressionModel:
    def __init__(self):
        self.weights = None
        self.inputs = None
        self.outputs = None
        self.mse_history = None
        self.learning_rates = None
        self.iterations_number = None

    def calc_error_function(self):
        difference = self.inputs.dot(self.weights) - self.outputs  # (h(i) - y(i)), h(i) = w0 + w1 * x1 + w2 * x2
        loss = pow(difference, 2).reshape(-1, 1).flatten()  # calc loss function ->  (h(i) - y(i))^2
        m = len(loss)
        loss_sum = 0
        for i in range(m):
            loss_sum += loss[i]
        mse = 1 / (2 * m) * loss_sum  # calc mean squared error
        return mse

    def calc_dj_dw(self):
        difference = self.inputs.dot(self.weights) - self.outputs
        m = len(self.inputs)
        return 1 / m * self.inputs.T.dot(difference)

    def update_weights(self):
        alpha = self.learning_rates
        self.weights = self.weights - alpha * self.calc_dj_dw()  # wi = wi - alpha * dj/dwi

    def perform_gradient_descent(self):
        self.mse_history = []
        for i in range(self.iterations_number):
            self.update_weights()
            curr_cost = self.calc_error_function()
            self.mse_history.append(curr_cost)

    def predict(self, features):
        features = features.copy(deep=True)
        features.insert(0, 'w0', np.ones((len(features), 1)))
        features = features.to_numpy()
        return features.dot(self.weights).reshape(-1, 1).flatten()

    def fit(self, features, target, learning_rates, iterations_number=100):
        self.inputs = features.copy(deep=True)
        dimension_of_weight_vector = len(features.columns) + 1
        self.weights = np.ones(shape=dimension_of_weight_vector).reshape(-1, 1)
        self.inputs.insert(0, 'w0', np.ones((len(features), 1)))
        self.inputs = self.inputs.to_numpy()
        self.outputs = target.to_numpy().reshape(-1, 1)
        self.learning_rates = learning_rates
        self.iterations_number = iterations_number
        self.perform_gradient_descent()

## 7. Evaluating the model

In [10]:
lr = LinearRegressionModel()
learning_rate = np.array([[0.3] for i in range(len(x_train.columns) + 1)])
lr.fit(x_train, y_train, learning_rate, iterations_number=300)

y_predicted = lr.predict(x_test)

u = ((y_test - y_predicted) ** 2).sum()
v = ((y_test - y_predicted.mean()) ** 2).sum()

score = 1. - u / v
print(f'Score [test]: {score}')

print(f'MAE [test]: {mean_absolute_error(y_test, y_predicted)}')
print(f'MSE [test]: {mean_squared_error(y_test, y_predicted)}')

Score [test]: 0.9821808914771754
MAE [test]: 0.09504586784297023
MSE [test]: 0.01761060042133482
