# Welcome to my simple multivariate regression model

Import the necessary libraries

In [828]:
import os
# os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import numpy as np
import random
import pandas as pd
import math
import sklearn
from sklearn.preprocessing import StandardScaler


### Data Preparation

Read World Bank dataset

In [829]:
# Load the World Bank table into a DataFrame.
# NOTE: the path below is absolute and specific to one machine.
wbdata = pd.read_csv(
    filepath_or_buffer="C:\\Users\\Teh Ze Shi\\OneDrive\\schoolwork\\BT4222\\WorldBankData.csv",
)

Data cleaning

In [830]:
def fill_NAs(row, column_name):
    """Return ``row[column_name]``, substituting the region's value when it is missing.

    Meant to be used row-wise, e.g.
    ``wbdata.apply(fill_NAs, axis=1, args=(column_name,))``.

    When the cell is NaN, the replacement is read from the first row of the
    module-level ``wbdata`` whose 'Country.Code' equals this row's
    'Additional' code. Example: a country whose 'Additional' code points at
    the Caribbean Small States row receives the value stored on that row
    (presumably the regional aggregate -- confirm against the dataset).

    Args:
        row: One row of ``wbdata`` (a pandas Series) holding ``column_name``
            and 'Additional'.
        column_name: Label of the column whose value should be returned.

    Returns:
        The original value when it is present; otherwise the value found on
        the matching region row. If no row matches the region code, the
        original missing value is returned unchanged.
    """
    value = row[column_name]
    if not pd.isna(value):
        # Nothing to fill: keep the original value.
        return value

    region_code = row['Additional']
    region_values = wbdata.loc[wbdata["Country.Code"].isin([region_code]), column_name]
    if region_values.empty:
        # No row carries this region code. Leave the gap in place instead of
        # failing with an IndexError on the empty lookup.
        return value
    return region_values.iloc[0]

# Fill the gaps in every column that is used for prediction
prediction_columns = (
    'Health.Expenditure.per.capita',
    'Diabetes.Prevalence',
    'GDP.per.capita.PPP',
    'Life.Expectancy.at.birth',
)
for column in prediction_columns:
    wbdata[column] = wbdata.apply(fill_NAs, axis=1, args=(column,))

# Codes of aggregated regions/groups; their rows are removed so that only
# countries/states remain
noncountries = [
    "ARB", "CSS", "CEB", "EAR", "EAS", "EAP", "TEA", "EMU", "ECS", "ECA",
    "TEC", "EUU", "FCS", "HPC", "HIC", "IBD", "IBT", "IDB", "IDX", "IDA",
    "LTE", "LCN", "LAC", "TLA", "LDC", "LMY", "LIC", "LMC", "MEA", "MNA",
    "TMN", "MIC", "NAC", "OED", "OSS", "PSS", "PST", "PRE", "SST", "SAS",
    "TSA", "SSF", "SSA", "TSS", "UMC", "WLD",
]
is_aggregate = wbdata['Country.Code'].isin(noncountries)
wbdata = wbdata.loc[~is_aggregate].reset_index(drop=True)

# Shuffle the rows (sample the full frame without replacement) so the data
# is no longer in its original order
wbdata = wbdata.sample(frac=1)


Split wbdata into training (85%) and test (15%) sets

In [831]:
# Size of the training partition: the first 85% of the shuffled rows, rounded down
n = math.floor(0.85 * wbdata.shape[0])

# Columns are picked by position: 3-5 are the model inputs, 8 holds the actual
# values the model is trained and scored against
feature_columns = wbdata.iloc[:, [3, 4, 5]]
target_column = wbdata.iloc[:, 8]

# First n rows form the training set, the remaining rows form the test set
trainingSet0raw, testSet0raw = feature_columns.iloc[:n], feature_columns.iloc[n:]
trainingSet0raw_actual, testSet0raw_actual = target_column.iloc[:n], target_column.iloc[n:]

### Gradient descent and model fitting functions

In [832]:
def GDgetWeights(w, learnRate, num_iterations, input_set, output_actual):
    """Run batch gradient descent on the mean squared error and return the weights.

    With A = ``input_set`` and y = ``output_actual`` the loss is
    L(w) = (1/n) * ||A w - y||^2, whose gradient is
    (2/n) * A^T (A w - y), where n is ``input_set.shape[0]``.

    Args:
        w: Initial weight vector. It is copied, so the caller's array is
            left untouched.
        learnRate: Step size applied to the gradient on every update.
        num_iterations: Maximum number of update steps.
        input_set: NumPy array of inputs; its first dimension is n.
        output_actual: Target values, combined element-wise with the
            predictions ``input_set @ w``.

    Returns:
        A new float array holding the weights after the last update.
    """
    # Work on a float copy: the in-place update below would otherwise modify
    # the caller's array and would fail for integer-dtype initial weights.
    w = np.array(w, dtype=float)
    scale = 2 / input_set.shape[0]  # loop-invariant 2/n factor of the gradient

    for _ in range(num_iterations):
        errors = np.matmul(input_set, w) - output_actual

        # (2/n) * errors^T A, i.e. the gradient (2/n) * A^T (A w - y)
        gradient = scale * np.matmul(np.transpose(errors), input_set)

        w -= learnRate * gradient

        # Stop early once the gradient used for this step is already tiny
        if np.linalg.norm(gradient) < 0.001:
            print("Error minimized; Breaking out of loop")
            break
    return w

def fit_model(w, learnRate, num_iterations, input_set, output_actual):
    """Fit a linear model by gradient descent and return a prediction function.

    The inputs are standardized with a ``StandardScaler`` fitted on
    ``input_set``, a leading column of ones is added for the bias term, and
    the weights are obtained from ``GDgetWeights``.

    Args:
        w: Initial weights, one per input column plus one for the bias.
        learnRate: Learning rate forwarded to ``GDgetWeights``.
        num_iterations: Iteration limit forwarded to ``GDgetWeights``.
        input_set: Training inputs (converted with ``np.array``).
        output_actual: Training targets (converted with ``np.array``).

    Returns:
        ``predict(*independent_variables)``: takes the input values of a
        single sample as positional arguments and returns its prediction.
    """
    features = np.array(input_set)
    targets = np.array(output_actual)

    # Standardize the inputs to compensate for columns with extreme values
    scaler = StandardScaler()
    standardized = scaler.fit_transform(features)

    # Leading column of ones so the first weight acts as the bias term
    bias_column = np.ones((standardized.shape[0], 1))
    design_matrix = np.hstack([bias_column, standardized])

    weights = GDgetWeights(w, learnRate, num_iterations, design_matrix, targets)

    def predict(*independent_variables):
        # Shape the values as a single row, then apply the same scaling and
        # bias column that were used for training
        sample = np.array(independent_variables).reshape(1, len(independent_variables))
        sample = scaler.transform(sample)
        sample = np.column_stack([np.ones((sample.shape[0], 1)), sample])
        return (sample @ weights)[0]

    return predict

### Evaluation:

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1 - Train the model and fit it to training set trainingSet0raw\
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 2 - Evaluate model performance on test set testSet0raw

In [833]:
# Train the model on the training partition, then score it on the test partition

# Positional 85/15 split of the shuffled rows: columns 3-5 are the inputs,
# column 8 holds the actual values
n = math.floor(0.85 * wbdata.shape[0])

trainingSet0raw = wbdata.iloc[:n, [3, 4, 5]]
testSet0raw = wbdata.iloc[n:, [3, 4, 5]]

trainingSet0raw_actual = wbdata.iloc[:n, 8]
testSet0raw_actual = wbdata.iloc[n:, 8]

# Random starting weights: one per input column plus one for the bias
wVector = np.random.random(trainingSet0raw.shape[1] + 1)
# Learning rate drawn uniformly from [0.001, 0.01]
lr = random.uniform(0.001, 0.01)

model = fit_model(wVector, lr, 1000, trainingSet0raw, trainingSet0raw_actual)


# Apply the trained model to one row of the test dataset
def iterator(row):
    feature_names = (
        'Health.Expenditure.per.capita',
        'Diabetes.Prevalence',
        'GDP.per.capita.PPP',
    )
    return model(*(row[name] for name in feature_names))


# Predictions and actual values as column vectors, placed side by side
testSet0_pred = np.array(testSet0raw.apply(iterator, axis=1)).reshape(-1, 1)
testSet0raw_actual = np.array(testSet0raw_actual).reshape(-1, 1)

evaluation = pd.DataFrame(
    np.hstack((testSet0_pred, testSet0raw_actual)),
    columns=['Predicted', 'Actual'],
)
evaluation['Error'] = evaluation['Actual'] - evaluation['Predicted']

print(evaluation)

    Predicted     Actual      Error
0   74.368697  76.009000   1.640303
1   82.285682  81.950724  -0.334959
2   67.770090  58.575000  -9.195090
3   70.287665  66.766000  -3.521665
4   70.262098  66.480000  -3.782098
5   73.205098  71.661000  -1.544098
6   82.467362  77.374000  -5.093362
7   72.055121  80.030000   7.974879
8   68.515936  62.643000  -5.872936
9   70.371005  75.498000   5.126995
10  77.937753  71.096018  -6.841735
11  75.688905  73.318549  -2.370355
12  81.771097  82.470488   0.699391
13  70.459666  74.562000   4.102334
14  77.586460  74.514634  -3.071825
15  73.165293  73.318549   0.153257
16  91.403113  82.685366  -8.717747
17  76.968959  84.680488   7.711529
18  66.145534  66.312000   0.166466
19  70.510553  75.943000   5.432447
20  67.670081  70.604000   2.933919
21  67.756666  71.214000   3.457334
22  65.865433  52.214000 -13.651433
23  71.114997  78.495000   7.380003
24  69.770269  70.565000   0.794731
25  83.370027  81.641463  -1.728564
26  66.077377  67.291000   1