Multiple Linear Regression Machine Learning Restricted Dataset

In [283]:
# Importing dependencies.
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import numpy as np

In [284]:
# Loading csv.
# df = pd.read_csv(Path('happiness_kcal_by_country.csv'))
df = pd.read_csv("https://happycal.s3.us-east-2.amazonaws.com/happiness_kcal_by_country.csv")
df.head()

Unnamed: 0,country,happiness_score,Alcoholic_Beverages,Animal_Products,Animal_fats,Cereal_Excluding_Beer,Eggs,Fish_Seafood,Fruits_Excluding_Wine,Meat,...,Starchy_Roots,Stimulants,Sugar_Crops,Sugar_Sweeteners,Treenuts,Vegetal_Products,Vegetable_Oils,Vegetables,Obesity,Population
0,Afghanistan,2.523,0.0,4.7774,0.8504,37.1186,0.1501,0.0,1.4757,1.2006,...,0.3252,0.075,0.0,2.2261,0.1251,45.2476,2.3012,0.7504,4.5,38928000.0
1,Albania,5.117,0.912,16.093,1.0591,16.2107,0.8091,0.1471,3.8982,3.8688,...,1.2651,0.2501,0.0,3.4422,0.3972,33.907,2.8244,2.7508,22.3,2838000.0
2,Algeria,4.887,0.0896,6.0326,0.1941,25.0112,0.4181,0.1195,3.1805,1.2543,...,1.9262,0.1493,0.0,3.9869,0.224,43.9749,5.7638,2.0457,26.6,44357000.0
3,Argentina,5.929,1.4354,14.9869,1.065,16.7927,0.8643,0.2006,1.4663,9.4459,...,1.4045,0.2315,0.0,7.0536,0.0463,34.99,5.541,0.8643,28.5,45377000.0
4,Armenia,5.283,0.2274,12.833,1.7706,19.2658,0.731,0.1787,2.5341,4.2235,...,1.2508,0.6985,0.0,5.2956,0.3086,37.167,3.5737,3.2164,20.9,2956000.0


In [285]:
# Selecting the top 5 columns for highest R-value during individual testing for X.
X = df[['Animal_fats', 'Animal_Products', 'Cereal_Excluding_Beer', 'Eggs', 'Meat']]
# happiness_score is the dependent variable.
y = df['happiness_score']

In [286]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 150)

In [287]:
#Fitting the model
model = LinearRegression()  
model.fit(x_train, y_train)

LinearRegression()

In [288]:
print("Intercept: ", model.intercept_)
print("Coefficients:")
list(zip(X, model.coef_))

Intercept:  5.100661241907707
Coefficients:


[('Animal_fats', 0.01833050260829601),
 ('Animal_Products', 0.08464832911830616),
 ('Cereal_Excluding_Beer', -0.038868135940618116),
 ('Eggs', 0.6011484350888634),
 ('Meat', 0.0341421466745864)]

In [289]:
#Prediction of test set
y_pred_model= model.predict(x_test)
#Predicted values
print("Prediction for test set: {}".format(y_pred_model))

Prediction for test set: [5.96814594 6.53776716 4.30989142 4.73857028 5.19844968 5.39485021
 6.20175338 5.84023114 3.9963011  4.48362552 5.83851524 6.32330298
 6.39793744 5.43174095 5.51381    6.31130998 6.92456377 4.68372415
 6.45987835 4.52252844 5.20257843 4.70242907 4.93689464 4.67750925
 4.38738561 4.96016889 6.22767158 5.93217698 4.68743006 6.16526607
 4.68226211 5.5755833  6.80171779 6.37039136 4.44217573 6.51649102
 5.500491   6.02390238 4.45434314 6.57817422 6.5673781  6.41263007
 6.68049779 5.43986588 4.63524654 4.41662292 4.65467382 4.66828312
 6.55256098 4.89650609 6.64795452 6.62691268 5.35163077 6.1415067
 4.35097279]


In [290]:
#Actual value and the predicted value
model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_model})
model_diff.head()

Unnamed: 0,Actual value,Predicted value
82,6.317,5.968146
129,6.951,6.537767
134,3.658,4.309891
21,4.355,4.73857
40,4.852,5.19845


In [291]:
# Model Evaluation
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_model)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_model)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_model))
# R-squared is a measure of how well a linear regression model fits the data(0 ≤ R2 ≤ 1). 100% is a perfect prediction.
print('R squared: {:.2f}'.format(model.score(X,y)*100))
# MAE tells us how big of an error we can expect from the forecast on average.0 perfect, 1 is not.
print('Mean Absolute Error:', meanAbErr)
# The MSE measures how close a regression line is to a set of data points.0 perfect, 1 is not.
print('Mean Square Error:', meanSqErr)
# RMSE estimates the deviation of the actual y-values from the regression line. 0 perfect, 1 is not.
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 61.39
Mean Absolute Error: 0.5020149456327939
Mean Square Error: 0.35811937734655047
Root Mean Square Error: 0.5984307623665001
