## Linear Regression (Alan)

The purpose of this notebook is to train the linear regression model on the clean merged data set to produce a set
of weights, one for each feature. Those weights will then be applied to the actual values plus an intercept to predict
the happiness score.

This notebook utilises Python packages: pandas, matplotlib, and statsmodels.

In [1]:
# Imports.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Import statsmodels package for training a linear regression model.
import statsmodels.formula.api as sm

# Allows plots to appear directly in the notebook.
%matplotlib inline

### Read data from csv

In [2]:
# Load the cleaned, merged happiness data set from CSV.
data_path = 'processed_data/cleaned_merged_happiness_data.csv'
df = pd.read_csv(data_path)

In [3]:
# Preview the first three rows of the loaded data.
df.head(3)

Unnamed: 0,country,year,Happiness Score,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,GDP,Life expectancy birth,Life expectancy age 60,Infant mortality rate,Neonatal mortality rate,Under-five mortality rate
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,58.1,15.6,70.8,50.9,98.2
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,58.6,15.7,68.2,49.4,94.1
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,58.8,15.7,65.7,48.0,90.2


### Changed column headers to remove spaces/capital letters and abbreviate descriptions 

All changes can be viewed below:

country = country

year = year 

happiness_score = Happiness Score 

social_support = Social support 

healthy_life_exp_birth = Healthy life expectancy at birth

life_choices = Freedom to make life choices 

generosity = Generosity 

corruption = Perceptions of corruption 

pos_affect = Positive affect

neg_affect = Negative affect 

confidence_gov = Confidence in national government 

dem_quality = Democratic Quality

delivery_quality = Delivery Quality

gdp = GDP 

life_exp_birth = Life expectancy birth

life_exp_60 = Life expectancy age 60 

infant_mortality = Infant mortality rate 

neonatal_mortality = Neonatal mortality rate 

u5_mortality = Under-five mortality rate

In [4]:
# Rename columns by label rather than by position, so that a change in the
# CSV's column order cannot silently attach the wrong name to a column.
# "country" and "year" already carry their final names.
column_renames = {
    "Happiness Score": "happiness_score",
    "Social support": "social_support",
    "Healthy life expectancy at birth": "healthy_life_exp_birth",
    "Freedom to make life choices": "life_choices",
    "Generosity": "generosity",
    "Perceptions of corruption": "corruption",
    "Positive affect": "pos_affect",
    "Negative affect": "neg_affect",
    "Confidence in national government": "confidence_gov",
    "Democratic Quality": "dem_quality",
    "Delivery Quality": "delivery_quality",
    "GDP": "gdp",
    "Life expectancy birth": "life_exp_birth",
    "Life expectancy age 60": "life_exp_60",
    "Infant mortality rate": "infant_mortality",
    "Neonatal mortality rate": "neonatal_mortality",
    "Under-five mortality rate": "u5_mortality",
}

# Fail loudly if an expected source column is absent. A column that already
# has its new name is accepted, so re-running this cell straight away is a no-op.
unmatched = [
    old for old, new in column_renames.items()
    if old not in df.columns and new not in df.columns
]
if unmatched:
    raise KeyError("Columns missing from the data set: {}".format(unmatched))

df = df.rename(columns=column_renames)

In [5]:
# Show the first row to confirm the new column names.
df.head(1)

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_birth,life_exp_60,infant_mortality,neonatal_mortality,u5_mortality
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,58.1,15.6,70.8,50.9,98.2


In [6]:
# Average value of the target feature across the whole data set.
df["happiness_score"].mean()

5.424107142857143

In [7]:
# Inspect the data type of each column.
df.dtypes

country                    object
year                        int64
happiness_score           float64
social_support            float64
healthy_life_exp_birth    float64
life_choices              float64
generosity                float64
corruption                float64
pos_affect                float64
neg_affect                float64
confidence_gov            float64
dem_quality               float64
delivery_quality          float64
gdp                       float64
life_exp_birth            float64
life_exp_60               float64
infant_mortality          float64
neonatal_mortality        float64
u5_mortality              float64
dtype: object

In [8]:
# check correlation for feature selection
# "country" is a text column and is left out of the selection: recent pandas
# versions raise on non-numeric columns in corr() instead of skipping them.
numeric_cols = [
    "happiness_score", "year", "social_support", "healthy_life_exp_birth",
    "life_choices", "generosity", "corruption", "pos_affect", "neg_affect",
    "confidence_gov", "dem_quality", "delivery_quality", "gdp",
    "life_exp_birth", "life_exp_60", "infant_mortality",
    "neonatal_mortality", "u5_mortality",
]
df[numeric_cols].corr()

Unnamed: 0,happiness_score,year,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_birth,life_exp_60,infant_mortality,neonatal_mortality,u5_mortality
happiness_score,1.0,-0.008398,0.677275,0.69358,0.498827,0.183873,-0.410998,0.532431,-0.261297,-0.09573,0.59498,0.670657,0.21017,0.667684,0.698286,-0.595494,-0.604736,-0.573666
year,-0.008398,1.0,-0.001686,0.085714,0.18178,-0.003323,-0.052797,0.00358,0.192507,-0.002864,0.019568,0.005982,0.00575,0.045016,0.040693,-0.06654,-0.061648,-0.068502
social_support,0.677275,-0.001686,1.0,0.589475,0.415969,0.08415,-0.222379,0.456244,-0.365476,-0.149647,0.541915,0.549914,0.157553,0.570871,0.559523,-0.605337,-0.629924,-0.579524
healthy_life_exp_birth,0.69358,0.085714,0.589475,1.0,0.335576,0.054289,-0.314577,0.302134,-0.120397,-0.199992,0.621891,0.73679,0.232309,0.918687,0.827421,-0.87591,-0.834034,-0.866297
life_choices,0.498827,0.18178,0.415969,0.335576,1.0,0.351494,-0.495258,0.623822,-0.290106,0.418441,0.419629,0.45827,0.142719,0.328524,0.367235,-0.290917,-0.307664,-0.285404
generosity,0.183873,-0.003323,0.08415,0.054289,0.351494,1.0,-0.291499,0.36624,-0.097922,0.276602,0.115362,0.196835,0.048086,0.030423,0.086943,0.040542,0.043253,0.03823
corruption,-0.410998,-0.052797,-0.222379,-0.314577,-0.495258,-0.291499,1.0,-0.294326,0.248384,-0.449852,-0.285157,-0.498382,-0.077981,-0.307606,-0.348312,0.222328,0.230193,0.205088
pos_affect,0.532431,0.00358,0.456244,0.302134,0.623822,0.36624,-0.294326,1.0,-0.385255,0.161131,0.378571,0.3669,0.195737,0.304713,0.400902,-0.261728,-0.255344,-0.255688
neg_affect,-0.261297,0.192507,-0.365476,-0.120397,-0.290106,-0.097922,0.248384,-0.385255,1.0,-0.167158,-0.238025,-0.249651,-0.098236,-0.085052,-0.093485,0.086652,0.082621,0.074034
confidence_gov,-0.09573,-0.002864,-0.149647,-0.199992,0.418441,0.276602,-0.449852,0.161131,-0.167158,1.0,-0.165711,-0.074368,-0.077217,-0.187935,-0.192838,0.215414,0.218006,0.184351


### Remove a single feature from feature pairs with over 90% correlation

Feature pairs with an absolute correlation above 0.90 carry essentially the same information. From each such pair
I have decided to remove the feature whose correlation with the target feature is weaker in absolute terms.

Feature correlation: 0.92 life_exp_birth vs healthy_life_exp_birth - drop life_exp_birth

Feature correlation: 0.99 u5_mortality vs infant_mortality - drop u5_mortality

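Feature correlation: 0.96 neonatal_mortality vs infant_mortality - drop neonatal_mortality (note: neonatal_mortality's correlation with the target, -0.605, is marginally stronger than infant_mortality's, -0.595, so this choice departs slightly from the rule above; infant_mortality is retained as the single mortality feature kept from both pairs)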

Feature correlation -0.92 u5_mortality vs life_exp_birth: both features dropped already due to correlation with other descriptive features.

In [9]:
# Remove life_exp_birth from the working frame.
df = df.drop('life_exp_birth', axis=1)

In [10]:
# Remove u5_mortality from the working frame.
df = df.drop('u5_mortality', axis=1)

In [11]:
# Remove neonatal_mortality from the working frame.
df = df.drop('neonatal_mortality', axis=1)

In [12]:
# Preview the first three rows after dropping the highly correlated features.
df.head(3)

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_60,infant_mortality
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,15.6,70.8
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,15.7,68.2
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,15.7,65.7


### Feature Selection

I have decided to select the features whose absolute correlation with the target feature is greater than 0.50. This threshold is an arbitrary one, but the selected features should provide a strong indication toward predicting the happiness score.

Selected Features:

social_support   (target  corr: 0.677275)

healthy_life_exp_birth   (target  corr: 0.693580)

pos_affect   (target  corr: 0.532431)

dem_quality   (target  corr: 0.594980)

delivery_quality   (target  corr: 0.670657)

life_exp_60   (target  corr: 0.698286)

infant_mortality   (target  corr: -0.595494)

In [13]:
# Correlation matrix restricted to the target and the selected features.
selected_cols = [
    "happiness_score",
    "social_support",
    "healthy_life_exp_birth",
    "pos_affect",
    "dem_quality",
    "delivery_quality",
    "life_exp_60",
    "infant_mortality",
]
df[selected_cols].corr()

Unnamed: 0,happiness_score,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
happiness_score,1.0,0.677275,0.69358,0.532431,0.59498,0.670657,0.698286,-0.595494
social_support,0.677275,1.0,0.589475,0.456244,0.541915,0.549914,0.559523,-0.605337
healthy_life_exp_birth,0.69358,0.589475,1.0,0.302134,0.621891,0.73679,0.827421,-0.87591
pos_affect,0.532431,0.456244,0.302134,1.0,0.378571,0.3669,0.400902,-0.261728
dem_quality,0.59498,0.541915,0.621891,0.378571,1.0,0.86675,0.635915,-0.559324
delivery_quality,0.670657,0.549914,0.73679,0.3669,0.86675,1.0,0.710327,-0.641283
life_exp_60,0.698286,0.559523,0.827421,0.400902,0.635915,0.710327,1.0,-0.784651
infant_mortality,-0.595494,-0.605337,-0.87591,-0.261728,-0.559324,-0.641283,-0.784651,1.0


### Training the model

This section fits a linear model relating the descriptive features to the target feature. The data set is split 70/30 into a training set and a test set: the model is fitted on the training set only and then evaluated on the held-out test set. Evaluating on data the model has not seen reveals over/under fitting that would go unnoticed if 100% of the data were used during fitting.

In [14]:
# Training partition: the leading 70% of rows, taken in file order (no shuffling).
# NOTE(review): the previews above suggest rows are grouped by country, so this
# is not a random sample - confirm that a sequential split is intended.
train_fraction = 0.7
training_size = int(train_fraction * len(df))
df_train = df.iloc[:training_size]
print("Training set size (rows):", len(df_train))

Training set size (rows): 940


In [15]:
# Create a test data set (remaining 30% of rows)
# Defined as every row of df that is not in df_train, so the two sets always
# form a disjoint, exhaustive partition without repeating the split-point logic.
df_test = df.drop(df_train.index)
assert len(df_train) + len(df_test) == len(df), "train/test sets must partition df"
print("Test set:",len(df_test))

Test set: 404


In [16]:
# An independent copy of df for full cross validation purposes.
# (Plain assignment would only bind a second name to the same DataFrame,
# so changes made through either name would show up in both.)
df_cross = df.copy()

In [17]:
# Fit an ordinary least squares model on df_train using the selected features,
# then show the fitted intercept and feature weights.
ols_formula = (
    "happiness_score ~ social_support + healthy_life_exp_birth + pos_affect"
    " + dem_quality + delivery_quality + life_exp_60 + infant_mortality"
)
lm = sm.ols(formula=ols_formula, data=df_train).fit()
print(lm.params)

Intercept                -2.297854
social_support            2.804478
healthy_life_exp_birth    0.019403
pos_affect                2.054902
dem_quality              -0.175589
delivery_quality          0.310884
life_exp_60               0.135842
infant_mortality          0.004932
dtype: float64


### Table with feature weights

In [18]:
# Print the full regression report for the model fitted on df_train.
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_score   R-squared:                       0.716
Model:                            OLS   Adj. R-squared:                  0.713
Method:                 Least Squares   F-statistic:                     334.6
Date:                Wed, 25 Apr 2018   Prob (F-statistic):          4.21e-249
Time:                        18:16:16   Log-Likelihood:                -866.63
No. Observations:                 939   AIC:                             1749.
Df Residuals:                     931   BIC:                             1788.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -2

### Test on df_train

Note that we will also evaluate the model on df_test, to check for overfitting.

In [19]:
# Predicted happiness score for every row of the training set.
lm.predict(df_train)

0      3.277174
1      3.733587
2      3.758539
3      3.690219
4      3.940235
5      3.667680
6      3.608792
7      3.691676
8      3.740336
9      3.436225
10     5.439211
11     5.225534
12     5.291915
13     5.325180
14     5.316533
15     5.064058
16     5.146827
17     5.069811
18     5.056787
19     5.466397
20     5.356856
21     5.540801
22     5.535473
23     5.384354
24     5.543327
25     4.278710
26     4.171578
27     4.271523
28     4.217889
29     5.923214
         ...   
910    4.762476
911    4.911902
912    5.319802
913    4.998718
914    5.190287
915    5.086364
916    5.069571
917    5.107065
918    5.250784
919    5.279713
920    6.523951
921    6.593037
922    6.618302
923    6.565206
924    6.588457
925    6.597785
926    6.411248
927    6.411895
928    6.465707
929    6.549664
930    5.885575
931    5.916722
932    5.951439
933    5.812439
934    6.062601
935    6.243614
936    6.309070
937    6.021601
938    6.188573
939    5.618792
Length: 940, dtype: floa

### Actual Happiness Score vs Predicted Happiness

In [30]:
# Side-by-side table of observed and fitted scores for the training rows.
train_predictions = lm.predict(df_train)
predict_df_train = pd.DataFrame({
    'ActualHappiness': df_train['happiness_score'],
    'PredictedHappiness': train_predictions,
})
predict_df_train.head(10)

Unnamed: 0,ActualHappiness,PredictedHappiness
0,4.0,3.277174
1,4.0,3.733587
2,5.0,3.758539
3,4.0,3.690219
4,4.0,3.940235
5,4.0,3.66768
6,3.0,3.608792
7,4.0,3.691676
8,4.0,3.740336
9,3.0,3.436225


### Actual Happiness Score minus Predicted Score

In [32]:
# Residuals on the training set: actual happiness score minus predicted score,
# shown raw and squared.
train_residuals = df_train['happiness_score'] - lm.predict(df_train)
print("Actual minus Predicted:\n", train_residuals)
print("\n(Actual minus Predicted) squared:\n", train_residuals.pow(2))

Actual minus Predicted:
 0      0.722826
1      0.266413
2      1.241461
3      0.309781
4      0.059765
5      0.332320
6     -0.608792
7      0.308324
8      0.259664
9     -0.436225
10    -0.439211
11    -0.225534
12     0.708085
13     0.674820
14    -0.316533
15    -0.064058
16    -0.146827
17    -0.069811
18    -0.056787
19    -0.466397
20    -0.356856
21     0.459199
22     0.464527
23    -0.384354
24    -0.543327
25     1.721290
26    -0.171578
27    -0.271523
28    -0.217889
29     0.076786
         ...   
910   -0.762476
911   -0.911902
912   -0.319802
913    0.001282
914   -0.190287
915   -0.086364
916   -0.069571
917   -0.107065
918   -0.250784
919   -0.279713
920    0.476049
921    0.406963
922    0.381698
923    0.434794
924    0.411543
925    0.402215
926    0.588752
927    0.588105
928   -0.465707
929    0.450336
930    0.114425
931    0.083278
932    0.048561
933    0.187561
934   -0.062601
935   -0.243614
936   -1.309070
937   -0.021601
938   -0.188573
939   -0.618792

### Mean Squared Error

Performed on the df_train

In [36]:
# Mean of the squared residuals on the training set.
train_errors = df_train['happiness_score'] - lm.predict(df_train)
mse = train_errors.pow(2).mean()
print("\nMean Squared Error:", mse)


Mean Squared Error: 0.3708295262213801


### Mean Absolute Error

Performed on df_train

In [37]:
# Mean of the absolute residuals on the training set.
train_errors = df_train['happiness_score'] - lm.predict(df_train)
mae = train_errors.abs().mean()
print("\nMean Absolute Error:", mae)


Mean Absolute Error: 0.48935309318341114


### Test on df_test

In [39]:
# Side-by-side table of observed and predicted scores for the test rows.
test_predictions = lm.predict(df_test)
predict_df_test = pd.DataFrame({
    'ActualHappiness': df_test['happiness_score'],
    'PredictedHappiness': test_predictions,
})
predict_df_test.head(10)

Unnamed: 0,ActualHappiness,PredictedHappiness
940,6.0,5.778462
941,6.0,5.778015
942,6.0,5.630173
943,6.0,5.637376
944,6.0,5.790562
945,6.0,5.767568
946,6.0,5.698933
947,6.0,5.792373
948,6.0,5.866276
949,5.0,5.190931


### Mean Squared Error

Performed on the df_test

In [43]:
# Mean of the squared residuals on the test set.
test_errors = df_test['happiness_score'] - lm.predict(df_test)
mse = test_errors.pow(2).mean()
print("\nMean Squared Error:", mse)


Mean Squared Error: 0.5548221496958053


### Mean Absolute Error

Performed on the df_test

In [44]:
# Mean of the absolute residuals on the test set.
test_errors = df_test['happiness_score'] - lm.predict(df_test)
mae = test_errors.abs().mean()
print("\nMean Absolute Error:", mae)


Mean Absolute Error: 0.5730301353922305


### Comparison of results between train & test data set.

In [45]:
# Residuals (actual minus predicted) for each partition, computed once and
# reused for both error measures.
train_resid = df_train['happiness_score'] - lm.predict(df_train)
test_resid = df_test['happiness_score'] - lm.predict(df_test)

mse = train_resid.pow(2).mean()
print("\nTRAIN DATA SET - Mean Squared Error:", mse)

mse = test_resid.pow(2).mean()
print("\nTEST DATA SET - Mean Squared Error:", mse)

mae = train_resid.abs().mean()
print("\nTRAIN DATA SET - Mean Absolute Error:", mae)

mae = test_resid.abs().mean()
print("\nTEST DATA SET - Mean Absolute Error:", mae)


TRAIN DATA SET - Mean Squared Error: 0.3708295262213801

TEST DATA SET - Mean Squared Error: 0.5548221496958053

TRAIN DATA SET - Mean Absolute Error: 0.48935309318341114

TEST DATA SET - Mean Absolute Error: 0.5730301353922305


### Perform normalisation of the features

Range normalisation rescales every column to the interval [0, 1]. With all features on a common scale, the coefficients of the fitted model can be compared with one another.

In [20]:
# Training rows restricted to the target plus the selected features.
model_columns = [
    'happiness_score',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]
df_feat = df_train[model_columns]

In [21]:
# Per-column minimum of the selected training columns.
df_feat.min()

happiness_score            3.000000
social_support             0.290184
healthy_life_exp_birth    40.076595
pos_affect                 0.422928
dem_quality               -2.044093
delivery_quality          -1.900852
life_exp_60               10.300000
infant_mortality           1.600000
dtype: float64

In [22]:
# Per-column maximum of the selected training columns.
df_feat.max()

happiness_score             8.000000
social_support              0.987343
healthy_life_exp_birth     76.536362
pos_affect                  0.943621
dem_quality                 1.540097
delivery_quality            2.121312
life_exp_60                26.100000
infant_mortality          101.300000
dtype: float64

In [23]:
# Min-max scale every column: subtract the column minimum, then divide by the
# column range (maximum minus minimum).
col_min = df_feat.min()
col_range = df_feat.max() - col_min
df_norm = df_feat.sub(col_min).div(col_range)
df_norm.head(10)

Unnamed: 0,happiness_score,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.2,0.230189,0.250497,0.181891,0.031919,0.061103,0.335443,0.694082
1,0.2,0.375989,0.261873,0.309199,0.0,0.066091,0.341772,0.668004
2,0.4,0.357007,0.27242,0.37515,0.014587,0.070528,0.341772,0.642929
3,0.2,0.331229,0.282248,0.36194,0.034896,0.070766,0.348101,0.618857
4,0.2,0.330559,0.291627,0.552066,0.056107,0.123509,0.348101,0.595787
5,0.2,0.277365,0.30078,0.379604,0.045864,0.123768,0.35443,0.573721
6,0.0,0.337633,0.309764,0.208883,0.075564,0.146277,0.35443,0.553661
7,0.2,0.341978,0.318623,0.250869,0.055725,0.151475,0.360759,0.534604
8,0.2,0.38569,0.327482,0.272762,0.035266,0.116431,0.348101,0.517553
9,0.0,0.287877,0.336341,0.141006,0.038881,0.103328,0.348101,0.599911


### Train Linear Model on normalised columns

The values of each column have been normalised so we can now train a model on them and compare the coef with one another.

In [24]:
# Fit the same model specification on the range-normalised columns and show
# the resulting intercept and feature weights.
norm_formula = (
    "happiness_score ~ social_support + healthy_life_exp_birth + pos_affect"
    " + dem_quality + delivery_quality + life_exp_60 + infant_mortality"
)
lm_df_norm = sm.ols(formula=norm_formula, data=df_norm).fit()
print(lm_df_norm.params)

Intercept                -0.332461
social_support            0.391034
healthy_life_exp_birth    0.141489
pos_affect                0.213995
dem_quality              -0.125869
delivery_quality          0.250085
life_exp_60               0.429260
infant_mortality          0.098349
dtype: float64


In [25]:
# Print the full regression report for the model fitted on the range-normalised data.
print(lm_df_norm.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_score   R-squared:                       0.716
Model:                            OLS   Adj. R-squared:                  0.713
Method:                 Least Squares   F-statistic:                     334.6
Date:                Wed, 25 Apr 2018   Prob (F-statistic):          4.21e-249
Time:                        18:16:28   Log-Likelihood:                 644.63
No. Observations:                 939   AIC:                            -1273.
Df Residuals:                     931   BIC:                            -1234.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0

### Standardisation of features 

In [26]:
# Standardise every column: subtract the column mean, then divide by the
# column standard deviation.
col_mean = df_feat.mean()
col_std = df_feat.std()
df_st = df_feat.sub(col_mean).div(col_std)
df_st.head(10)

Unnamed: 0,happiness_score,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,-1.252827,-2.808808,-1.707488,-1.749264,-2.125722,-1.703961,-1.445899,2.139915
1,-1.252827,-1.988406,-1.65384,-1.135243,-2.259258,-1.683236,-1.411412,2.020281
2,-0.377896,-2.095214,-1.604104,-0.817157,-2.198232,-1.664795,-1.411412,1.905248
3,-1.252827,-2.240266,-1.557756,-0.880868,-2.113266,-1.663808,-1.376926,1.794817
4,-1.252827,-2.244033,-1.513525,0.036133,-2.024529,-1.444625,-1.376926,1.688987
5,-1.252827,-2.543351,-1.470361,-0.795673,-2.067382,-1.443548,-1.342439,1.587758
6,-2.127758,-2.204229,-1.427995,-1.619081,-1.943127,-1.350011,-1.342439,1.495732
7,-1.252827,-2.179783,-1.386217,-1.416579,-2.026126,-1.328408,-1.307952,1.408307
8,-1.252827,-1.933818,-1.344439,-1.310987,-2.11172,-1.474039,-1.376926,1.330085
9,-2.127758,-2.484204,-1.302661,-1.946458,-2.096596,-1.528492,-1.376926,1.707903


In [27]:
# Fit the same model specification on the standardised columns and print the
# full regression report.
st_formula = (
    "happiness_score ~ social_support + healthy_life_exp_birth + pos_affect"
    " + dem_quality + delivery_quality + life_exp_60 + infant_mortality"
)
lm_df_st = sm.ols(formula=st_formula, data=df_st).fit()
print(lm_df_st.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_score   R-squared:                       0.716
Model:                            OLS   Adj. R-squared:                  0.713
Method:                 Least Squares   F-statistic:                     334.6
Date:                Wed, 25 Apr 2018   Prob (F-statistic):          4.21e-249
Time:                        18:16:34   Log-Likelihood:                -741.17
No. Observations:                 939   AIC:                             1498.
Df Residuals:                     931   BIC:                             1537.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0

### Save updated dataframe to csv

In [28]:
# Write df (with the renamed columns and the correlated features dropped) to CSV,
# omitting the row index.
df.to_csv('processed_data/happiness_data_alan.csv', index=False)