## Linear Regression (lab6.1)

The purpose of this notebook is to train the linear regression model on the clean merged data set to produce a set
of weights, one for each feature. Those weights will then be applied to the actual values plus an intercept to predict
the happiness score.

This notebook uses the Python packages pandas, numpy, matplotlib, and statsmodels.

In [1]:
# Imports.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Import statsmodels package for training a linear regression model.
import statsmodels.formula.api as sm

# Allows plots to appear directly in the notebook.
%matplotlib inline

### Read data from csv

In [2]:
# Load the cleaned, merged happiness data from the processed_data directory.
df = pd.read_csv(filepath_or_buffer='processed_data/cleaned_merged_happiness_data.csv')

In [3]:
# Preview the first three rows with the original column headers.
df.head(n=3)

Unnamed: 0,country,year,Happiness Score,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,GDP,Life expectancy birth,Life expectancy age 60,Infant mortality rate,Neonatal mortality rate,Under-five mortality rate
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,58.1,15.6,70.8,50.9,98.2
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,58.6,15.7,68.2,49.4,94.1
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,58.8,15.7,65.7,48.0,90.2


### Changed column headers to remove spaces/capital letters and abbreviate descriptions 

All changes can be viewed below:

country = country

year = year 

happiness_score = Happiness Score 

social_support = Social support 

healthy_life_exp_birth = Healthy life expectancy at birth

life_choices = Freedom to make life choices 

generosity = Generosity 

corruption = Perceptions of corruption 

pos_affect = Positive affect

neg_affect = Negative affect 

confidence_gov = Confidence in national government 

dem_quality = Democratic Quality

delivery_quality = Delivery Quality

gdp = GDP 

life_exp_birth = Life expectancy birth

life_exp_60 = Life expectancy age 60 

infant_mortality = Infant mortality rate 

neonatal_mortality = Neonatal mortality rate 

u5_mortality = Under-five mortality rate

In [4]:
# Positional rename: the new names are assigned in the frame's existing column
# order, so this list must stay in step with the column order of the CSV.
df.columns = [
    "country",
    "year",
    "happiness_score",
    "social_support",
    "healthy_life_exp_birth",
    "life_choices",
    "generosity",
    "corruption",
    "pos_affect",
    "neg_affect",
    "confidence_gov",
    "dem_quality",
    "delivery_quality",
    "gdp",
    "life_exp_birth",
    "life_exp_60",
    "infant_mortality",
    "neonatal_mortality",
    "u5_mortality",
]

In [5]:
# Show one row to confirm the renamed column headers.
df.head(n=1)

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_birth,life_exp_60,infant_mortality,neonatal_mortality,u5_mortality
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,58.1,15.6,70.8,50.9,98.2


In [6]:
# Mean of the target feature (happiness_score).
df["happiness_score"].mean()

5.424107142857143

In [7]:
# Inspect the data type pandas assigned to each column.
df.dtypes

country                    object
year                        int64
happiness_score           float64
social_support            float64
healthy_life_exp_birth    float64
life_choices              float64
generosity                float64
corruption                float64
pos_affect                float64
neg_affect                float64
confidence_gov            float64
dem_quality               float64
delivery_quality          float64
gdp                       float64
life_exp_birth            float64
life_exp_60               float64
infant_mortality          float64
neonatal_mortality        float64
u5_mortality              float64
dtype: object

In [8]:
# Check pairwise correlations for feature selection.
# "country" is deliberately left out of the selection: it is a string column
# (dtype object) and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0, whereas older versions silently ignored it. Omitting it gives
# the same matrix on every pandas version.
df[[
    "happiness_score",
    "year",
    "social_support",
    "healthy_life_exp_birth",
    "life_choices",
    "generosity",
    "corruption",
    "pos_affect",
    "neg_affect",
    "confidence_gov",
    "dem_quality",
    "delivery_quality",
    "gdp",
    "life_exp_birth",
    "life_exp_60",
    "infant_mortality",
    "neonatal_mortality",
    "u5_mortality",
]].corr()

Unnamed: 0,happiness_score,year,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_birth,life_exp_60,infant_mortality,neonatal_mortality,u5_mortality
happiness_score,1.0,-0.008398,0.677275,0.69358,0.498827,0.183873,-0.410998,0.532431,-0.261297,-0.09573,0.59498,0.670657,0.21017,0.667684,0.698286,-0.595494,-0.604736,-0.573666
year,-0.008398,1.0,-0.001686,0.085714,0.18178,-0.003323,-0.052797,0.00358,0.192507,-0.002864,0.019568,0.005982,0.00575,0.045016,0.040693,-0.06654,-0.061648,-0.068502
social_support,0.677275,-0.001686,1.0,0.589475,0.415969,0.08415,-0.222379,0.456244,-0.365476,-0.149647,0.541915,0.549914,0.157553,0.570871,0.559523,-0.605337,-0.629924,-0.579524
healthy_life_exp_birth,0.69358,0.085714,0.589475,1.0,0.335576,0.054289,-0.314577,0.302134,-0.120397,-0.199992,0.621891,0.73679,0.232309,0.918687,0.827421,-0.87591,-0.834034,-0.866297
life_choices,0.498827,0.18178,0.415969,0.335576,1.0,0.351494,-0.495258,0.623822,-0.290106,0.418441,0.419629,0.45827,0.142719,0.328524,0.367235,-0.290917,-0.307664,-0.285404
generosity,0.183873,-0.003323,0.08415,0.054289,0.351494,1.0,-0.291499,0.36624,-0.097922,0.276602,0.115362,0.196835,0.048086,0.030423,0.086943,0.040542,0.043253,0.03823
corruption,-0.410998,-0.052797,-0.222379,-0.314577,-0.495258,-0.291499,1.0,-0.294326,0.248384,-0.449852,-0.285157,-0.498382,-0.077981,-0.307606,-0.348312,0.222328,0.230193,0.205088
pos_affect,0.532431,0.00358,0.456244,0.302134,0.623822,0.36624,-0.294326,1.0,-0.385255,0.161131,0.378571,0.3669,0.195737,0.304713,0.400902,-0.261728,-0.255344,-0.255688
neg_affect,-0.261297,0.192507,-0.365476,-0.120397,-0.290106,-0.097922,0.248384,-0.385255,1.0,-0.167158,-0.238025,-0.249651,-0.098236,-0.085052,-0.093485,0.086652,0.082621,0.074034
confidence_gov,-0.09573,-0.002864,-0.149647,-0.199992,0.418441,0.276602,-0.449852,0.161131,-0.167158,1.0,-0.165711,-0.074368,-0.077217,-0.187935,-0.192838,0.215414,0.218006,0.184351


### Remove a single feature from feature pairs with over 90% correlation

Feature pairs with an absolute correlation above 0.90 carry essentially the same information. From each such pair
I have decided to remove the feature with the weaker (absolute) correlation with the target feature.

Feature correlation: 0.92 life_exp_birth vs healthy_life_exp_birth - drop life_exp_birth

Feature correlation: 0.99 u5_mortality vs infant_mortality - drop u5_mortality

Feature correlation: 0.96 neonatal_mortality vs infant_mortality - drop neonatal_mortality

Feature correlation: -0.92 u5_mortality vs life_exp_birth - both features have already been dropped because of their correlation with other descriptive features.

In [9]:
# Remove life_exp_birth in place; healthy_life_exp_birth is the feature kept from this pair.
df.drop(columns=['life_exp_birth'], inplace=True)

In [10]:
# Remove u5_mortality in place; infant_mortality is the feature kept from this pair.
df.drop(columns=['u5_mortality'], inplace=True)

In [11]:
# Remove neonatal_mortality in place; infant_mortality is the feature kept from this pair.
df.drop(columns=['neonatal_mortality'], inplace=True)

In [12]:
# Confirm the three highly correlated columns are no longer present.
df.head(n=3)

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_60,infant_mortality
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,15.6,70.8
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,15.7,68.2
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,15.7,65.7


### Feature Selection

I have decided to select features whose absolute correlation with the target feature is greater than 0.50. This threshold is arbitrary, but the selected features should be strong predictors of the happiness score.

Selected Features:

social_support   (target  corr: 0.677275)

healthy_life_exp_birth   (target  corr: 0.693580)

pos_affect   (target  corr: 0.532431)

dem_quality   (target  corr: 0.594980)

delivery_quality   (target  corr: 0.670657)

life_exp_60   (target  corr: 0.698286)

infant_mortality   (target  corr: -0.595494)

In [13]:
# Correlation matrix restricted to the target and the seven selected features.
df.loc[:, [
    "happiness_score",
    "social_support",
    "healthy_life_exp_birth",
    "pos_affect",
    "dem_quality",
    "delivery_quality",
    "life_exp_60",
    "infant_mortality",
]].corr()

Unnamed: 0,happiness_score,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
happiness_score,1.0,0.677275,0.69358,0.532431,0.59498,0.670657,0.698286,-0.595494
social_support,0.677275,1.0,0.589475,0.456244,0.541915,0.549914,0.559523,-0.605337
healthy_life_exp_birth,0.69358,0.589475,1.0,0.302134,0.621891,0.73679,0.827421,-0.87591
pos_affect,0.532431,0.456244,0.302134,1.0,0.378571,0.3669,0.400902,-0.261728
dem_quality,0.59498,0.541915,0.621891,0.378571,1.0,0.86675,0.635915,-0.559324
delivery_quality,0.670657,0.549914,0.73679,0.3669,0.86675,1.0,0.710327,-0.641283
life_exp_60,0.698286,0.559523,0.827421,0.400902,0.635915,0.710327,1.0,-0.784651
infant_mortality,-0.595494,-0.605337,-0.87591,-0.261728,-0.559324,-0.641283,-0.784651,1.0


### Training the model

This section trains the model on a linear relationship between the descriptive features and the target feature. The data set is split 70/30 into a training set and a test set. The purpose of this is to fit the model to the training set and then evaluate that model on the test set. Holding out a test set makes it possible to detect over- or under-fitting, which cannot be assessed when 100% of the data is used during fitting.

In [14]:
# Training set: the leading 70% of rows, taken in their existing order (no shuffling).
training_size = int(0.7 * df.shape[0])
df_train = df.iloc[:training_size]
print("Training set size (rows):", df_train.shape[0])

Training set size (rows): 940


In [15]:
# Test set: every row after the 70% cut-off, i.e. the trailing 30% of the frame.
# The cut-off is recomputed here so the cell does not rely on another cell's variable.
training_size = int(0.7 * df.shape[0])
df_test = df.iloc[training_size:]
print("Test set:", df_test.shape[0])

Test set: 404


In [16]:
# An independent copy of df for full cross-validation purposes.
# .copy() is required: plain assignment (df_cross = df) only binds a second
# name to the same DataFrame object, so any later in-place change to df
# would silently show up in df_cross as well.
df_cross = df.copy()

In [17]:
# Fit an ordinary least squares model of happiness_score on the seven selected
# continuous features, then print the fitted intercept and feature weights.
# NOTE(review): the model is fitted on the full `df`, not on `df_train` —
# confirm this is intended.
lm = sm.ols(
    formula="happiness_score ~ social_support + healthy_life_exp_birth + pos_affect + dem_quality + delivery_quality + life_exp_60 + infant_mortality",
    data=df,
).fit()
print(lm.params)

Intercept                -3.664816
social_support            2.951729
healthy_life_exp_birth    0.049486
pos_affect                2.124284
dem_quality              -0.086974
delivery_quality          0.251553
life_exp_60               0.092144
infant_mortality          0.011810
dtype: float64


### Table with feature weights

In [18]:
# Print the full OLS results table (coefficients, standard errors, p-values, R-squared).
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_score   R-squared:                       0.684
Model:                            OLS   Adj. R-squared:                  0.682
Method:                 Least Squares   F-statistic:                     412.6
Date:                Mon, 23 Apr 2018   Prob (F-statistic):               0.00
Time:                        16:40:17   Log-Likelihood:                -1312.3
No. Observations:                1343   AIC:                             2641.
Df Residuals:                    1335   BIC:                             2682.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -3

### Perform normalisation of the features

Range normalisation rescales every feature to the interval [0, 1], which puts the features on a common scale and makes the coefficients of different features directly comparable.

In [23]:
# Keep only the target and the seven selected features for rescaling.
df_feat = df[[
    'happiness_score',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]]

In [24]:
# Per-column minimum, used as the lower bound for range normalisation.
df_feat.min(axis=0)

happiness_score            3.000000
social_support             0.290184
healthy_life_exp_birth    39.351990
pos_affect                 0.362498
dem_quality               -2.448228
delivery_quality          -2.144974
life_exp_60               10.300000
infant_mortality           1.600000
dtype: float64

In [25]:
# Per-column maximum, used as the upper bound for range normalisation.
df_feat.max(axis=0)

happiness_score             8.000000
social_support              0.987343
healthy_life_exp_birth     76.536362
pos_affect                  0.943621
dem_quality                 1.540097
delivery_quality            2.121312
life_exp_60                26.100000
infant_mortality          116.200000
dtype: float64

In [33]:
# Range-normalise every column: subtract the column minimum, then divide by the
# column range (max - min), so each column ends up in [0, 1].
df_norm = df_feat.sub(df_feat.min()).div(df_feat.max() - df_feat.min())
df_norm.head(n=10)

Unnamed: 0,happiness_score,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.2,0.230189,0.265103,0.266965,0.130014,0.114828,0.335443,0.603839
1,0.2,0.375989,0.276257,0.381035,0.10133,0.11953,0.341772,0.581152
2,0.4,0.357007,0.286598,0.440127,0.114439,0.123714,0.341772,0.559337
3,0.2,0.331229,0.296235,0.428291,0.13269,0.123937,0.348101,0.538394
4,0.2,0.330559,0.305431,0.598646,0.151751,0.173663,0.348101,0.518325
5,0.2,0.277365,0.314406,0.444118,0.142546,0.173907,0.35443,0.499127
6,0.0,0.337633,0.323215,0.29115,0.169237,0.195128,0.35443,0.481675
7,0.2,0.341978,0.331901,0.328769,0.151408,0.200029,0.360759,0.465096
8,0.2,0.38569,0.340588,0.348386,0.133022,0.16699,0.348101,0.450262
9,0.0,0.287877,0.349274,0.230331,0.136271,0.154636,0.348101,0.521912


### Train Linear Model on normalised columns

The values of each column have been normalised, so we can now train a model on them and compare the coefficients with one another.

In [34]:
# Refit the same OLS formula on the range-normalised frame and print the weights.
lm_df_norm = sm.ols(
    formula="happiness_score ~ social_support + healthy_life_exp_birth + pos_affect + dem_quality + delivery_quality + life_exp_60 + infant_mortality",
    data=df_norm,
).fit()
print(lm_df_norm.params)

Intercept                -0.489907
social_support            0.411565
healthy_life_exp_birth    0.368018
pos_affect                0.246894
dem_quality              -0.069376
delivery_quality          0.214639
life_exp_60               0.291175
infant_mortality          0.270681
dtype: float64


In [35]:
# Print the full OLS results table for the model fitted on range-normalised data.
print(lm_df_norm.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_score   R-squared:                       0.684
Model:                            OLS   Adj. R-squared:                  0.682
Method:                 Least Squares   F-statistic:                     412.6
Date:                Mon, 23 Apr 2018   Prob (F-statistic):               0.00
Time:                        16:58:01   Log-Likelihood:                 849.17
No. Observations:                1343   AIC:                            -1682.
Df Residuals:                    1335   BIC:                            -1641.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0

### Standardisation of features 

In [36]:
# Standardise every column: subtract the column mean, then divide by the
# column standard deviation.
df_st = df_feat.sub(df_feat.mean()).div(df_feat.std())
df_st.head(n=10)

Unnamed: 0,happiness_score,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,-1.244627,-2.970453,-1.703022,-1.742407,-2.051695,-1.693464,-1.447391,2.195478
1,-1.244627,-2.124199,-1.64979,-1.132641,-2.182721,-1.67293,-1.411822,2.072839
2,-0.370657,-2.234372,-1.600439,-0.81676,-2.122842,-1.654659,-1.411822,1.954917
3,-1.244627,-2.383995,-1.55445,-0.88003,-2.039473,-1.653682,-1.376254,1.841712
4,-1.244627,-2.387882,-1.510561,0.030615,-1.952405,-1.436521,-1.376254,1.733223
5,-1.244627,-2.696631,-1.467732,-0.795425,-1.994452,-1.435454,-1.340685,1.629452
6,-2.118597,-2.346823,-1.425694,-1.613126,-1.872532,-1.34278,-1.340685,1.535114
7,-1.244627,-2.321606,-1.384239,-1.412028,-1.953971,-1.321377,-1.305117,1.445494
8,-1.244627,-2.067891,-1.342785,-1.307167,-2.037956,-1.465664,-1.376254,1.365307
9,-2.118597,-2.63562,-1.301331,-1.938234,-2.023116,-1.519615,-1.376254,1.752615


In [37]:
# Refit the same OLS formula on the standardised frame and print the results table.
lm_df_st = sm.ols(
    formula="happiness_score ~ social_support + healthy_life_exp_birth + pos_affect + dem_quality + delivery_quality + life_exp_60 + infant_mortality",
    data=df_st,
).fit()
print(lm_df_st.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_score   R-squared:                       0.684
Model:                            OLS   Adj. R-squared:                  0.682
Method:                 Least Squares   F-statistic:                     412.6
Date:                Mon, 23 Apr 2018   Prob (F-statistic):               0.00
Time:                        16:58:17   Log-Likelihood:                -1131.4
No. Observations:                1343   AIC:                             2279.
Df Residuals:                    1335   BIC:                             2320.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0

### Save updated dataframe to csv

In [38]:
# Persist the frame (with the three highly correlated columns removed) without
# writing the row index to the file.
df.to_csv(path_or_buf='processed_data/happiness_data_alan.csv', index=False)