## Linear Regression (Alan)

The purpose of this notebook is to train the linear regression model on the clean merged data set to produce a set
of weights, one for each feature. Those weights will then be applied to the actual values plus an intercept to predict
the happiness score.

This notebook uses the Python packages pandas, numpy, matplotlib, and statsmodels.

In [33]:
# Imports.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Import statsmodels package for training a linear regression model.
import statsmodels.formula.api as sm

# Allows plots to appear directly in the notebook.
%matplotlib inline

### Read data from csv

In [34]:
# Load the cleaned, merged happiness data set from the processed_data folder.
df = pd.read_csv(filepath_or_buffer='processed_data/cleaned_merged_happiness_data.csv')

In [35]:
# Preview the first three rows of the loaded data.
df.head(n=3)

Unnamed: 0,country,year,Happiness Score,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,GDP,Life expectancy birth,Life expectancy age 60,Infant mortality rate,Neonatal mortality rate,Under-five mortality rate
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,58.1,15.6,70.8,50.9,98.2
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,58.6,15.7,68.2,49.4,94.1
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,58.8,15.7,65.7,48.0,90.2


### Changed column headers to remove spaces/capital letters and abbreviate descriptions 

All changes can be viewed below:

country = country

year = year 

happiness_score = Happiness Score 

social_support = Social support 

healthy_life_exp_birth = Healthy life expectancy at birth

life_choices = Freedom to make life choices 

generosity = Generosity 

corruption = Perceptions of corruption 

pos_affect = Positive affect

neg_affect = Negative affect 

confidence_gov = Confidence in national government 

dem_quality = Democratic Quality 

delivery_quality = Delivery Quality

gdp = GDP 

life_exp_birth = Life expectancy birth

life_exp_60 = Life expectancy age 60 

infant_mortality = Infant mortality rate 

neonatal_mortality = Neonatal mortality rate 

u5_mortality = Under-five mortality rate

In [36]:
# Replace the headers with short snake_case names. The assignment is positional,
# so the order of this list must match the column order of the loaded file.
df.columns = [
    "country",
    "year",
    "happiness_score",
    "social_support",
    "healthy_life_exp_birth",
    "life_choices",
    "generosity",
    "corruption",
    "pos_affect",
    "neg_affect",
    "confidence_gov",
    "dem_quality",
    "delivery_quality",
    "gdp",
    "life_exp_birth",
    "life_exp_60",
    "infant_mortality",
    "neonatal_mortality",
    "u5_mortality",
]

In [37]:
# Show a single row to confirm the new column headers were applied.
df.head(n=1)

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_birth,life_exp_60,infant_mortality,neonatal_mortality,u5_mortality
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,58.1,15.6,70.8,50.9,98.2


### Transform happiness_score to a binary class & insert as happiness_class

In [38]:
# Binary target: 1.0 when happiness_score is strictly greater than 5, otherwise 0.0.
happiness_class = (df['happiness_score'] > 5) * 1.0

# Kept as a standalone one-column frame for any code that still refers to it.
df_happiness_class = pd.DataFrame({'happiness_class': happiness_class})

# assign() overwrites an existing happiness_class column instead of appending a
# second one, so re-running this cell no longer produces duplicate columns
# (which pd.concat(..., axis=1) did on every re-run).
df = df.assign(happiness_class=happiness_class)

In [39]:
# Preview the first ten rows, now including the happiness_class column.
df.head(n=10)

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_birth,life_exp_60,infant_mortality,neonatal_mortality,u5_mortality,happiness_class
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,58.1,15.6,70.8,50.9,98.2,0.0
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,58.6,15.7,68.2,49.4,94.1,0.0
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,58.8,15.7,65.7,48.0,90.2,0.0
3,Afghanistan,2011,4.0,0.521104,50.367298,0.495901,0.175329,0.731109,0.611387,0.267175,0.307386,-1.919018,-1.616221,17.89,59.2,15.8,63.3,46.5,86.4,0.0
4,Afghanistan,2012,4.0,0.520637,50.709263,0.530935,0.247159,0.77562,0.710385,0.267919,0.43544,-1.842996,-1.404078,20.293,59.5,15.8,61.0,45.1,82.8,0.0
5,Afghanistan,2013,4.0,0.483552,51.04298,0.577955,0.074735,0.823204,0.620585,0.273328,0.482847,-1.879709,-1.403036,20.17,59.9,15.9,58.8,43.7,79.3,0.0
6,Afghanistan,2014,3.0,0.525568,51.370525,0.508514,0.118579,0.871242,0.531691,0.374861,0.409048,-1.773257,-1.312503,20.616,59.9,15.9,56.8,42.4,76.1,0.0
7,Afghanistan,2015,4.0,0.528597,51.693527,0.388928,0.094686,0.880638,0.553553,0.339276,0.260557,-1.844364,-1.291594,20.079,60.5,16.0,54.9,41.1,73.2,0.0
8,Afghanistan,2016,4.0,0.559072,52.016529,0.522566,0.057072,0.793246,0.564953,0.348332,0.32499,-1.917693,-1.432548,19.454,59.3125,15.8,53.2,40.0,70.4,0.0
9,Afghanistan,2017,3.0,0.49088,52.339527,0.427011,-0.10634,0.954393,0.496349,0.371326,0.261179,-1.904737,-1.485251,20.889,59.3125,15.8,61.411111,45.233333,83.411111,0.0


In [40]:
# Mean of the binary target, i.e. the share of rows labelled 1.0.
df['happiness_class'].mean()

0.453125

In [41]:
# Inspect the dtype of every column.
df.dtypes

country                    object
year                        int64
happiness_score           float64
social_support            float64
healthy_life_exp_birth    float64
life_choices              float64
generosity                float64
corruption                float64
pos_affect                float64
neg_affect                float64
confidence_gov            float64
dem_quality               float64
delivery_quality          float64
gdp                       float64
life_exp_birth            float64
life_exp_60               float64
infant_mortality          float64
neonatal_mortality        float64
u5_mortality              float64
happiness_class           float64
dtype: object

In [42]:
# check correlation for feature selection
# "country" is a text column and is deliberately left out of the selection:
# older pandas silently ignored it in corr(), but pandas 2.x raises an error when
# asked to correlate non-numeric data. The resulting matrix is the same.
df[[
    "happiness_class",
    "year",
    "social_support",
    "healthy_life_exp_birth",
    "life_choices",
    "generosity",
    "corruption",
    "pos_affect",
    "neg_affect",
    "confidence_gov",
    "dem_quality",
    "delivery_quality",
    "gdp",
    "life_exp_birth",
    "life_exp_60",
    "infant_mortality",
    "neonatal_mortality",
    "u5_mortality",
]].corr()

Unnamed: 0,happiness_class,year,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_birth,life_exp_60,infant_mortality,neonatal_mortality,u5_mortality
happiness_class,1.0,0.054499,0.601155,0.61053,0.449123,0.128654,-0.315596,0.485218,-0.206525,-0.124023,0.509799,0.559981,0.170105,0.585935,0.631818,-0.530871,-0.546274,-0.507856
year,0.054499,1.0,-0.001686,0.085714,0.18178,-0.003323,-0.052797,0.00358,0.192507,-0.002864,0.019568,0.005982,0.00575,0.045016,0.040693,-0.06654,-0.061648,-0.068502
social_support,0.601155,-0.001686,1.0,0.589475,0.415969,0.08415,-0.222379,0.456244,-0.365476,-0.149647,0.541915,0.549914,0.157553,0.570871,0.559523,-0.605337,-0.629924,-0.579524
healthy_life_exp_birth,0.61053,0.085714,0.589475,1.0,0.335576,0.054289,-0.314577,0.302134,-0.120397,-0.199992,0.621891,0.73679,0.232309,0.918687,0.827421,-0.87591,-0.834034,-0.866297
life_choices,0.449123,0.18178,0.415969,0.335576,1.0,0.351494,-0.495258,0.623822,-0.290106,0.418441,0.419629,0.45827,0.142719,0.328524,0.367235,-0.290917,-0.307664,-0.285404
generosity,0.128654,-0.003323,0.08415,0.054289,0.351494,1.0,-0.291499,0.36624,-0.097922,0.276602,0.115362,0.196835,0.048086,0.030423,0.086943,0.040542,0.043253,0.03823
corruption,-0.315596,-0.052797,-0.222379,-0.314577,-0.495258,-0.291499,1.0,-0.294326,0.248384,-0.449852,-0.285157,-0.498382,-0.077981,-0.307606,-0.348312,0.222328,0.230193,0.205088
pos_affect,0.485218,0.00358,0.456244,0.302134,0.623822,0.36624,-0.294326,1.0,-0.385255,0.161131,0.378571,0.3669,0.195737,0.304713,0.400902,-0.261728,-0.255344,-0.255688
neg_affect,-0.206525,0.192507,-0.365476,-0.120397,-0.290106,-0.097922,0.248384,-0.385255,1.0,-0.167158,-0.238025,-0.249651,-0.098236,-0.085052,-0.093485,0.086652,0.082621,0.074034
confidence_gov,-0.124023,-0.002864,-0.149647,-0.199992,0.418441,0.276602,-0.449852,0.161131,-0.167158,1.0,-0.165711,-0.074368,-0.077217,-0.187935,-0.192838,0.215414,0.218006,0.184351


### Remove a single feature from feature pairs with over 90% correlation

Feature pairs with a correlation above 0.90 carry essentially the same information. From each such pair I have decided
to remove the feature whose correlation with the target feature (whether positive or negative) is the weaker of the two.

Feature correlation: 0.92 life_exp_birth vs healthy_life_exp_birth - drop life_exp_birth

Feature correlation: 0.99 u5_mortality vs infant_mortality - drop u5_mortality

Feature correlation: 0.96 neonatal_mortality vs infant_mortality - drop neonatal_mortality

Feature correlation -0.92 u5_mortality vs life_exp_birth: both features dropped already due to correlation with other descriptive features.

In [43]:
# Remove life_exp_birth (highly correlated with healthy_life_exp_birth).
df = df.drop(labels='life_exp_birth', axis='columns')

In [44]:
# Remove u5_mortality (highly correlated with infant_mortality).
df = df.drop(labels='u5_mortality', axis='columns')

In [45]:
# Remove neonatal_mortality (highly correlated with infant_mortality).
df = df.drop(labels='neonatal_mortality', axis='columns')

In [46]:
# Confirm the three redundant columns are gone.
df.head(n=3)

Unnamed: 0,country,year,happiness_score,social_support,healthy_life_exp_birth,life_choices,generosity,corruption,pos_affect,neg_affect,confidence_gov,dem_quality,delivery_quality,gdp,life_exp_60,infant_mortality,happiness_class
0,Afghanistan,2008,4.0,0.450662,49.209663,0.718114,0.181819,0.881686,0.517637,0.258195,0.612072,-1.92969,-1.655084,10.297,15.6,70.8,0.0
1,Afghanistan,2009,4.0,0.552308,49.624432,0.678896,0.203614,0.850035,0.583926,0.237092,0.611545,-2.044093,-1.635025,12.066,15.7,68.2,0.0
2,Afghanistan,2010,5.0,0.539075,50.008961,0.600127,0.13763,0.706766,0.618265,0.275324,0.299357,-1.99181,-1.617176,15.325,15.7,65.7,0.0


### Feature Selection

I have selected the features whose correlation with the target has an absolute value greater than 0.50. This threshold is arbitrary, but the selected features should be strong indicators for predicting the happiness score.

Selected Features:

social_support   (target  corr: 0.677275)

healthy_life_exp_birth   (target  corr: 0.693580)

pos_affect   (target  corr: 0.532431)

dem_quality   (target  corr: 0.594980)

delivery_quality   (target  corr: 0.670657)

life_exp_60   (target  corr: 0.698286)

infant_mortality   (target  corr: -0.595494)

In [47]:
# Correlation matrix restricted to the target and the seven selected features.
df[[
    "happiness_class",
    "social_support",
    "healthy_life_exp_birth",
    "pos_affect",
    "dem_quality",
    "delivery_quality",
    "life_exp_60",
    "infant_mortality",
]].corr()

Unnamed: 0,happiness_class,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
happiness_class,1.0,0.601155,0.61053,0.485218,0.509799,0.559981,0.631818,-0.530871
social_support,0.601155,1.0,0.589475,0.456244,0.541915,0.549914,0.559523,-0.605337
healthy_life_exp_birth,0.61053,0.589475,1.0,0.302134,0.621891,0.73679,0.827421,-0.87591
pos_affect,0.485218,0.456244,0.302134,1.0,0.378571,0.3669,0.400902,-0.261728
dem_quality,0.509799,0.541915,0.621891,0.378571,1.0,0.86675,0.635915,-0.559324
delivery_quality,0.559981,0.549914,0.73679,0.3669,0.86675,1.0,0.710327,-0.641283
life_exp_60,0.631818,0.559523,0.827421,0.400902,0.635915,0.710327,1.0,-0.784651
infant_mortality,-0.530871,-0.605337,-0.87591,-0.261728,-0.559324,-0.641283,-0.784651,1.0


### Training the model

This section fits a linear model relating the descriptive features to the target feature. The data set is split 70/30 into a training set and a test set: the model is fitted to the training set and then evaluated on the test set. Holding out a test set makes it possible to detect over- or under-fitting, which would go unnoticed if 100% of the data were used for fitting.

In [48]:
# Training set: the leading 70% of rows, taken in file order (no shuffling).
# NOTE(review): the previews earlier in the notebook show rows grouped by country,
# so this is a positional split rather than a random one - confirm that is intended.
training_size = int(0.7 * len(df))
df_train = df.iloc[:training_size]
print("Training set size (rows):", df_train.shape[0])

Training set size (rows): 940


In [49]:
# Test set: every row after the first 70%, i.e. the rows not used for training.
# The cut-off is recomputed here so the cell does not depend on an earlier variable.
training_size = int(0.7 * len(df))
df_test = df.iloc[training_size:]
print("Test set:", df_test.shape[0])

Test set: 404


In [50]:
# An independent copy of df for full cross validation purposes.
# Plain assignment (df_cross = df) would only create a second name for the same
# object, so later in-place changes to either frame would show up in both.
df_cross = df.copy()

In [51]:
# Fit an ordinary least squares model of happiness_class on the seven selected
# features, using only the training rows, then show the fitted weights.
lm = sm.ols(
    formula=(
        "happiness_class ~ social_support + healthy_life_exp_birth + pos_affect"
        " + dem_quality + delivery_quality + life_exp_60 + infant_mortality"
    ),
    data=df_train,
).fit()
print(lm.params)

Intercept                -2.632553
social_support            1.119994
healthy_life_exp_birth    0.003332
pos_affect                0.862037
dem_quality              -0.004743
delivery_quality          0.027721
life_exp_60               0.066706
infant_mortality          0.001483
dtype: float64


### Table with feature weights

In [52]:
# Print the full regression summary table for the fitted model.
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_class   R-squared:                       0.554
Model:                            OLS   Adj. R-squared:                  0.551
Method:                 Least Squares   F-statistic:                     165.4
Date:                Wed, 25 Apr 2018   Prob (F-statistic):          1.44e-158
Time:                        23:20:14   Log-Likelihood:                -294.28
No. Observations:                 939   AIC:                             604.6
Df Residuals:                     931   BIC:                             643.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -2

### Test on df_train

Note: the model is also evaluated on df_test below to check for overfitting.

In [53]:
# Predicted values of the fitted model for the training rows.
lm.predict(df_train)

0     -0.408732
1     -0.232450
2     -0.219848
3     -0.241918
4     -0.153852
5     -0.268075
6     -0.297515
7     -0.269431
8     -0.243819
9     -0.367602
10     0.465208
11     0.374502
12     0.399612
13     0.421527
14     0.429545
15     0.333446
16     0.359793
17     0.318312
18     0.313103
19     0.461617
20     0.416454
21     0.495302
22     0.503589
23     0.443634
24     0.496160
25     0.023326
26    -0.032164
27     0.020028
28    -0.007236
29     0.703999
         ...   
910    0.161038
911    0.219081
912    0.384905
913    0.253650
914    0.331678
915    0.288719
916    0.280308
917    0.295980
918    0.352786
919    0.363986
920    0.918136
921    0.963848
922    0.959937
923    0.947490
924    0.949691
925    0.954430
926    0.890042
927    0.899826
928    0.920607
929    0.936850
930    0.695020
931    0.694423
932    0.709275
933    0.654515
934    0.753109
935    0.837230
936    0.880944
937    0.769544
938    0.832615
939    0.529840
Length: 940, dtype: floa

### Actual Happiness Score vs Predicted Happiness

In [54]:
# Side-by-side table of the actual class and the model's prediction (training rows).
train_predictions = lm.predict(df_train)
predict_df_train = pd.DataFrame({
    'ActualHappiness': df_train['happiness_class'],
    'PredictedHappiness': train_predictions,
})
predict_df_train.head(n=10)

Unnamed: 0,ActualHappiness,PredictedHappiness
0,0.0,-0.408732
1,0.0,-0.23245
2,0.0,-0.219848
3,0.0,-0.241918
4,0.0,-0.153852
5,0.0,-0.268075
6,0.0,-0.297515
7,0.0,-0.269431
8,0.0,-0.243819
9,0.0,-0.367602


### Actual Happiness Score minus Predicted Score

In [55]:
# Residuals on the training rows: actual class minus predicted value,
# printed as-is and squared.
train_residuals = df_train['happiness_class'] - lm.predict(df_train)
print("Actual minus Predicted:\n", train_residuals)
print("\n(Actual minus Predicted) squared:\n", train_residuals ** 2)

Actual minus Predicted:
 0      0.408732
1      0.232450
2      0.219848
3      0.241918
4      0.153852
5      0.268075
6      0.297515
7      0.269431
8      0.243819
9      0.367602
10    -0.465208
11    -0.374502
12     0.600388
13     0.578473
14    -0.429545
15    -0.333446
16    -0.359793
17    -0.318312
18    -0.313103
19    -0.461617
20    -0.416454
21     0.504698
22     0.496411
23    -0.443634
24    -0.496160
25     0.976674
26     0.032164
27    -0.020028
28     0.007236
29     0.296001
         ...   
910   -0.161038
911   -0.219081
912   -0.384905
913   -0.253650
914   -0.331678
915   -0.288719
916   -0.280308
917   -0.295980
918   -0.352786
919   -0.363986
920    0.081864
921    0.036152
922    0.040063
923    0.052510
924    0.050309
925    0.045570
926    0.109958
927    0.100174
928    0.079393
929    0.063150
930    0.304980
931    0.305577
932    0.290725
933    0.345485
934    0.246891
935    0.162770
936   -0.880944
937    0.230456
938    0.167385
939   -0.529840

### Mean Squared Error

Performed on the df_train

In [56]:
# Mean squared error on the training rows: average of the squared residuals.
train_squared_errors = (df_train['happiness_class'] - lm.predict(df_train)).pow(2)
mse = train_squared_errors.mean()
print("\nMean Squared Error:", mse)


Mean Squared Error: 0.10958282655354191


### Mean Absolute Error

Performed on df_train

In [57]:
# Mean absolute error on the training rows: average size of the residuals.
train_abs_errors = (df_train['happiness_class'] - lm.predict(df_train)).abs()
mae = train_abs_errors.mean()
print("\nMean Absolute Error:", mae)


Mean Absolute Error: 0.25869800243637747


### Test on df_test

In [58]:
# Side-by-side table of the actual class and the model's prediction (test rows).
test_predictions = lm.predict(df_test)
predict_df_test = pd.DataFrame({
    'ActualHappiness': df_test['happiness_class'],
    'PredictedHappiness': test_predictions,
})
predict_df_test.head(n=10)

Unnamed: 0,ActualHappiness,PredictedHappiness
940,1.0,0.582157
941,1.0,0.575416
942,1.0,0.528869
943,1.0,0.530138
944,1.0,0.597675
945,1.0,0.603385
946,1.0,0.581966
947,1.0,0.618578
948,1.0,0.626032
949,0.0,0.294223


### Mean Squared Error

Performed on the df_test

In [59]:
# Mean squared error on the held-out test rows.
test_squared_errors = (df_test['happiness_class'] - lm.predict(df_test)).pow(2)
mse = test_squared_errors.mean()
print("\nMean Squared Error:", mse)


Mean Squared Error: 0.13626133163804768


### Mean Absolute Error

Performed on the df_test

In [60]:
# Mean absolute error on the held-out test rows.
test_abs_errors = (df_test['happiness_class'] - lm.predict(df_test)).abs()
mae = test_abs_errors.mean()
print("\nMean Absolute Error:", mae)


Mean Absolute Error: 0.3209654713549332


### Comparison of results between train & test data set.

In [61]:
# Compute the residuals once per data set, then report MSE and MAE for both so the
# training and test errors can be compared directly.
train_residuals = df_train['happiness_class'] - lm.predict(df_train)
test_residuals = df_test['happiness_class'] - lm.predict(df_test)

mse = train_residuals.pow(2).mean()
print("\nTRAIN DATA SET - Mean Squared Error:", mse)

mse = test_residuals.pow(2).mean()
print("\nTEST DATA SET - Mean Squared Error:", mse)

mae = train_residuals.abs().mean()
print("\nTRAIN DATA SET - Mean Absolute Error:", mae)

mae = test_residuals.abs().mean()
print("\nTEST DATA SET - Mean Absolute Error:", mae)


TRAIN DATA SET - Mean Squared Error: 0.10958282655354191

TEST DATA SET - Mean Squared Error: 0.13626133163804768

TRAIN DATA SET - Mean Absolute Error: 0.25869800243637747

TEST DATA SET - Mean Absolute Error: 0.3209654713549332


### Perform normalisation of the features

Normalising puts every feature on the same 0–1 scale, which makes the coefficients of different features easier to compare.

In [62]:
# Keep only the target and the seven selected features from the training rows.
df_feat = df_train[[
    'happiness_class',
    'social_support',
    'healthy_life_exp_birth',
    'pos_affect',
    'dem_quality',
    'delivery_quality',
    'life_exp_60',
    'infant_mortality',
]]

In [63]:
# Smallest value in each column.
df_feat.min(axis=0)

happiness_class            0.000000
social_support             0.290184
healthy_life_exp_birth    40.076595
pos_affect                 0.422928
dem_quality               -2.044093
delivery_quality          -1.900852
life_exp_60               10.300000
infant_mortality           1.600000
dtype: float64

In [64]:
# Largest value in each column.
df_feat.max(axis=0)

happiness_class             1.000000
social_support              0.987343
healthy_life_exp_birth     76.536362
pos_affect                  0.943621
dem_quality                 1.540097
delivery_quality            2.121312
life_exp_60                26.100000
infant_mortality          101.300000
dtype: float64

In [65]:
# Range (min-max) normalise every column: subtract the column minimum and divide
# by the column range, so each column runs from 0 to 1.
feat_min = df_feat.min()
feat_range = df_feat.max() - feat_min
df_norm = (df_feat - feat_min) / feat_range
df_norm.head(n=10)

Unnamed: 0,happiness_class,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.0,0.230189,0.250497,0.181891,0.031919,0.061103,0.335443,0.694082
1,0.0,0.375989,0.261873,0.309199,0.0,0.066091,0.341772,0.668004
2,0.0,0.357007,0.27242,0.37515,0.014587,0.070528,0.341772,0.642929
3,0.0,0.331229,0.282248,0.36194,0.034896,0.070766,0.348101,0.618857
4,0.0,0.330559,0.291627,0.552066,0.056107,0.123509,0.348101,0.595787
5,0.0,0.277365,0.30078,0.379604,0.045864,0.123768,0.35443,0.573721
6,0.0,0.337633,0.309764,0.208883,0.075564,0.146277,0.35443,0.553661
7,0.0,0.341978,0.318623,0.250869,0.055725,0.151475,0.360759,0.534604
8,0.0,0.38569,0.327482,0.272762,0.035266,0.116431,0.348101,0.517553
9,0.0,0.287877,0.336341,0.141006,0.038881,0.103328,0.348101,0.599911


### Train Linear Model on normalised columns

The values of each column have been normalised, so we can now train a model on them and compare the coefficients with one another.

In [66]:
# Refit the same model on the range-normalised columns and show the weights.
lm_df_norm = sm.ols(
    formula=(
        "happiness_class ~ social_support + healthy_life_exp_birth + pos_affect"
        " + dem_quality + delivery_quality + life_exp_60 + infant_mortality"
    ),
    data=df_norm,
).fit()
print(lm_df_norm.params)

Intercept                -1.162974
social_support            0.780815
healthy_life_exp_birth    0.121494
pos_affect                0.448857
dem_quality              -0.017000
delivery_quality          0.111498
life_exp_60               1.053960
infant_mortality          0.147845
dtype: float64


In [67]:
# Print the full regression summary table for the model fitted on normalised data.
print(lm_df_norm.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_class   R-squared:                       0.554
Model:                            OLS   Adj. R-squared:                  0.551
Method:                 Least Squares   F-statistic:                     165.4
Date:                Wed, 25 Apr 2018   Prob (F-statistic):          1.44e-158
Time:                        23:20:25   Log-Likelihood:                -294.28
No. Observations:                 939   AIC:                             604.6
Df Residuals:                     931   BIC:                             643.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -1

In [None]:
# Turn the continuous predictions for the training rows into class labels with a
# 0.5 cut-off: below 0.5 -> 0.0, otherwise 1.0.
predictions = lm.predict(df_train)

# Vectorised replacement for the row-by-row loop. A missing prediction (NaN) stays
# NaN instead of being labelled 1.0: in the loop, `NaN < 0.5` is False, so such a
# row fell through to the else branch.
# NOTE(review): the model summary reports 939 observations for 940 training rows,
# so at least one NaN prediction looks likely - confirm.
predicted_classes = (predictions >= 0.5).astype(float).where(predictions.notna())
happiness_prediction_classification = predicted_classes.tolist()

# Show a short preview rather than dumping the whole list into the notebook.
happiness_prediction_classification[:10]

### Standardisation of features 

In [68]:
# Standardise every column: subtract the column mean and divide by the column
# standard deviation.
feat_mean = df_feat.mean()
feat_std = df_feat.std()
df_st = (df_feat - feat_mean) / feat_std
df_st.head(n=10)

Unnamed: 0,happiness_class,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,-0.879069,-2.808808,-1.707488,-1.749264,-2.125722,-1.703961,-1.445899,2.139915
1,-0.879069,-1.988406,-1.65384,-1.135243,-2.259258,-1.683236,-1.411412,2.020281
2,-0.879069,-2.095214,-1.604104,-0.817157,-2.198232,-1.664795,-1.411412,1.905248
3,-0.879069,-2.240266,-1.557756,-0.880868,-2.113266,-1.663808,-1.376926,1.794817
4,-0.879069,-2.244033,-1.513525,0.036133,-2.024529,-1.444625,-1.376926,1.688987
5,-0.879069,-2.543351,-1.470361,-0.795673,-2.067382,-1.443548,-1.342439,1.587758
6,-0.879069,-2.204229,-1.427995,-1.619081,-1.943127,-1.350011,-1.342439,1.495732
7,-0.879069,-2.179783,-1.386217,-1.416579,-2.026126,-1.328408,-1.307952,1.408307
8,-0.879069,-1.933818,-1.344439,-1.310987,-2.11172,-1.474039,-1.376926,1.330085
9,-0.879069,-2.484204,-1.302661,-1.946458,-2.096596,-1.528492,-1.376926,1.707903


In [69]:
# Refit the same model on the standardised columns and print the summary table.
lm_df_st = sm.ols(
    formula=(
        "happiness_class ~ social_support + healthy_life_exp_birth + pos_affect"
        " + dem_quality + delivery_quality + life_exp_60 + infant_mortality"
    ),
    data=df_st,
).fit()
print(lm_df_st.summary())

                            OLS Regression Results                            
Dep. Variable:        happiness_class   R-squared:                       0.554
Model:                            OLS   Adj. R-squared:                  0.551
Method:                 Least Squares   F-statistic:                     165.4
Date:                Wed, 25 Apr 2018   Prob (F-statistic):          1.44e-158
Time:                        23:20:26   Log-Likelihood:                -952.36
No. Observations:                 939   AIC:                             1921.
Df Residuals:                     931   BIC:                             1959.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0

### Save updated dataframe to csv

In [70]:
# Write the updated frame (renamed columns, happiness_class added, redundant
# columns dropped) to disk without the row index.
df.to_csv(path_or_buf='processed_data/happiness_data_alan.csv', index=False)