# Happiness Prediction
- In this project, we will predict which countries or regions of the world will rise or fall in happiness based on factors such as GDP, freedom, and trust in government. We will use 
- How is this data collected? How is the happiness score calculated? 

In [65]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from mpl_toolkits import mplot3d # for scatter plots
%matplotlib inline

# EDA/Prep
- read in from 2015-2019
- put them all in the same dataframe with a year as a label 

In [66]:
df2015 = pd.read_csv('world_happiness/2015.csv')
df2016 = pd.read_csv('world_happiness/2016.csv')
df2017 = pd.read_csv('world_happiness/2017.csv')
df2018 = pd.read_csv('world_happiness/2018.csv')
df2019 = pd.read_csv('world_happiness/2019.csv')
df2020 = pd.read_csv('world_happiness/2020.csv')

df2020.head()
df2019.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


### preprocessing notes
- country / region in 2015 and 2016 can be joined. 2017 has just country 
- 2015/2016 have similar formats
- <b>should check out 2020 report</b>

In [67]:
# to stay loyal to country or region, we only use region if there is no country provided 
# 2017 does not provide a region
print("Number of missing countries in 2015: ", df2015.Country.isna().sum())
print("Number of missing countries in 2016: ", df2016.Country.isna().sum())
df2015 = df2015.drop(columns=['Region'])
df2016 = df2016.drop(columns=['Region'])

Number of missing countries in 2015:  0
Number of missing countries in 2016:  0


In [68]:
# renaming stuff
df2015 = df2015.rename({'Happiness Rank': 'Overall rank', 'Happiness Score': 'Score',
                       'Economy (GDP per Capita)':'GDP per capita', 'Health (Life Expectancy)': 'Healthy life expectancy', 
                       'Trust (Government Corruption)' : 'Perceptions of corruption', 
                        'Country' : 'Country or region'}, axis='columns')
df2016 = df2016.rename({'Happiness Rank': 'Overall rank', 'Happiness Score': 'Score',
                       'Economy (GDP per Capita)':'GDP per capita', 'Health (Life Expectancy)': 'Healthy life expectancy', 
                       'Trust (Government Corruption)' : 'Perceptions of corruption', 
                       'Country' : 'Country or region'}, axis='columns')
df2017 = df2017.rename({'Happiness.Rank': 'Overall rank', 'Happiness.Score' : 'Score', 
                       'Economy..GDP.per.Capita.' : 'GDP per capita', 'Health..Life.Expectancy.' : 'Healthy life expectancy',
                       'Trust..Government.Corruption.': 'Perceptions of corruption', 
                       'Country' : 'Country or region'}, axis='columns')
df2018 = df2018.rename({'Freedom to make life choices': 'Freedom'}, axis='columns')
df2019 = df2019.rename({'Freedom to make life choices': 'Freedom'}, axis='columns')

### see if we can preserve the error metrics from 2015-2017
- <b>2015-2016</b>
    - Standard Error, Dystopia Residual, Lower Confidence Level
- <b>2017</b>
    - Whisker.high, Whisker.low, Dystopia.Residual

In [69]:
# dropping stuff 
df2015 = df2015.drop(columns=['Standard Error', 'Dystopia Residual'])
df2016 = df2016.drop(columns=['Dystopia Residual', 'Lower Confidence Interval', 'Upper Confidence Interval'])
df2017 = df2017.drop(columns=['Whisker.high', 'Whisker.low', 'Dystopia.Residual'])

In [70]:
# check if they're all the same columns 
print("2015 columns: ", df2015.columns)
print("2016 columns: ", df2016.columns)
print("2017 columns: ", df2017.columns)
print("2018 columns: ", df2018.columns) # 2018, 2019 have the same column names
print("2019 columns: ", df2019.columns)

2015 columns:  Index(['Country or region', 'Overall rank', 'Score', 'GDP per capita',
       'Family', 'Healthy life expectancy', 'Freedom',
       'Perceptions of corruption', 'Generosity'],
      dtype='object')
2016 columns:  Index(['Country or region', 'Overall rank', 'Score', 'GDP per capita',
       'Family', 'Healthy life expectancy', 'Freedom',
       'Perceptions of corruption', 'Generosity'],
      dtype='object')
2017 columns:  Index(['Country or region', 'Overall rank', 'Score', 'GDP per capita',
       'Family', 'Healthy life expectancy', 'Freedom', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')
2018 columns:  Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy', 'Freedom', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')
2019 columns:  Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy'

In [71]:
# preprocess 2020 a little differently
# df2020.columns
df2020 = df2020.drop(columns=['Regional indicator',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia', 'Dystopia + residual'])
df2020

Unnamed: 0,Country name,Ladder score,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption
0,Finland,7.8087,1.285190,1.499526,0.961271,0.662317,0.159670,0.477857
1,Denmark,7.6456,1.326949,1.503449,0.979333,0.665040,0.242793,0.495260
2,Switzerland,7.5599,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946
3,Iceland,7.5045,1.326502,1.547567,1.000843,0.661981,0.362330,0.144541
4,Norway,7.4880,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101
...,...,...,...,...,...,...,...,...
148,Central African Republic,3.4759,0.041072,0.000000,0.000000,0.292814,0.253513,0.028265
149,Rwanda,3.3123,0.343243,0.522876,0.572383,0.604088,0.235705,0.485542
150,Zimbabwe,3.2992,0.425564,1.047835,0.375038,0.377405,0.151349,0.080929
151,South Sudan,2.8166,0.289083,0.553279,0.208809,0.065609,0.209935,0.111157


In [72]:
df2020.insert(loc=1, column='Overall Rank', value=np.arange(1, 154)) # rank of 2020 countries 
df2020

Unnamed: 0,Country name,Overall Rank,Ladder score,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption
0,Finland,1,7.8087,1.285190,1.499526,0.961271,0.662317,0.159670,0.477857
1,Denmark,2,7.6456,1.326949,1.503449,0.979333,0.665040,0.242793,0.495260
2,Switzerland,3,7.5599,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946
3,Iceland,4,7.5045,1.326502,1.547567,1.000843,0.661981,0.362330,0.144541
4,Norway,5,7.4880,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101
...,...,...,...,...,...,...,...,...,...
148,Central African Republic,149,3.4759,0.041072,0.000000,0.000000,0.292814,0.253513,0.028265
149,Rwanda,150,3.3123,0.343243,0.522876,0.572383,0.604088,0.235705,0.485542
150,Zimbabwe,151,3.2992,0.425564,1.047835,0.375038,0.377405,0.151349,0.080929
151,South Sudan,152,2.8166,0.289083,0.553279,0.208809,0.065609,0.209935,0.111157


In [77]:
df2020 = df2020.rename({'Country name': 'Country or region', 'Ladder score' : 'Score', 
                                      'Explained by: Log GDP per capita': 'GDP per capita', 
                                      'Explained by: Social support' : 'Social support', 
                                      'Explained by: Healthy life expectancy' : 'Healthy life expectancy', 
                                      'Explained by: Freedom to make life choices': 'Freedom', 
                                      'Explained by: Generosity' : 'Generosity',
                                      'Explained by: Perceptions of corruption' : 'Perceptions of corruption'},
                                      axis='columns')
df2020

Unnamed: 0,Country or region,Overall Rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom,Generosity,Perceptions of corruption
0,Finland,1,7.8087,1.285190,1.499526,0.961271,0.662317,0.159670,0.477857
1,Denmark,2,7.6456,1.326949,1.503449,0.979333,0.665040,0.242793,0.495260
2,Switzerland,3,7.5599,1.390774,1.472403,1.040533,0.628954,0.269056,0.407946
3,Iceland,4,7.5045,1.326502,1.547567,1.000843,0.661981,0.362330,0.144541
4,Norway,5,7.4880,1.424207,1.495173,1.008072,0.670201,0.287985,0.434101
...,...,...,...,...,...,...,...,...,...
148,Central African Republic,149,3.4759,0.041072,0.000000,0.000000,0.292814,0.253513,0.028265
149,Rwanda,150,3.3123,0.343243,0.522876,0.572383,0.604088,0.235705,0.485542
150,Zimbabwe,151,3.2992,0.425564,1.047835,0.375038,0.377405,0.151349,0.080929
151,South Sudan,152,2.8166,0.289083,0.553279,0.208809,0.065609,0.209935,0.111157


In [78]:
df2020.columns

Index(['Country or region', 'Overall Rank', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy', 'Freedom', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')

In [79]:
df2019.columns

Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
       'Social support', 'Healthy life expectancy', 'Freedom', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')

In [91]:
# columns are still out of order
# next, will drop countries that are not in 2020
target_countries = df2020['Country or region'].to_numpy()
c_2015 = df2015['Country or region']
c_2016 = df2016['Country or region']
c_2017 = df2017['Country or region']
c_2018 = df2018['Country or region']
c_2019 = df2019['Country or region']
countries = [c_2015, c_2016, c_2017, c_2018, c_2019]

In [94]:
for c in countries:
    for i in c:
        if i not in target_countries:
            print(i, ' not in 2020 report')
    print("\n")

Oman  not in 2020 report
Qatar  not in 2020 report
Taiwan  not in 2020 report
Suriname  not in 2020 report
Hong Kong  not in 2020 report
Bhutan  not in 2020 report
Somaliland region  not in 2020 report
Sudan  not in 2020 report
Djibouti  not in 2020 report
Angola  not in 2020 report
Syria  not in 2020 report


Puerto Rico  not in 2020 report
Taiwan  not in 2020 report
Qatar  not in 2020 report
Suriname  not in 2020 report
Belize  not in 2020 report
Hong Kong  not in 2020 report
Somalia  not in 2020 report
Bhutan  not in 2020 report
Somaliland Region  not in 2020 report
Sudan  not in 2020 report
Angola  not in 2020 report
Syria  not in 2020 report


Qatar  not in 2020 report
Belize  not in 2020 report
Hong Kong S.A.R., China  not in 2020 report
Somalia  not in 2020 report
Bhutan  not in 2020 report
Sudan  not in 2020 report
Angola  not in 2020 report
Syria  not in 2020 report


Taiwan  not in 2020 report
Qatar  not in 2020 report
Trinidad & Tobago  not in 2020 report
Belize  not in 2020

In [95]:
# drop a row 
df2015 = df2015.drop['Finland']

TypeError: 'method' object is not subscriptable