In [1]:
# Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np

In [2]:
## 2015 Data Cleaning
# Import 2015 World Happiness Data
data_2015 = pd.read_csv('./Resources/2015_data.csv')

# Add column indicating the year
data_2015['Year'] = '2015'

# Drop non-predictive variables
data_2015_drop = data_2015.drop(columns='Standard Error')

# Rearrange the data fields
column_order = ['Year','Country','Region','Happiness Rank','Happiness Score','Economy (GDP per Capita)','Family',
                'Health (Life Expectancy)','Freedom','Trust (Government Corruption)','Generosity','Dystopia Residual']
data_2015_clean = data_2015_drop.reindex(columns = column_order)

data_2015_clean

Unnamed: 0,Year,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,2015,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,2015,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
2,2015,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,2015,Norway,Western Europe,4,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,2015,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
...,...,...,...,...,...,...,...,...,...,...,...,...
153,2015,Rwanda,Sub-Saharan Africa,154,3.465,0.22208,0.77370,0.42864,0.59201,0.55191,0.22628,0.67042
154,2015,Benin,Sub-Saharan Africa,155,3.340,0.28665,0.35386,0.31910,0.48450,0.08010,0.18260,1.63328
155,2015,Syria,Middle East and Northern Africa,156,3.006,0.66320,0.47489,0.72193,0.15684,0.18906,0.47179,0.32858
156,2015,Burundi,Sub-Saharan Africa,157,2.905,0.01530,0.41587,0.22396,0.11850,0.10062,0.19727,1.83302


In [3]:
## 2016 Data Cleaning
# Import 2016 World Happiness Data
data_2016 = pd.read_csv('./Resources/2016_data.csv')

# Add column indicating the year
data_2016['Year'] = '2016'

# Drop non-predictive variables
data_2016_drop = data_2016.drop(columns=['Lower Confidence Interval','Upper Confidence Interval'])

# Rearrange the data fields
column_order = ['Year','Country','Region','Happiness Rank','Happiness Score','Economy (GDP per Capita)','Family',
                'Health (Life Expectancy)','Freedom','Trust (Government Corruption)','Generosity','Dystopia Residual']
data_2016_clean = data_2016_drop.reindex(columns = column_order)

data_2016_clean

Unnamed: 0,Year,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,2016,Denmark,Western Europe,1,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,2016,Switzerland,Western Europe,2,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,2016,Iceland,Western Europe,3,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,2016,Norway,Western Europe,4,7.498,1.57744,1.12690,0.79579,0.59609,0.35776,0.37895,2.66465
4,2016,Finland,Western Europe,5,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596
...,...,...,...,...,...,...,...,...,...,...,...,...
152,2016,Benin,Sub-Saharan Africa,153,3.484,0.39499,0.10419,0.21028,0.39747,0.06681,0.20180,2.10812
153,2016,Afghanistan,Southern Asia,154,3.360,0.38227,0.11037,0.17344,0.16430,0.07112,0.31268,2.14558
154,2016,Togo,Sub-Saharan Africa,155,3.303,0.28123,0.00000,0.24811,0.34678,0.11587,0.17517,2.13540
155,2016,Syria,Middle East and Northern Africa,156,3.069,0.74719,0.14866,0.62994,0.06912,0.17233,0.48397,0.81789


In [4]:
## 2017 Data Cleaning
# Import 2017 World Happiness Data
data_2017 = pd.read_csv('./Resources/2017_data.csv')

# Add column indicating the year
data_2017['Year'] = '2017'

# Rename data fields to match prior years
data_2017_rename = data_2017.rename(columns={'Happiness.Rank': 'Happiness Rank',
                                             'Happiness.Score': 'Happiness Score',
                                             'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)',
                                             'Health..Life.Expectancy.': 'Health (Life Expectancy)',
                                             'Trust..Government.Corruption.': 'Trust (Government Corruption)',
                                             'Dystopia.Residual': 'Dystopia Residual'})                           

# Drop non-predictive variables
data_2017_drop = data_2017_rename.drop(columns=['Whisker.high','Whisker.low'])

# Rearrange the data fields
column_order = ['Year','Country','Happiness Rank','Happiness Score','Economy (GDP per Capita)','Family',
                'Health (Life Expectancy)','Freedom','Trust (Government Corruption)','Generosity','Dystopia Residual']
data_2017_clean = data_2017_drop.reindex(columns = column_order)

data_2017_clean

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,2017,Norway,1,7.537,1.616463,1.533524,0.796667,0.635423,0.315964,0.362012,2.277027
1,2017,Denmark,2,7.522,1.482383,1.551122,0.792566,0.626007,0.400770,0.355280,2.313707
2,2017,Iceland,3,7.504,1.480633,1.610574,0.833552,0.627163,0.153527,0.475540,2.322715
3,2017,Switzerland,4,7.494,1.564980,1.516912,0.858131,0.620071,0.367007,0.290549,2.276716
4,2017,Finland,5,7.469,1.443572,1.540247,0.809158,0.617951,0.382612,0.245483,2.430182
...,...,...,...,...,...,...,...,...,...,...,...
150,2017,Rwanda,151,3.471,0.368746,0.945707,0.326425,0.581844,0.455220,0.252756,0.540061
151,2017,Syria,152,3.462,0.777153,0.396103,0.500533,0.081539,0.151347,0.493664,1.061574
152,2017,Tanzania,153,3.349,0.511136,1.041990,0.364509,0.390018,0.066035,0.354256,0.621130
153,2017,Burundi,154,2.905,0.091623,0.629794,0.151611,0.059901,0.084148,0.204435,1.683024


In [5]:
## 2018 Data Cleaning
# Import 2018 World Happiness Data
data_2018 = pd.read_csv('./Resources/2018_data.csv')

# Add column indicating the year
data_2018['Year'] = '2018'

# Rename data fields to match prior years
data_2018_rename = data_2018.rename(columns={'Overall rank': 'Happiness Rank',
                                             'Country or region': 'Country',
                                             'Score': 'Happiness Score',
                                             'GDP per capita': 'Economy (GDP per Capita)',
                                             'Social support': 'Family',
                                             'Healthy life expectancy': 'Health (Life Expectancy)',
                                             'Freedom to make life choices': 'Freedom',
                                             'Perceptions of corruption': 'Trust (Government Corruption)'})
                            
# Rearrange the data fields
column_order = ['Year','Country','Happiness Rank','Happiness Score','Economy (GDP per Capita)','Family',
                'Health (Life Expectancy)','Freedom','Trust (Government Corruption)','Generosity','Dystopia Residual']
data_2018_clean = data_2018_rename.reindex(columns = column_order)

data_2018_clean

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,2018,Finland,1,7.632,1.305,1.592,0.874,0.681,0.393,0.202,
1,2018,Norway,2,7.594,1.456,1.582,0.861,0.686,0.340,0.286,
2,2018,Denmark,3,7.555,1.351,1.590,0.868,0.683,0.408,0.284,
3,2018,Iceland,4,7.495,1.343,1.644,0.914,0.677,0.138,0.353,
4,2018,Switzerland,5,7.487,1.420,1.549,0.927,0.660,0.357,0.256,
...,...,...,...,...,...,...,...,...,...,...,...
151,2018,Yemen,152,3.355,0.442,1.073,0.343,0.244,0.064,0.083,
152,2018,Tanzania,153,3.303,0.455,0.991,0.381,0.481,0.097,0.270,
153,2018,South Sudan,154,3.254,0.337,0.608,0.177,0.112,0.106,0.224,
154,2018,Central African Republic,155,3.083,0.024,0.000,0.010,0.305,0.038,0.218,


In [6]:
## 2019 Data Cleaning
# Import 2019 World Happiness Data
data_2019 = pd.read_csv('./Resources/2019_data.csv')

# Add column indicating the year
data_2019['Year'] = '2019'

# Rename data fields to match prior years
data_2019_rename = data_2019.rename(columns={'Overall rank': 'Happiness Rank',
                                             'Country or region': 'Country',
                                             'Score': 'Happiness Score',
                                             'GDP per capita': 'Economy (GDP per Capita)',
                                             'Social support': 'Family',
                                             'Healthy life expectancy': 'Health (Life Expectancy)',
                                             'Freedom to make life choices': 'Freedom',
                                             'Perceptions of corruption': 'Trust (Government Corruption)'})
                            
# Rearrange the data fields
column_order = ['Year','Country','Happiness Rank','Happiness Score','Economy (GDP per Capita)','Family',
                'Health (Life Expectancy)','Freedom','Trust (Government Corruption)','Generosity','Dystopia Residual']
data_2019_clean = data_2019_rename.reindex(columns = column_order)

data_2019_clean

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,2019,Finland,1,7.769,1.340,1.587,0.986,0.596,0.393,0.153,
1,2019,Denmark,2,7.600,1.383,1.573,0.996,0.592,0.410,0.252,
2,2019,Norway,3,7.554,1.488,1.582,1.028,0.603,0.341,0.271,
3,2019,Iceland,4,7.494,1.380,1.624,1.026,0.591,0.118,0.354,
4,2019,Netherlands,5,7.488,1.396,1.522,0.999,0.557,0.298,0.322,
...,...,...,...,...,...,...,...,...,...,...,...
151,2019,Rwanda,152,3.334,0.359,0.711,0.614,0.555,0.411,0.217,
152,2019,Tanzania,153,3.231,0.476,0.885,0.499,0.417,0.147,0.276,
153,2019,Afghanistan,154,3.203,0.350,0.517,0.361,0.000,0.025,0.158,
154,2019,Central African Republic,155,3.083,0.026,0.000,0.105,0.225,0.035,0.235,


In [8]:
# Aggregate cleaned data sets from each year into one file
data_list = [data_2015_clean, data_2016_clean, data_2017_clean, data_2018_clean, data_2019_clean]
multi_year_data = pd.concat(data_list)
multi_year_data

Unnamed: 0,Year,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,2015,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,2015,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
2,2015,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,2015,Norway,Western Europe,4,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,2015,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
...,...,...,...,...,...,...,...,...,...,...,...,...
151,2019,Rwanda,,152,3.334,0.35900,0.71100,0.61400,0.55500,0.41100,0.21700,
152,2019,Tanzania,,153,3.231,0.47600,0.88500,0.49900,0.41700,0.14700,0.27600,
153,2019,Afghanistan,,154,3.203,0.35000,0.51700,0.36100,0.00000,0.02500,0.15800,
154,2019,Central African Republic,,155,3.083,0.02600,0.00000,0.10500,0.22500,0.03500,0.23500,


In [9]:
# Group the data sets by country
total_country_rank = multi_year_data.groupby('Country')['Happiness Rank'].sum()

# Calculate the average happiness rank for each country over the 5 year period
avg_country_rank = total_country_rank/5
data_rank = pd.merge(left=multi_year_data, right=avg_country_rank, on='Country')
data_rank

Unnamed: 0,Year,Country,Region,Happiness Rank_x,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Happiness Rank_y
0,2015,Switzerland,Western Europe,1,7.587,1.39651,1.349510,0.941430,0.665570,0.419780,0.296780,2.517380,3.6
1,2016,Switzerland,Western Europe,2,7.509,1.52733,1.145240,0.863030,0.585570,0.412030,0.280830,2.694630,3.6
2,2017,Switzerland,,4,7.494,1.56498,1.516912,0.858131,0.620071,0.367007,0.290549,2.276716,3.6
3,2018,Switzerland,,5,7.487,1.42000,1.549000,0.927000,0.660000,0.357000,0.256000,,3.6
4,2019,Switzerland,,6,7.480,1.45200,1.526000,1.052000,0.572000,0.343000,0.263000,,3.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,2019,Trinidad & Tobago,,39,6.192,1.23100,1.477000,0.713000,0.489000,0.016000,0.185000,,15.4
778,2018,Northern Cyprus,,58,5.835,1.22900,1.211000,0.909000,0.495000,0.154000,0.179000,,24.4
779,2019,Northern Cyprus,,64,5.718,1.26300,1.252000,1.042000,0.417000,0.162000,0.191000,,24.4
780,2019,North Macedonia,,84,5.274,0.98300,1.294000,0.838000,0.345000,0.034000,0.185000,,16.8


In [17]:
# Rename columns and fill nulls
data_rank_all_rename = data_rank.rename(columns={'Happiness Rank_x': 'Happiness Rank',
                                                 'Happiness Rank_y': 'Avg Happiness Rank'})
data_rank_all = data_rank_all_rename.ffill(axis=0)

data_rank_all

Unnamed: 0,Year,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Avg Happiness Rank
0,2015,Switzerland,Western Europe,1,7.587,1.39651,1.349510,0.941430,0.665570,0.419780,0.296780,2.517380,3.6
1,2016,Switzerland,Western Europe,2,7.509,1.52733,1.145240,0.863030,0.585570,0.412030,0.280830,2.694630,3.6
2,2017,Switzerland,Western Europe,4,7.494,1.56498,1.516912,0.858131,0.620071,0.367007,0.290549,2.276716,3.6
3,2018,Switzerland,Western Europe,5,7.487,1.42000,1.549000,0.927000,0.660000,0.357000,0.256000,2.276716,3.6
4,2019,Switzerland,Western Europe,6,7.480,1.45200,1.526000,1.052000,0.572000,0.343000,0.263000,2.276716,3.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,2019,Trinidad & Tobago,Sub-Saharan Africa,39,6.192,1.23100,1.477000,0.713000,0.489000,0.016000,0.185000,0.554633,15.4
778,2018,Northern Cyprus,Sub-Saharan Africa,58,5.835,1.22900,1.211000,0.909000,0.495000,0.154000,0.179000,0.554633,24.4
779,2019,Northern Cyprus,Sub-Saharan Africa,64,5.718,1.26300,1.252000,1.042000,0.417000,0.162000,0.191000,0.554633,24.4
780,2019,North Macedonia,Sub-Saharan Africa,84,5.274,0.98300,1.294000,0.838000,0.345000,0.034000,0.185000,0.554633,16.8


In [18]:
# Export data for all years
data_rank_all.to_csv('Resources/clean_data/data_country_all.csv', index=False)

In [21]:
# Sort the data set and drop values below top 20
data_rank_all_sort = data_rank_all.sort_values('Avg Happiness Rank')
data_rank_top = data_rank_all_sort.drop(range(100,len(data_rank_all_sort)))
data_rank_top

Unnamed: 0,Year,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Avg Happiness Rank
10,2015,Denmark,Western Europe,3,7.527,1.325480,1.360580,0.874640,0.649380,0.483570,0.34139,2.492040,2.2
11,2016,Denmark,Western Europe,1,7.526,1.441780,1.163740,0.795040,0.579410,0.444530,0.36171,2.739390,2.2
12,2017,Denmark,Western Europe,2,7.522,1.482383,1.551122,0.792566,0.626007,0.400770,0.35528,2.313707,2.2
13,2018,Denmark,Western Europe,3,7.555,1.351000,1.590000,0.868000,0.683000,0.408000,0.28400,2.313707,2.2
14,2019,Denmark,Western Europe,2,7.600,1.383000,1.573000,0.996000,0.592000,0.410000,0.25200,2.313707,2.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,2015,Brazil,Latin America and Caribbean,16,6.983,0.981240,1.232870,0.697020,0.490490,0.175210,0.14574,3.260010,23.0
76,2016,Brazil,Latin America and Caribbean,17,6.952,1.087540,1.039380,0.614150,0.404250,0.141660,0.15776,3.507330,23.0
79,2019,Brazil,Latin America and Caribbean,32,6.300,1.004000,1.439000,0.802000,0.390000,0.086000,0.09900,2.769267,23.0
78,2018,Brazil,Latin America and Caribbean,28,6.419,0.986000,1.474000,0.675000,0.493000,0.088000,0.11000,2.769267,23.0


In [22]:
data_rank_top.to_csv('Resources/clean_data/data_country_top.csv', index=False)