# Machine Learning on World Happiness from years 2015-2021

### Importing basic libraries needed to work with data

In [21]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [22]:
# Load one raw CSV per survey year (2015 through 2021 inclusive) into a dict
# keyed by the integer year.
data = {
    year: pd.read_csv(f'../uncleaned_data/{year}.csv')
    for year in range(2015, 2022)
}

In [23]:
# Print every year's column names and dtypes so the schemas of the raw files
# can be compared side by side.
for year, frame in data.items():
    print(year, "----", frame.dtypes, "----", sep="\n")

2015
----
Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Standard Error                   float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object
----
2016
----
Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity        

## As shown above, the datasets do not all share the same columns or the same set of rows (countries), so we need to identify which columns and rows are necessary. Some variables that represent the same quantity are also named differently from year to year.


## Thus, we have to clean and normalise the individual datasets before combining them


In [24]:
## Removing unnecessary columns
## Note that the rank column is removed as well: some rows are removed later on,
## so the rank has to be recomputed afterwards anyway.

# Positions (0-based) of the columns to discard in each year's raw file.
# NOTE(review): these positions are tied to the raw file layouts. The drop is
# done in place, so re-running this cell on already-trimmed frames will not
# reproduce the same result - reload the data first.
# NOTE(review): the 2020/2021 raw headers are not shown in this notebook;
# confirm the columns kept for those years are on the same scale as the
# 2015-2019 factor columns.
columns_to_drop = {
    2015: [1, 2, 4, 11],
    2016: [1, 2, 4, 5, 12],
    2017: [1, 3, 4, 11],
    2018: [0],
    2019: [0],
    2020: [1, 3, 4, 5, 12, 13, 14, 15, 16, 17, 18, 19],
    2021: [1, 3, 4, 5, 12, 13, 14, 15, 16, 17, 18, 19],
}

for year, positions in columns_to_drop.items():
    frame = data[year]
    frame.drop(columns=frame.columns[positions], inplace=True)


In [25]:
# Re-inspect the schemas after the column drop to check which columns remain
# for each year.
for year in data:
    remaining = data[year].dtypes
    print(year, "----", remaining, "----", sep="\n")

2015
----
Country                           object
Happiness Score                  float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
dtype: object
----
2016
----
Country                           object
Happiness Score                  float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
dtype: object
----
2017
----
Country                           object
Happiness.Score                  float64
Economy..GDP.per.Capita.         float64
Family                           float64
Health..Life.Expectancy.         float64
Freedom                          float64
Generosity                    

In [26]:
## Reordering the columns so they all match

# In the 2015 and 2016 frames the last two columns (positions 6 and 7) are
# swapped so that Generosity comes before Trust.
# The swap is positional, so running this cell a second time swaps them back.
swapped_order = [0, 1, 2, 3, 4, 5, 7, 6]
for year in (2015, 2016):
    frame = data[year]
    data[year] = frame.reindex(columns=frame.columns[swapped_order])


In [27]:
# Give every year's frame the same eight column names. The assignment is
# positional, so it relies on the columns already being in this order.
common_columns = ['Country', 'Score', 'Economy', 'Family', 'Health', 'Freedom', 'Generosity', 'Trust']
for frame in data.values():
    frame.columns = common_columns

In [28]:
# Print each normalised frame to eyeball the renamed columns and their values.
for year, frame in data.items():
    print(year, "----", frame, "----", sep="\n")

2015
----
         Country  Score  Economy   Family   Health  Freedom  Generosity  \
0    Switzerland  7.587  1.39651  1.34951  0.94143  0.66557     0.29678   
1        Iceland  7.561  1.30232  1.40223  0.94784  0.62877     0.43630   
2        Denmark  7.527  1.32548  1.36058  0.87464  0.64938     0.34139   
3         Norway  7.522  1.45900  1.33095  0.88521  0.66973     0.34699   
4         Canada  7.427  1.32629  1.32261  0.90563  0.63297     0.45811   
..           ...    ...      ...      ...      ...      ...         ...   
153       Rwanda  3.465  0.22208  0.77370  0.42864  0.59201     0.22628   
154        Benin  3.340  0.28665  0.35386  0.31910  0.48450     0.18260   
155        Syria  3.006  0.66320  0.47489  0.72193  0.15684     0.47179   
156      Burundi  2.905  0.01530  0.41587  0.22396  0.11850     0.19727   
157         Togo  2.839  0.20868  0.13995  0.28443  0.36453     0.16681   

       Trust  
0    0.41978  
1    0.14145  
2    0.48357  
3    0.36503  
4    0.32957  

## Now that all the columns have been normalised and cleaned, we will have to clean the rows

In [29]:
## Keeping only the countries that appear in every year's dataset

# Intersect the Country columns once. The previous nested loop reached the
# same set by re-filtering every frame for every year.
common_countries = set(data[2015]['Country'])
for year in data:
    common_countries &= set(data[year]['Country'])

## Resetting rank and sorting based on country for easier indexing

# For each year:
#   1. keep only the rows whose country is present in all years;
#   2. renumber the surviving rows 0..n-1 in their existing row order;
#   3. sort alphabetically by country;
#   4. turn that numbering into a 'Rank' column.
# 'Rank' is therefore 0-based and reflects the row order of the input frame.
# NOTE(review): this only equals a happiness rank if each raw file is sorted
# by descending score - confirm for all years.
data_clean = {}
for year, frame in data.items():
    data_clean[year] = (
        frame[frame['Country'].isin(common_countries)]
        .reset_index(drop=True)
        .sort_values(by=['Country'])
        .reset_index()
        .rename(columns={'index': 'Rank'})
    )
   

In [30]:
# Print each cleaned frame to check the Rank column and the alphabetical
# country ordering.
for year, frame in data_clean.items():
    print(year, "----", frame, "----", sep="\n")

2015
----
     Rank      Country  Score  Economy   Family   Health  Freedom  Generosity  \
0     132  Afghanistan  3.575  0.31982  0.30285  0.30335  0.23414     0.36510   
1      83      Albania  4.959  0.87867  0.80434  0.81325  0.35733     0.14272   
2      61      Algeria  5.605  0.93929  1.07772  0.61766  0.28579     0.07822   
3      27    Argentina  6.574  1.05351  1.24823  0.78723  0.44974     0.11451   
4     109      Armenia  4.350  0.76821  0.77711  0.72990  0.19847     0.07855   
..    ...          ...    ...      ...      ...      ...      ...         ...   
132    21    Venezuela  6.810  1.04424  1.25596  0.72052  0.42908     0.05841   
133    67      Vietnam  5.360  0.63216  0.91226  0.74676  0.59444     0.16860   
134   118        Yemen  4.077  0.54649  0.68093  0.40064  0.35571     0.09131   
135    76       Zambia  5.129  0.47038  0.91612  0.29924  0.48827     0.19591   
136   100     Zimbabwe  4.610  0.27100  1.03276  0.33475  0.25861     0.18987   

       Trust  
0 

In [32]:
# Write one cleaned CSV per year. The DataFrame index is written as well
# (pandas default), so each file starts with an unnamed index column.
for year, frame in data_clean.items():
    frame.to_csv(f'../cleaned_data/{year}.csv')