In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the country coordinates table.
df_coordinate = pd.read_csv('geo/coordinates.csv')

# Load the raw yearly reports, newest first. Each file is read exactly once
# and bound to its own df_<year> name.
df_2021, df_2020, df_2019, df_2018, df_2017, df_2016 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'origin/2021.csv',
        'origin/2020.csv',
        'origin/2019.csv',
        'origin/2018.csv',
        'origin/2017.csv',
        'origin/2016.csv',
    )
]

# Columns designated for min-max normalisation, and the scaler instance
# (default MinMaxScaler settings).
columns_to_normalize = ['Happiness score']
scaler = MinMaxScaler()

def process_yearly_data(df: pd.DataFrame, rename_dict: dict, year: int, exponent: float = 3) -> pd.DataFrame:
    """Bring one yearly report into the shared schema and rescale its score.

    Steps, in order:
      1. keep only the columns listed as keys of ``rename_dict`` (in that
         order) and rename them to the corresponding values;
      2. append a constant ``Year`` column;
      3. raise ``'Happiness score'`` to ``exponent``;
      4. min-max rescale ``'Happiness score'`` to the range [0, 1] using the
         minimum and maximum of this frame only.

    The function is self-contained: it does not read or modify any
    module-level scaler, and the input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw yearly data. Must contain every key of ``rename_dict``.
    rename_dict : dict
        Mapping ``source column name -> target column name``. One of the
        targets must be ``'Happiness score'``. The dict is not modified.
    year : int
        Value written to the ``Year`` column of every row.
    exponent : float, default 3
        Power applied to the score before rescaling. The default reproduces
        the previous hard-coded cube.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns followed by ``Year``.

    Raises
    ------
    KeyError
        If a key of ``rename_dict`` is missing from ``df``, or if no column
        ends up being named ``'Happiness score'``.
    """
    score_col = 'Happiness score'

    # Fail early with a message that names the year, so a schema mismatch in
    # one of the yearly files is easy to locate.
    missing = [col for col in rename_dict if col not in df.columns]
    if missing:
        raise KeyError(f"{year}: source columns not found: {missing}")

    # Column selection followed by rename yields a new frame, so the
    # assignments below never touch the caller's data.
    out = df[list(rename_dict)].rename(columns=rename_dict)
    out['Year'] = year

    # NOTE(review): the power transform presumably exists to accentuate
    # differences between scores before rescaling -- confirm intent.
    powered = out[score_col] ** exponent

    # Min-max scaling to [0, 1]: (x - min) / (max - min). min()/max() skip
    # NaN and NaN inputs stay NaN. A constant (zero-range) column is mapped
    # to 0 instead of dividing by zero.
    lowest = powered.min()
    span = powered.max() - lowest
    divisor = span if span > 0 else 1.0
    out[score_col] = (powered - lowest) / divisor
    return out


# Column mappings from each report's native headers to the shared schema.
# Key order is kept exactly as in the per-year definitions because it
# determines the column order of each processed frame.
RENAME_2020_2021 = {
    'Country name': 'Country',
    'Ladder score': 'Happiness score',
    'Logged GDP per capita': 'GDP',
    'Perceptions of corruption': 'Corruption',
    'Healthy life expectancy': 'Healthy life expectancy',
}
RENAME_2018_2019 = {
    'Country or region': 'Country',
    'Score': 'Happiness score',
    'Perceptions of corruption': 'Corruption',
    'GDP per capita': 'GDP',
    'Healthy life expectancy': 'Healthy life expectancy',
}
RENAME_2017 = {
    'Country': 'Country',
    'Happiness.Score': 'Happiness score',
    'Economy..GDP.per.Capita.': 'GDP',
    'Health..Life.Expectancy.': 'Healthy life expectancy',
    'Trust..Government.Corruption.': 'Corruption',
}
RENAME_2016 = {
    'Country': 'Country',
    'Happiness Score': 'Happiness score',
    'Economy (GDP per Capita)': 'GDP',
    'Health (Life Expectancy)': 'Healthy life expectancy',
    'Trust (Government Corruption)': 'Corruption',
}

# NOTE(review): 2020/2021 source 'GDP' from 'Logged GDP per capita' while the
# earlier years use 'GDP per capita' / 'Economy ...' columns. These are
# presumably on different scales -- confirm before comparing GDP across years.
df_2021 = process_yearly_data(df_2021, RENAME_2020_2021, 2021)
df_2020 = process_yearly_data(df_2020, RENAME_2020_2021, 2020)
df_2019 = process_yearly_data(df_2019, RENAME_2018_2019, 2019)
df_2018 = process_yearly_data(df_2018, RENAME_2018_2019, 2018)
df_2017 = process_yearly_data(df_2017, RENAME_2017, 2017)
df_2016 = process_yearly_data(df_2016, RENAME_2016, 2016)

# Stack all years into one long table with a fresh 0..n-1 index.
df_all_years = pd.concat(
    [df_2021, df_2020, df_2019, df_2018, df_2017, df_2016],
    ignore_index=True,
)

# Align country spellings with the `name` column of the coordinates table.
# NOTE(review): 'C?te d\'Ivoire' looks like a mis-encoded "Côte d'Ivoire".
# It is kept as-is because it has to match whatever coordinates.csv actually
# contains -- verify the file's encoding and fix both sides together.
COUNTRY_ALIASES = {
    'Congo (Brazzaville)': 'Congo [Republic]',
    'Congo (Kinshasa)': 'Congo [DRC]',
    'Myanmar': 'Myanmar [Burma]',
    'Macedonia': 'Macedonia [FYROM]',
    'Ivory Coast': 'C?te d\'Ivoire',
    'Taiwan Province of China': 'Taiwan',
    'Hong Kong S.A.R. of China': 'Hong Kong',
    'North Macedonia': 'Macedonia [FYROM]',
    'Trinidad & Tobago': 'Trinidad and Tobago',
}
df_all_years['Country'] = df_all_years['Country'].replace(COUNTRY_ALIASES)

# The inner join below drops every country that has no coordinate match.
# Report those names instead of losing them silently.
unmatched_countries = (
    df_all_years
    .loc[~df_all_years['Country'].isin(df_coordinate['name']), 'Country']
    .drop_duplicates()
    .tolist()
)
if unmatched_countries:
    print(f"No coordinates found (rows dropped by the merge): {unmatched_countries}")

OUTPUT_COLUMNS = [
    'Country', 'Happiness score', 'Year', 'latitude', 'longitude',
    'GDP', 'Healthy life expectancy', 'Corruption',
]

# Build the final table in one chain. `.assign` returns a new frame, which
# avoids the SettingWithCopyWarning raised when a column is assigned on a
# column-subset slice.
df_merged = (
    df_all_years
    .merge(df_coordinate, how='inner', left_on='Country', right_on='name')
    .loc[:, OUTPUT_COLUMNS]
    .assign(**{'Happiness score': lambda frame: frame['Happiness score'].round(2)})
)

In [37]:
# Print the harmonised country names; pandas elides the middle of a long
# Series, so only the head and tail rows are shown.
print(df_all_years.loc[:, 'Country'])

0          Finland
1          Denmark
2      Switzerland
3          Iceland
4      Netherlands
          ...     
921          Benin
922    Afghanistan
923           Togo
924          Syria
925        Burundi
Name: Country, Length: 926, dtype: object


In [3]:
# Persist the merged table; index=False keeps the row index out of the file.
output_csv = 'processed/happiness_coordinates.csv'
df_merged.to_csv(output_csv, index=False)