# DSMLC Annual Final Competition

## Imports

In [143]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

## Import Data

In [186]:
def load_data():
    # The columns we will make
    column_names = ['Year', 'Country', 'Happiness', 'Economy', 'Health', 'Freedom', 'Generosity', 'Corruption']
    main_dataframe = pd.DataFrame(columns=column_names)
    
    # Import xls sheet
    xls = pd.ExcelFile('World Happiness Datasets (2015-2022).xlsx')
    
    for i in range(2015, 2023):
        # Read Dataframe In
        dataframe = pd.read_excel(xls, f'{i}')
        
        # Read in year
        year = [i for _ in dataframe[dataframe.columns[0]]]
        dataframe['Year'] = year
        
        # Get Old Columns
        old_columns = []
        if i == 2015 or i == 2016:
            old_columns = ['Year', 'Country','Happiness Score', 'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom', 'Generosity', 'Trust (Government Corruption)']
        elif i == 2017:
            old_columns = ['Year', 'Country','Happiness.Score', 'Economy..GDP.per.Capita.', 'Health..Life.Expectancy.', 'Freedom', 'Generosity', 'Trust..Government.Corruption.']
        elif i == 2018 or i == 2019:
            old_columns = ['Year', 'Country or region','Score', 'GDP per capita', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']
        elif i == 2020 or i == 2021:
            old_columns = ['Year', 'Country name','Ladder score', 'Explained by: Log GDP per capita', 'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices', 'Explained by: Generosity', 'Explained by: Perceptions of corruption']
        else:
            old_columns = ['Year', 'Country','Happiness score', 'Explained by: GDP per capita', 'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices', 'Explained by: Generosity', 'Explained by: Perceptions of corruption']
        
        # Get Only Needed Columns
        dataframe = dataframe.filter(old_columns)
        
        # Rename Columns
        for column in range(len(column_names)):
            dataframe = dataframe.rename(columns={
                old_columns[column]: column_names[column]
            })
        
        # Modify needed columns
        if i == 2018:
            dataframe['Economy'] = dataframe['Economy'].apply(lambda x: x/1000)
            dataframe['Happiness'] = dataframe['Happiness'].apply(lambda x: x/1000)
        np.seterr(divide = 'ignore') 
        if i != 2020 and i != 2021:
            dataframe['Economy'] = np.log10(dataframe['Economy'])
        np.seterr(divide = 'warn') 
        
        # Add dataframe to main dataframe
        main_dataframe = pd.concat([main_dataframe, dataframe])
    
    # Replace inf and -inf with NaN, then Imputer all NaN's
    main_dataframe = main_dataframe.replace([np.inf, -np.inf], np.NaN)
    main_dataframe = main_dataframe.fillna(main_dataframe.mean(numeric_only=True))
    
    return main_dataframe
    
    

In [187]:
all_data = load_data()

## Create Test and Training Sets

In [188]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [189]:
train_set, test_set = split_train_test(all_data, 0.2)

In [190]:
len(train_set)

985

In [191]:
len(test_set)

246

## Transformation Pipeline

In [192]:
def transform_data(data):
    
    one_hot_encoder = OneHotEncoder()
    data_one_hot = one_hot_encoder.fit_transform(data['Country'].values.reshape(-1,1))
    
    one_hot_categories = one_hot_encoder.categories_
    one_hot_data = data_one_hot.toarray()
    
    numberic_data = data.drop(columns=['Country'])

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler())
    ])
    
    all_data_pipelined = pipeline.fit_transform(numberic_data)

In [194]:
transform_data(train_set)

In [195]:
train_set.describe

<bound method NDFrame.describe of      Year     Country    Happiness   Economy    Health   Freedom  Generosity  \
82   2017  Montenegro     5.237000  0.049656  0.667465  0.194989    0.197911   
52   2019      Latvia     5.940000  0.074451  0.812000  0.264000    0.075000   
18   2018      Israel     0.006814 -2.885723  0.883000  0.533000    0.354000   
103  2022      Niger*  5003.000000 -0.244125  0.326000  0.571000    0.165000   
44   2022   Nicaragua  6165.000000  3.043362  0.617000  0.617000    0.168000   
..    ...         ...          ...       ...       ...       ...         ...   
95   2017  Tajikistan     5.041000 -0.280078  0.529235  0.471567    0.248998   
36   2022      Panama  6309.000000  3.234264  0.709000  0.592000    0.049000   
40   2022      Cyprus  6221.000000  3.258877  0.819000  0.448000    0.123000   
88   2017    Portugal     5.195000  0.118984  0.795844  0.498465    0.095103   
101  2016        Laos     4.876000 -0.167223  0.382910  0.521680    0.430790   

     