# DSMLC Annual Final Competition

## Imports

In [378]:
import pandas as pd
import numpy as np
import math
import os
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

## Import Data

In [346]:
def load_data():
    # The columns we will make
    column_names = ['Year', 'Country', 'Happiness', 'Economy', 'Health', 'Freedom', 'Generosity', 'Corruption']
    main_dataframe = pd.DataFrame(columns=column_names)
    
    # Import xls sheet
    xls = pd.ExcelFile('World Happiness Datasets (2015-2022).xlsx')
    
    for i in range(2015, 2023):
        # Read Dataframe In
        dataframe = pd.read_excel(xls, f'{i}')
        
        # Read in year
        year = [i for _ in dataframe[dataframe.columns[0]]]
        dataframe['Year'] = year
        
        # Get Old Columns
        old_columns = []
        if i == 2015 or i == 2016:
            old_columns = ['Year', 'Country','Happiness Score', 'Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Freedom', 'Generosity', 'Trust (Government Corruption)']
        elif i == 2017:
            old_columns = ['Year', 'Country','Happiness.Score', 'Economy..GDP.per.Capita.', 'Health..Life.Expectancy.', 'Freedom', 'Generosity', 'Trust..Government.Corruption.']
        elif i == 2018 or i == 2019:
            old_columns = ['Year', 'Country or region','Score', 'GDP per capita', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']
        elif i == 2020 or i == 2021:
            old_columns = ['Year', 'Country name','Ladder score', 'Explained by: Log GDP per capita', 'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices', 'Explained by: Generosity', 'Explained by: Perceptions of corruption']
        else:
            old_columns = ['Year', 'Country','Happiness score', 'Explained by: GDP per capita', 'Explained by: Healthy life expectancy', 'Explained by: Freedom to make life choices', 'Explained by: Generosity', 'Explained by: Perceptions of corruption']
        
        # Get Only Needed Columns
        dataframe = dataframe.filter(old_columns)
        
        # Rename Columns
        for column in range(len(column_names)):
            dataframe = dataframe.rename(columns={
                old_columns[column]: column_names[column]
            })
        
        # Modify needed columns
        if i == 2018:
            dataframe['Economy'] = dataframe['Economy'].apply(lambda x: x/1000)
            dataframe['Happiness'] = dataframe['Happiness'].apply(lambda x: x/1000)
        np.seterr(divide = 'ignore') 
        if i != 2020 and i != 2021:
            dataframe['Economy'] = np.log10(dataframe['Economy'])
        np.seterr(divide = 'warn') 
        
        # Add dataframe to main dataframe
        main_dataframe = pd.concat([main_dataframe, dataframe])
    
    # Replace inf and -inf with NaN, then Imputer all NaN's
    main_dataframe = main_dataframe.replace([np.inf, -np.inf], np.NaN)
    main_dataframe = main_dataframe.fillna(main_dataframe.mean(numeric_only=True))
    
    return main_dataframe
    
    

In [347]:
all_data = load_data()

## Create Test and Training Sets

In [348]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [349]:
train_set, test_set = split_train_test(all_data, 0.2)

In [350]:
len(train_set)

985

In [351]:
len(test_set)

246

## Transformation Pipeline

In [352]:
def transform_data(data):
    
    one_hot_encoder = OneHotEncoder()
    data_one_hot = one_hot_encoder.fit_transform(data['Country'].values.reshape(-1,1))
    
    one_hot_categories = one_hot_encoder.categories_
    one_hot_data = data_one_hot.toarray()
    
    one_hot_dataframe = pd.DataFrame(
        data=one_hot_data,
        columns=one_hot_categories)
    
    numberic_data = data.drop(columns=['Country'])
    numberic_data_categories = list(numberic_data.columns.values)

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler())
    ])
    
    all_data_pipelined = pipeline.fit_transform(numberic_data)
    
    numberic_dataframe = pd.DataFrame(
        data=all_data_pipelined,
        columns=numberic_data_categories)
    
    data_prepped = pd.concat([numberic_dataframe, one_hot_dataframe], axis=1)
    data_y = data_prepped.filter(['Happiness'])
    data_x = data_prepped.drop(columns=['Happiness'])
    
    return data_x, data_y

In [353]:
train_set_x, train_set_y = transform_data(train_set)

## Train Model

In [354]:
models = []

### Linear Regression

In [355]:
lin_reg = LinearRegression()
lin_reg.fit(train_set_x.values, train_set_y.values)

models.append({
    'model_name': 'Standrd Linear Regression',
    'model': lin_reg
})

### Decision Trees

In [356]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_set_x.values, train_set_y.values)

models.append({
    'model_name': 'Decision Tree Regressor',
    'model': tree_reg
})

### Random Forest Regressor

In [357]:
forest_reg = RandomForestRegressor()
forest_reg.fit(train_set_x.values, train_set_y.values.ravel())

models.append({
    'model_name': 'Random Forest Regressor',
    'model': forest_reg
})

## Evaluate Model

In [358]:
def evaluate_model(model_name, model):
    scores = cross_val_score(model, train_set_x.values, train_set_y.values.ravel(),
                            scoring="neg_mean_squared_error", cv=10)
    
    rmse = np.sqrt(-scores)

    print(f'###\t{model_name.upper()}\t###')
    print(f'\nMean:\t\t{scores.mean()}')
    print(f'Standard Dev:\t{scores.std()}')
    print(f'Scores:')
    for score in scores:
        print(f'\t{score}')
    print(f'\nNon-Normalized\nError Range:\t\t{rmse.mean()}\nMin Value In Dataset:\t{min(train_set_y.values)[0]}\nMax Value In Dataset:\t{max(train_set_y.values)[0]}')
    print(f'\nNormalized\nError Range:\t\t{(rmse.mean()-min(train_set_y.values)[0])/(max(train_set_y.values)[0] - min(train_set_y.values)[0])}\nMin Value In Dataset:\t{(min(train_set_y.values)[0]-min(train_set_y.values)[0])/(max(train_set_y.values)[0] - min(train_set_y.values)[0])}\nMax Value In Dataset:\t{(max(train_set_y.values)[0]-min(train_set_y.values)[0])/(max(train_set_y.values)[0] - min(train_set_y.values)[0])}')
    print('\n\n')
    
    return (rmse.mean()-min(train_set_y.values)[0])/(max(train_set_y.values)[0] - min(train_set_y.values)[0])

In [365]:
results = []

for model in models:
    results.append({
        'model': model['model'],
        'model_name': model['model_name'],
        'result': evaluate_model(model['model_name'], model['model'])
    })

###	STANDRD LINEAR REGRESSION	###

Mean:		-7.81830811950912e+24
Standard Dev:	2.1035839752468996e+25
Scores:
	-7.06397172781102e+25
	-1.9875484467152564e+19
	-6.787501804456458e+24
	-2.3554882267471751e+21
	-2.868053745372194e+21
	-4.739500450966682e+23
	-7.998451252246209e+21
	-2.447406475459781e+23
	-2.270868869990607e+22
	-1.2208624731426348e+21

Non-Normalized
Error Range:		1257479487295.9058
Min Value In Dataset:	-0.36390394009656307
Max Value In Dataset:	3.8681731101662282

Normalized
Error Range:		297130574978.11536
Min Value In Dataset:	0.0
Max Value In Dataset:	1.0



###	DECISION TREE REGRESSOR	###

Mean:		-0.016698800116425227
Standard Dev:	0.0063359680064615466
Scores:
	-0.02330146470489714
	-0.02271901630639875
	-0.024102999528451977
	-0.007039658916661296
	-0.006224842444297995
	-0.013357142064994356
	-0.01788243302956284
	-0.01928866092402705
	-0.011821359120932177
	-0.02125042412402867

Non-Normalized
Error Range:		0.12641114796534297
Min Value In Dataset:	-0.3639039400

In [367]:
for result in results:
    print(f'{result["model_name"]}:\t{result["result"]}')

Standrd Linear Regression:	297130574978.11536
Decision Tree Regressor:	0.11585684339831663
Random Forest Regressor:	0.10935900804950963


## Save Good Models

### Refresh Directory

In [385]:
files = os.listdir()

for item in files:
    if item.endswith(".pkl"):
        print(f'Removing {item}')
        os.remove(item)

Removing Decision Tree Regressor.pkl
Removing Random Forest Regressor.pkl


### Save Models

In [386]:
average_model_accuracy = 0
count = 0

for model in results:
    average_model_accuracy += model["result"]
    count += 1

average_model_accuracy /= count

for model in results:
    if model["result"] < average_model_accuracy:
        print(f'Saving model {model["model_name"]} as {model["model_name"]}.pkl')
        joblib.dump(model['model'], f'{model["model_name"]}.pkl')
    else:
        print(f'Not saving {model["model_name"]}')

Not saving Standrd Linear Regression
Saving model Decision Tree Regressor as Decision Tree Regressor.pkl
Saving model Random Forest Regressor as Random Forest Regressor.pkl
