# 0. Import data
The dataset already ships with a train/test split, so I simply load each split from its own CSV file.

In [None]:
# load the zip file
from zipfile import ZipFile

# Unpack every member of the competition archive into the current working directory.
with ZipFile("world_happiness_competition_data.zip", mode="r") as archive:
    archive.extractall()

In [None]:
# Load data
import pandas as pd
# Feature tables for the train and test splits.
X_train = pd.read_csv('world_happiness_competition_data/X_train.csv')
X_test = pd.read_csv('world_happiness_competition_data/X_test.csv')

# y_train has one column per class (presumably one-hot encoded -- confirm);
# idxmax across the columns collapses each row to the name of the column
# holding its largest value, i.e. a single label vector.
y_train = pd.read_csv('world_happiness_competition_data/y_train.csv')
y_train_labels = y_train.idxmax(axis="columns")

X_train.head()

In [None]:
# Preview the first five collapsed labels.
y_train_labels.head(n=5)

## Add new data

In [None]:
# Country-level variables to merge in: a truncated, cleaned-up version of the
# region data (Week 4 folder).
countrydata = pd.read_csv("newcountryvars.csv")

countrydata.head(n=5)

In [None]:
# Join the new country-level data onto X_train and X_test, matching
# "Country or region" (feature tables) against "country_name" (countrydata).
# NOTE: this cell overwrites X_train / X_test, so run it once per fresh load;
# re-running it would merge the country columns a second time.

def add_country_features(features, country_vars):
    """Left-join country-level variables onto a feature table.

    Parameters
    ----------
    features : pandas.DataFrame
        Table with a "Country or region" column.
    country_vars : pandas.DataFrame
        Table with a "country_name" column plus the variables to add.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per row of ``features`` (same order). Rows
        without a match get NaN in the added columns. The redundant
        "country_name" key is dropped.

    Raises
    ------
    pandas.errors.MergeError
        If ``country_vars`` has duplicate "country_name" values. A duplicate
        key would silently multiply rows and misalign the features with the
        labels.
    """
    merged = pd.merge(
        features,
        country_vars,
        how="left",
        left_on="Country or region",
        right_on="country_name",
        validate="many_to_one",  # guarantees the row count is preserved
    )
    # only keep one key
    return merged.drop(columns=["country_name"])


X_train = add_country_features(X_train, countrydata)
X_test = add_country_features(X_test, countrydata)

In [None]:
# Show a single merged row to check that the country-level columns were added.
X_train.head(n=1)

# 1. EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# plot distribution of y variable
happiness_order = ['Very Low', 'Low', 'Average', 'High', 'Very High']
happiness_count = y_train_labels.value_counts()

# reindex (rather than label indexing) puts the bars in ordinal order and
# yields 0 -- instead of a KeyError -- for a level absent from the labels.
ordered_count = happiness_count.reindex(happiness_order, fill_value=0)

fig, ax = plt.subplots()
ax.bar(x=happiness_order, height=ordered_count)
ax.set_title("Distribution of Happiness Level")
ax.set_xlabel("Happiness level")
ax.set_ylabel("Count")
plt.show()

In [None]:
# select columns from X_train and add the y_train labels as a "target" column
pairplot_columns = [
    "GDP per capita",
    "Freedom to make life choices",
    "hdi",
    "life_expectancy",
    "mean_years_of_schooling",
]
# .assign returns a new frame, so no column is written onto a slice of X_train
# (the original assignment triggered pandas' SettingWithCopyWarning).
df_try = X_train[pairplot_columns].assign(target=y_train_labels)

# pairplot the relations between columns, the color indicates different happiness levels
level_palette = ["#ccdbdc", "#9ad1d4", "#80ced7", "#007ea7", "#003249"]
pairplot = sns.pairplot(df_try, hue='target', hue_order=happiness_order, palette=level_palette)
pairplot._legend.set_title('Happiness Level')

All of the selected features are intuitively expected to be positively correlated with happiness, and the pairplot roughly confirms this tendency.


# 2. Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Create the preprocessing pipelines for both numeric and categorical data.

# Every column except the identifier / categorical ones is treated as numeric.
non_numeric_columns = ["Country or region", "name", "region", "sub-region"]
numeric_features = X_train.drop(columns=non_numeric_columns).columns.tolist()

# Missing numeric values are filled with 0, then every feature is standardized.
# NOTE(review): zero-filling puts missing rows far from the observed range of
# features such as life expectancy; consider strategy='median' instead.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())])

categorical_features = ['region', 'sub-region']

# Replace missing values with the modal value, then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros, instead of raising when X_test contains a region or sub-region
# that is absent from X_train.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Final ColumnTransformer. It has its own name so it is not confused with the
# `preprocessor` function defined further down in this cell. Columns listed in
# neither feature group are dropped (ColumnTransformer's default remainder).
column_transformer = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit on the training features only; `preprocess` is the fitted transformer.
preprocess = column_transformer.fit(X_train)

def preprocessor(data):
    """Transform raw feature rows with the fitted `preprocess` transformer.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table with the same columns the transformer was fitted on.

    Returns
    -------
    The array (possibly sparse) produced by ``preprocess.transform(data)``.
    """
    # The original called data.drop([...], axis=1) and discarded the result, so
    # it never changed `data`; it could only raise KeyError when a column was
    # absent. The fitted transformer selects the columns it needs by name, so
    # no manual drop is required.
    return preprocess.transform(data)

# 3. Establish the model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Fixed seed so the forests grown during the search (and the final refit) are
# reproducible under Restart & Run All.
model = RandomForestClassifier(random_state=42)

# 4 x 4 x 3 x 4 = 192 hyper-parameter combinations.
param_grid = {
    "n_estimators": np.arange(50, 201, 50),
    "max_depth": np.arange(5, 21, 5),
    "min_samples_split": np.arange(2, 5),
    "min_samples_leaf": np.arange(1, 5),
    }

# 192 candidates x 5 folds = 960 fits; n_jobs=-1 spreads them over all cores.
gridmodel = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

gridmodel.fit(preprocessor(X_train), y_train_labels)

print("best mean cross-validation score: {:.3f}".format(gridmodel.best_score_))
print("best parameters: {}".format(gridmodel.best_params_))

# 4. Save and submit the model to aimodelshare

In [None]:
#Set credentials using modelshare.org username/password
import aimodelshare as ai

# Endpoint of the competition; reused below when creating the Competition object.
apiurl = "https://e2w6gh3id1.execute-api.us-east-2.amazonaws.com/prod/m"

# Presumably prompts for the modelshare.org username/password -- confirm.
ai.aws.set_credentials(apiurl=apiurl)

In [None]:
# Create the competition handle for the endpoint configured above.
mycompetition = ai.Competition(apiurl)

In [None]:
# Export the `preprocessor` function through aimodelshare.
# NOTE(review): the empty string is presumably the output directory (current
# directory); the submit cell later reads "preprocessor.zip" -- confirm.
ai.export_preprocessor(preprocessor, "")

In [None]:
# Save the RF model to local ONNX file
# FloatTensorType was used here without being imported in any cell, which
# raises NameError on a fresh kernel.
from skl2onnx.common.data_types import FloatTensorType

# Number of columns after preprocessing = width of the model's input tensor.
feature_count = preprocessor(X_test).shape[1]
# None leaves the batch dimension unspecified.
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

onnx_model = ai.aimsonnx.model_to_onnx(gridmodel, framework='sklearn',
                          initial_types=initial_type,
                          transfer_learning=False,
                          deep_learning=False)

with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [None]:
# Artifacts to upload: the ONNX file written in the previous cell and the
# preprocessor archive.
gridmodel_filepath = "model.onnx"
preprocessor_filepath = "preprocessor.zip"

# Predicted labels for the test split, from the fitted grid search.
prediction_labels = gridmodel.predict(preprocessor(X_test))

# Submit to Competition Leaderboard
mycompetition.submit_model(
    model=gridmodel_filepath,
    prediction_submission=prediction_labels,
    preprocessor=preprocessor_filepath,
)