# Supervised Learning for World Happiness based on Dietary Dataset

### Importing dependencies

In [87]:
import warnings
warnings.filterwarnings('ignore')

In [88]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [89]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

### Loading the DataFrame

In [90]:
# Loading the data
file_path = Path('happiness_kcal_by_country.csv')
df = pd.read_csv(file_path)
df

Unnamed: 0,country,happiness_score,Alcoholic_Beverages,Animal_Products,Animal_fats,Cereal_Excluding_Beer,Eggs,Fish_Seafood,Fruits_Excluding_Wine,Meat,...,Spices,Starchy_Roots,Stimulants,Sugar_Crops,Treenuts,Vegetal_Products,Vegetable_Oils,Vegtables,Obesity,Population
0,Afghanistan,2.523,0.0000,4.7774,0.8504,37.1186,0.1501,0.0000,1.4757,1.2006,...,0.1001,0.3252,0.0750,0.0000,0.1251,45.2476,2.3012,0.7504,4.5,38928000.0
1,Albania,5.117,0.9120,16.0930,1.0591,16.2107,0.8091,0.1471,3.8982,3.8688,...,0.0000,1.2651,0.2501,0.0000,0.3972,33.9070,2.8244,2.7508,22.3,2838000.0
2,Algeria,4.887,0.0896,6.0326,0.1941,25.0112,0.4181,0.1195,3.1805,1.2543,...,0.1195,1.9262,0.1493,0.0000,0.2240,43.9749,5.7638,2.0457,26.6,44357000.0
3,Argentina,5.929,1.4354,14.9869,1.0650,16.7927,0.8643,0.2006,1.4663,9.4459,...,0.0309,1.4045,0.2315,0.0000,0.0463,34.9900,5.5410,0.8643,28.5,45377000.0
4,Armenia,5.283,0.2274,12.8330,1.7706,19.2658,0.7310,0.1787,2.5341,4.2235,...,0.0162,1.2508,0.6985,0.0000,0.3086,37.1670,3.5737,3.2164,20.9,2956000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,Venezuela (Bolivarian Republic of),4.892,0.8454,7.2303,0.6007,21.3126,0.2892,0.4449,2.3804,3.1368,...,0.0000,1.3571,0.0667,0.0000,0.0000,42.7586,7.5417,0.6674,25.2,28645000.0
133,Vietnam,5.411,0.7150,10.9806,0.9363,26.9833,0.2894,1.0385,1.8046,7.8311,...,0.4256,0.7150,0.1021,0.1532,0.3575,39.0364,1.3279,1.9578,2.1,96209000.0
134,Yemen,3.658,0.0000,3.4667,0.3394,32.0727,0.1455,0.1697,1.1879,2.0121,...,0.0485,0.3152,0.1212,0.0000,0.0242,46.5455,3.9515,0.3636,14.1,29826000.0
135,Zambia,4.073,1.1925,3.3043,0.3230,31.5528,0.1988,0.5714,0.2236,1.5155,...,0.0745,4.0994,0.0248,0.0000,0.0000,46.7081,3.0062,0.4472,6.5,18384000.0


### Binning Happiness data into categories: Happy and Unhappy.

In [91]:
happiness= pd.cut(df.happiness_score,bins=[0,4.99,10],labels=['Unhappy','Happy'])
happiness

0      Unhappy
1        Happy
2      Unhappy
3        Happy
4        Happy
        ...   
132    Unhappy
133      Happy
134    Unhappy
135    Unhappy
136    Unhappy
Name: happiness_score, Length: 137, dtype: category
Categories (2, object): ['Unhappy' < 'Happy']

In [92]:
# Adding Happiness column to the DataFrame
df.insert(1, 'Happiness', happiness)
df.head(10)

Unnamed: 0,country,Happiness,happiness_score,Alcoholic_Beverages,Animal_Products,Animal_fats,Cereal_Excluding_Beer,Eggs,Fish_Seafood,Fruits_Excluding_Wine,...,Spices,Starchy_Roots,Stimulants,Sugar_Crops,Treenuts,Vegetal_Products,Vegetable_Oils,Vegtables,Obesity,Population
0,Afghanistan,Unhappy,2.523,0.0,4.7774,0.8504,37.1186,0.1501,0.0,1.4757,...,0.1001,0.3252,0.075,0.0,0.1251,45.2476,2.3012,0.7504,4.5,38928000.0
1,Albania,Happy,5.117,0.912,16.093,1.0591,16.2107,0.8091,0.1471,3.8982,...,0.0,1.2651,0.2501,0.0,0.3972,33.907,2.8244,2.7508,22.3,2838000.0
2,Algeria,Unhappy,4.887,0.0896,6.0326,0.1941,25.0112,0.4181,0.1195,3.1805,...,0.1195,1.9262,0.1493,0.0,0.224,43.9749,5.7638,2.0457,26.6,44357000.0
3,Argentina,Happy,5.929,1.4354,14.9869,1.065,16.7927,0.8643,0.2006,1.4663,...,0.0309,1.4045,0.2315,0.0,0.0463,34.99,5.541,0.8643,28.5,45377000.0
4,Armenia,Happy,5.283,0.2274,12.833,1.7706,19.2658,0.731,0.1787,2.5341,...,0.0162,1.2508,0.6985,0.0,0.3086,37.167,3.5737,3.2164,20.9,2956000.0
5,Australia,Happy,7.183,1.9783,15.6146,1.9027,11.643,0.4681,0.604,1.6611,...,0.1057,1.3138,0.5134,0.0,0.8457,34.3854,8.3812,1.1326,30.4,25754000.0
6,Austria,Happy,7.268,2.8161,15.6106,5.2532,12.3748,0.7853,0.4062,1.5976,...,0.1083,1.4622,0.3656,0.0,0.4197,34.3894,7.1622,1.029,21.9,8914000.0
7,Azerbaijan,Happy,5.171,2.2555,8.1682,1.305,28.7417,0.5478,0.0967,1.6111,...,0.0161,2.2072,0.3061,0.0,0.4511,41.8237,1.2083,1.4661,19.9,10108000.0
8,Bangladesh,Happy,5.025,0.0,2.3695,0.1541,37.5265,0.2312,0.8284,0.655,...,0.5009,1.9264,0.0193,0.0193,0.1156,47.6401,3.429,0.6165,3.4,169809000.0
9,Belarus,Happy,5.534,3.2979,12.2644,2.6748,14.2705,0.8511,0.4863,1.3982,...,0.0152,4.9848,0.3951,0.0,0.4255,37.7204,6.1246,1.8389,26.6,9375000.0


In [93]:
# Dropping unhelpful columns
# 'country' is a string column/label
# 'happiness_score' is redundant
# 'Population' is not a dietary dataset
df=df.drop(columns=['country', 'happiness_score', 'Population'], axis=1)
df.head(10)

Unnamed: 0,Happiness,Alcoholic_Beverages,Animal_Products,Animal_fats,Cereal_Excluding_Beer,Eggs,Fish_Seafood,Fruits_Excluding_Wine,Meat,Milk_Excluding_Butter,...,Pulses,Spices,Starchy_Roots,Stimulants,Sugar_Crops,Treenuts,Vegetal_Products,Vegetable_Oils,Vegtables,Obesity
0,Unhappy,0.0,4.7774,0.8504,37.1186,0.1501,0.0,1.4757,1.2006,2.4512,...,0.5003,0.1001,0.3252,0.075,0.0,0.1251,45.2476,2.3012,0.7504,4.5
1,Happy,0.912,16.093,1.0591,16.2107,0.8091,0.1471,3.8982,3.8688,9.9441,...,0.8091,0.0,1.2651,0.2501,0.0,0.3972,33.907,2.8244,2.7508,22.3
2,Unhappy,0.0896,6.0326,0.1941,25.0112,0.4181,0.1195,3.1805,1.2543,3.9869,...,1.09,0.1195,1.9262,0.1493,0.0,0.224,43.9749,5.7638,2.0457,26.6
3,Happy,1.4354,14.9869,1.065,16.7927,0.8643,0.2006,1.4663,9.4459,3.1641,...,0.1235,0.0309,1.4045,0.2315,0.0,0.0463,34.99,5.541,0.8643,28.5
4,Happy,0.2274,12.833,1.7706,19.2658,0.731,0.1787,2.5341,4.2235,5.6368,...,0.4386,0.0162,1.2508,0.6985,0.0,0.3086,37.167,3.5737,3.2164,20.9
5,Happy,1.9783,15.6146,1.9027,11.643,0.4681,0.604,1.6611,7.4902,4.8022,...,0.1359,0.1057,1.3138,0.5134,0.0,0.8457,34.3854,8.3812,1.1326,30.4
6,Happy,2.8161,15.6106,5.2532,12.3748,0.7853,0.4062,1.5976,5.0095,4.0482,...,0.1083,0.1083,1.4622,0.3656,0.0,0.4197,34.3894,7.1622,1.029,21.9
7,Happy,2.2555,8.1682,1.305,28.7417,0.5478,0.0967,1.6111,2.9322,3.1738,...,0.0806,0.0161,2.2072,0.3061,0.0,0.4511,41.8237,1.2083,1.4661,19.9
8,Happy,0.0,2.3695,0.1541,37.5265,0.2312,0.8284,0.655,0.3468,0.7706,...,1.2329,0.5009,1.9264,0.0193,0.0193,0.1156,47.6401,3.429,0.6165,3.4
9,Happy,3.2979,12.2644,2.6748,14.2705,0.8511,0.4863,1.3982,5.7143,2.1884,...,0.0,0.0152,4.9848,0.3951,0.0,0.4255,37.7204,6.1246,1.8389,26.6


In [94]:
# Create our features
X = pd.get_dummies(df.drop('Happiness', axis=1))

# Create our target
y = df['Happiness']

In [95]:
X.describe()

Unnamed: 0,Alcoholic_Beverages,Animal_Products,Animal_fats,Cereal_Excluding_Beer,Eggs,Fish_Seafood,Fruits_Excluding_Wine,Meat,Milk_Excluding_Butter,Miscellaneous,...,Pulses,Spices,Starchy_Roots,Stimulants,Sugar_Crops,Treenuts,Vegetal_Products,Vegetable_Oils,Vegtables,Obesity
count,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,...,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
mean,1.374335,9.418774,1.36041,20.808085,0.464232,0.575042,1.864257,3.782418,3.085221,0.121138,...,1.075519,0.180331,2.825885,0.315014,0.019878,0.288201,40.58221,4.990855,1.124566,18.410949
std,1.115354,4.908184,1.373477,6.604714,0.312974,0.567862,1.276899,2.200806,2.081442,0.161789,...,1.239935,0.246362,3.552728,0.333263,0.077923,0.290579,4.907847,2.293868,0.656202,9.515125
min,0.0,1.6237,0.0,9.4378,0.0188,0.0,0.1471,0.298,0.1322,0.0,...,0.0,0.0,0.2938,0.0,0.0,0.0,27.7089,0.9325,0.0957,0.0
25%,0.3552,5.1831,0.4046,14.7546,0.1519,0.2211,1.1835,2.0121,1.1423,0.0215,...,0.29,0.0361,1.0753,0.0773,0.0,0.0602,36.8016,3.1239,0.6122,8.2
50%,1.2768,9.1727,0.9322,20.5847,0.4464,0.4062,1.6032,3.5247,2.9614,0.0742,...,0.6624,0.0837,1.5019,0.1827,0.0,0.2041,40.8418,4.6461,1.0339,21.3
75%,2.0617,13.2061,1.9741,25.6053,0.6661,0.7951,2.2195,5.2181,4.5698,0.164,...,1.4987,0.2016,2.6709,0.4356,0.0,0.4216,44.8407,6.6657,1.4661,25.7
max,5.1574,22.2911,7.8007,37.5265,1.4461,4.4183,8.8056,10.5674,9.9441,1.1822,...,7.5638,1.2202,19.6143,2.009,0.593,1.421,48.3864,10.3839,3.3524,37.3


In [96]:
# Check the balance of our target values
y.value_counts()

Happy      93
Unhappy    44
Name: Happiness, dtype: int64

### Training Dataset

In [97]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'Unhappy': 31, 'Happy': 71})

### BalancedRandomForestClassifier

In [98]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
# Creating the decision tree classifier instance.
from imblearn.ensemble import BalancedRandomForestClassifier
model_brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# Fitting the model.
model_brfc.fit(X_train, y_train)
y_pred_brfc = model_brfc.predict(X_test)

In [99]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
balanced_accuracy_score(y_test, y_pred_brfc)

0.763986013986014

In [100]:
# Display the confusion matrix
# YOUR CODE HERE
cm_brfc = confusion_matrix(y_test, y_pred_brfc)
cm_brfc_df = pd.DataFrame(cm_brfc, index=['actual_happy', 'actual_unhappy'], columns=['predicted_happy', 'predicted_unhappy'])
cm_brfc_df

Unnamed: 0,predicted_happy,predicted_unhappy
actual_happy,15,7
actual_unhappy,2,11


In [101]:
# Print the imbalanced classification report
# YOUR CODE HERE
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_brfc))

                   pre       rec       spe        f1       geo       iba       sup

      Happy       0.88      0.68      0.85      0.77      0.76      0.57        22
    Unhappy       0.61      0.85      0.68      0.71      0.76      0.59        13

avg / total       0.78      0.74      0.79      0.75      0.76      0.57        35



### Feature Importance

In [102]:
# List the features sorted in descending order by feature importance
# YOUR CODE HERE
importance = sorted(zip(model_brfc.feature_importances_, X.columns), reverse=True)
importance

[(0.149442929799742, 'Vegetal_Products'),
 (0.12961676682348552, 'Animal_Products'),
 (0.09843198939467887, 'Eggs'),
 (0.07590682403510633, 'Animal_fats'),
 (0.06992262812975143, 'Meat'),
 (0.05864068976220862, 'Cereal_Excluding_Beer'),
 (0.05416890965606372, 'Alcoholic_Beverages'),
 (0.04767027431817313, 'Stimulants'),
 (0.03722386608824089, 'Milk_Excluding_Butter'),
 (0.035051756694154064, 'Starchy_Roots'),
 (0.03418295753778202, 'Oilcrops'),
 (0.0324013711034197, 'Obesity'),
 (0.02876545798429756, 'Vegtables'),
 (0.028384793237279826, 'Fish_Seafood'),
 (0.025606041313795055, 'Fruits_Excluding_Wine'),
 (0.018877036086403464, 'Pulses'),
 (0.017838183639324184, 'Spices'),
 (0.015528986657952406, 'Vegetable_Oils'),
 (0.013477005358437983, 'Offals'),
 (0.013237050590880753, 'Treenuts'),
 (0.010049484124381326, 'Miscellaneous'),
 (0.005574997664441131, 'Sugar_Crops')]

### EasyEnsembleClassifier

In [103]:
# Train the EasyEnsembleClassifier
# YOUR CODE HERE
from imblearn.ensemble import EasyEnsembleClassifier
model_eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
model_eec.fit(X_train, y_train)
y_pred_eec = model_eec.predict(X_test)

In [104]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
bal_acc_score_eec = balanced_accuracy_score(y_test, y_pred_eec)
print(bal_acc_score_eec)

0.763986013986014


In [105]:
# Display the confusion matrix
# YOUR CODE HERE
cm_eec = confusion_matrix(y_test, y_pred_eec)
cm_eec_df = pd.DataFrame(cm_eec, index=['actual_happy', 'actual_unhappy'], columns=['predicted_happy', 'predicted_unhappy'])
cm_eec_df

Unnamed: 0,predicted_happy,predicted_unhappy
actual_happy,15,7
actual_unhappy,2,11


In [106]:
# Print the imbalanced classification report
# YOUR CODE HERE
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

      Happy       0.88      0.68      0.85      0.77      0.76      0.57        22
    Unhappy       0.61      0.85      0.68      0.71      0.76      0.59        13

avg / total       0.78      0.74      0.79      0.75      0.76      0.57        35



### RandomOverSampler

In [107]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Unhappy': 71, 'Happy': 71})

In [108]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [109]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7255244755244755

In [110]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['actual_happy', 'actual_unhappy'],  columns=['predicted_happy', 'predicted_unhappy'])
cm_df

Unnamed: 0,predicted_happy,predicted_unhappy
actual_happy,15,7
actual_unhappy,3,10


In [111]:
# Print the imbalanced classification report
# YOUR CODE HERE
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Happy       0.83      0.68      0.77      0.75      0.72      0.52        22
    Unhappy       0.59      0.77      0.68      0.67      0.72      0.53        13

avg / total       0.74      0.71      0.74      0.72      0.72      0.52        35



### SMOTE

In [112]:
# Resample the training data with SMOTE
# YOUR CODE HERE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [113]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [114]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7797202797202798

In [115]:
# Display the confusion matrix
# YOUR CODE HERE
cm_smote = confusion_matrix(y_test, y_pred)
cm_smote_df = pd.DataFrame(cm_smote, index=['actual_happy', 'actual_unhappy'], columns = ['predicted_happy', 'predicted_unhappy'])
cm_smote_df

Unnamed: 0,predicted_happy,predicted_unhappy
actual_happy,14,8
actual_unhappy,1,12


In [116]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Happy       0.93      0.64      0.92      0.76      0.77      0.57        22
    Unhappy       0.60      0.92      0.64      0.73      0.77      0.60        13

avg / total       0.81      0.74      0.82      0.75      0.77      0.58        35



### ClusterCentroids Resampler

In [117]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
# YOUR CODE HERE
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [118]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [119]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7797202797202798

In [120]:
# Display the confusion matrix
# YOUR CODE HERE
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm_cluster_centroids = confusion_matrix(y_test, y_pred)
cm_cluster_centroids_df = pd.DataFrame(cm_cluster_centroids, index=['actual_happy', 'actual_unhappy'], columns=['predicted_happy', 'predicted_unhappy'])
cm_cluster_centroids_df

Unnamed: 0,predicted_happy,predicted_unhappy
actual_happy,15,7
actual_unhappy,2,11


In [121]:
# Print the imbalanced classification report
# YOUR CODE HERE
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Happy       0.88      0.68      0.85      0.77      0.76      0.57        22
    Unhappy       0.61      0.85      0.68      0.71      0.76      0.59        13

avg / total       0.78      0.74      0.79      0.75      0.76      0.57        35



### SMOTEENN

In [122]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
# YOUR CODE HERE
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [123]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [124]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.763986013986014

In [125]:
# Display the confusion matrix
# YOUR CODE HERE
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm_smoteenn = confusion_matrix(y_test, y_pred)
cm_smoteenn_df = pd.DataFrame(cm_smoteenn, index=['actual_happy', 'actual_unhappy'], columns = ['predicted_happy', 'predicted_unhappy'])
cm_smoteenn_df

Unnamed: 0,predicted_happy,predicted_unhappy
actual_happy,15,7
actual_unhappy,3,10


In [126]:
# Print the imbalanced classification report
# YOUR CODE HERE
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Happy       0.83      0.68      0.77      0.75      0.72      0.52        22
    Unhappy       0.59      0.77      0.68      0.67      0.72      0.53        13

avg / total       0.74      0.71      0.74      0.72      0.72      0.52        35

