# Analysis of The World Happiness Report

The World Happiness Report is an annual publication that ranks countries based on their citizens' happiness levels. This notebook performs an exploratory data analysis of the 2015–2019 report data, followed by simple clustering (K-means) and regression (random forest) models.

---
By Wanga Mulaudzi
<br>
21 July 2024

## Import Statements

In [88]:
import geopandas as gpd # For loading a map of the world
import io # For handling bytes
import ipywidgets as widgets # For interactive plotting
from IPython.display import display # For interactive plotting
import matplotlib.pyplot as plt # For plotting
import pandas as pd # For handling the data in tabular form
import requests # For downloading data
import seaborn as sns # For plotting
from sklearn.cluster import KMeans # For unsupervised clustering
from sklearn.decomposition import PCA # For calculating principal components of features
from sklearn.ensemble import RandomForestRegressor # For supervised learning
from sklearn.metrics import mean_squared_error, r2_score # For calculating metrics of trained model
from sklearn.model_selection import train_test_split # For train test splitting the data
from sklearn.preprocessing import StandardScaler # For normalizing the features

## Read in the Data

In [2]:
# Load the 2015 report and preview its first rows
happiness_2015 = pd.read_csv(filepath_or_buffer="data/2015.csv")
happiness_2015.head(n=3)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204


In [3]:
# Load the 2016 report and preview its first rows
happiness_2016 = pd.read_csv(filepath_or_buffer="data/2016.csv")
happiness_2016.head(n=3)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137


In [4]:
# Load the 2017 report and preview its first rows
happiness_2017 = pd.read_csv(filepath_or_buffer="data/2017.csv")
happiness_2017.head(n=3)

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715


In [5]:
# Load the 2018 report and preview its first rows
happiness_2018 = pd.read_csv(filepath_or_buffer="data/2018.csv")
happiness_2018.head(n=3)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408


In [6]:
# Load the 2019 report and preview its first rows
happiness_2019 = pd.read_csv(filepath_or_buffer="data/2019.csv")
happiness_2019.head(n=3)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341


### Make column names consistent

In [7]:
# Inspect the raw 2017 column names before they are renamed
happiness_2017.head(n=3)

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715


In [8]:
# Map the dot-separated 2017 headers onto the names used by the 2015/2016 tables
column_names_2017 = {
    "Happiness.Rank": "Happiness Rank",
    "Happiness.Score": "Happiness Score",
    "Whisker.high": "Upper Confidence Interval",
    "Whisker.low": "Lower Confidence Interval",
    "Economy..GDP.per.Capita.": "Economy (GDP per Capita)",
    "Health..Life.Expectancy.": "Health (Life Expectancy)",
    "Trust..Government.Corruption.": "Trust (Government Corruption)",
    "Dystopia.Residual": "Dystopia Residual",
}
happiness_2017.rename(columns=column_names_2017, inplace=True)

happiness_2017.head(n=3)

Unnamed: 0,Country,Happiness Rank,Happiness Score,Upper Confidence Interval,Lower Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption),Dystopia Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715


In [9]:
# Inspect the raw 2018 column names before they are renamed
happiness_2018.head(n=3)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408


In [10]:
# Map the 2018 headers onto the names used by the earlier tables.
# NOTE(review): the 2015-2017 tables have a "Family" column and no
# "Social Support" column; confirm whether the two should share one name.
column_names_2018 = {
    "Overall rank": "Happiness Rank",
    "Country or region": "Country",
    "Score": "Happiness Score",
    "GDP per capita": "Economy (GDP per Capita)",
    "Social support": "Social Support",
    "Healthy life expectancy": "Health (Life Expectancy)",
    "Freedom to make life choices": "Freedom",
    "Perceptions of corruption": "Trust (Government Corruption)",
}
happiness_2018.rename(columns=column_names_2018, inplace=True)

happiness_2018.head(n=3)

Unnamed: 0,Happiness Rank,Country,Happiness Score,Economy (GDP per Capita),Social Support,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408


In [11]:
# Inspect the raw 2019 column names before they are renamed
happiness_2019.head(n=3)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341


In [12]:
# Map the 2019 headers onto the names used by the earlier tables.
# NOTE(review): the 2015-2017 tables have a "Family" column and no
# "Social Support" column; confirm whether the two should share one name.
column_names_2019 = {
    "Overall rank": "Happiness Rank",
    "Country or region": "Country",
    "Score": "Happiness Score",
    "GDP per capita": "Economy (GDP per Capita)",
    "Social support": "Social Support",
    "Healthy life expectancy": "Health (Life Expectancy)",
    "Freedom to make life choices": "Freedom",
    "Perceptions of corruption": "Trust (Government Corruption)",
}
happiness_2019.rename(columns=column_names_2019, inplace=True)

happiness_2019.head(n=3)

Unnamed: 0,Happiness Rank,Country,Happiness Score,Economy (GDP per Capita),Social Support,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341


## Understand the Data

### Check for NaNs, empty rows, missing values, etc.

In [13]:
# Fraction of missing values per column in the 2015 table, largest first
happiness_2015.isna().sum().sort_values(ascending=False).div(len(happiness_2015))

Country                          0.0
Region                           0.0
Happiness Rank                   0.0
Happiness Score                  0.0
Standard Error                   0.0
Economy (GDP per Capita)         0.0
Family                           0.0
Health (Life Expectancy)         0.0
Freedom                          0.0
Trust (Government Corruption)    0.0
Generosity                       0.0
Dystopia Residual                0.0
dtype: float64

In [14]:
# Fraction of missing values per column in the 2016 table, largest first
happiness_2016.isna().sum().sort_values(ascending=False).div(len(happiness_2016))

Country                          0.0
Region                           0.0
Happiness Rank                   0.0
Happiness Score                  0.0
Lower Confidence Interval        0.0
Upper Confidence Interval        0.0
Economy (GDP per Capita)         0.0
Family                           0.0
Health (Life Expectancy)         0.0
Freedom                          0.0
Trust (Government Corruption)    0.0
Generosity                       0.0
Dystopia Residual                0.0
dtype: float64

In [15]:
# Fraction of missing values per column in the 2017 table, largest first
happiness_2017.isna().sum().sort_values(ascending=False).div(len(happiness_2017))

Country                          0.0
Happiness Rank                   0.0
Happiness Score                  0.0
Upper Confidence Interval        0.0
Lower Confidence Interval        0.0
Economy (GDP per Capita)         0.0
Family                           0.0
Health (Life Expectancy)         0.0
Freedom                          0.0
Generosity                       0.0
Trust (Government Corruption)    0.0
Dystopia Residual                0.0
dtype: float64

In [16]:
# Fraction of missing values per column in the 2018 table, largest first
happiness_2018.isna().sum().sort_values(ascending=False).div(len(happiness_2018))

Trust (Government Corruption)    0.00641
Happiness Rank                   0.00000
Country                          0.00000
Economy (GDP per Capita)         0.00000
Happiness Score                  0.00000
Social Support                   0.00000
Health (Life Expectancy)         0.00000
Freedom                          0.00000
Generosity                       0.00000
dtype: float64

In [17]:
# Show every 2018 row that has at least one missing value
happiness_2018.loc[happiness_2018.isna().any(axis="columns")]

Unnamed: 0,Happiness Rank,Country,Happiness Score,Economy (GDP per Capita),Social Support,Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
19,20,United Arab Emirates,6.774,2.096,0.776,0.67,0.284,0.186,


The United Arab Emirates has a NaN for its perception of corruption (the "Trust (Government Corruption)" column), so we remove that row from the 2018 table.

In [18]:
# Drop, in place, every 2018 row that contains a missing value, then report
# how many rows remain
happiness_2018.dropna(axis=0, how="any", inplace=True)
happiness_2018.shape[0]

155

In [19]:
# Fraction of missing values per column in the 2019 table, largest first
happiness_2019.isna().sum().sort_values(ascending=False).div(len(happiness_2019))

Happiness Rank                   0.0
Country                          0.0
Happiness Score                  0.0
Economy (GDP per Capita)         0.0
Social Support                   0.0
Health (Life Expectancy)         0.0
Freedom                          0.0
Generosity                       0.0
Trust (Government Corruption)    0.0
dtype: float64

### Check for duplicated rows

In [20]:
# Count 2015 rows that exactly repeat an earlier row
happiness_2015.duplicated(keep="first").sum()

np.int64(0)

In [21]:
# Count 2016 rows that exactly repeat an earlier row
happiness_2016.duplicated(keep="first").sum()

np.int64(0)

In [22]:
# Count 2017 rows that exactly repeat an earlier row
happiness_2017.duplicated(keep="first").sum()

np.int64(0)

In [23]:
# Count 2018 rows that exactly repeat an earlier row
happiness_2018.duplicated(keep="first").sum()

np.int64(0)

In [24]:
# Count 2019 rows that exactly repeat an earlier row
happiness_2019.duplicated(keep="first").sum()

np.int64(0)

### Make sure numerical columns are of type float

In [25]:
# Display the dtype of every column in the 2015 table
happiness_2015.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Standard Error                   float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

In [26]:
# Display the dtype of every column in the 2016 table
happiness_2016.dtypes

Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

In [27]:
# Display the dtype of every column in the 2017 table
happiness_2017.dtypes

Country                           object
Happiness Rank                     int64
Happiness Score                  float64
Upper Confidence Interval        float64
Lower Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Generosity                       float64
Trust (Government Corruption)    float64
Dystopia Residual                float64
dtype: object

In [28]:
# Display the dtype of every column in the 2018 table
happiness_2018.dtypes

Happiness Rank                     int64
Country                           object
Happiness Score                  float64
Economy (GDP per Capita)         float64
Social Support                   float64
Health (Life Expectancy)         float64
Freedom                          float64
Generosity                       float64
Trust (Government Corruption)    float64
dtype: object

In [29]:
# Display the dtype of every column in the 2019 table
happiness_2019.dtypes

Happiness Rank                     int64
Country                           object
Happiness Score                  float64
Economy (GDP per Capita)         float64
Social Support                   float64
Health (Life Expectancy)         float64
Freedom                          float64
Generosity                       float64
Trust (Government Corruption)    float64
dtype: object

## Data Augmentation

### Add year columns

In [30]:
# Tag every row with the report year its table came from
yearly_tables = (happiness_2015, happiness_2016, happiness_2017, happiness_2018, happiness_2019)
for report_year, yearly_table in zip(range(2015, 2020), yearly_tables):
    yearly_table["Year"] = report_year

### Collect the yearly tables in a list

In [31]:
# Yearly tables in chronological order (2015 first, 2019 last)
tables_list = [
    happiness_2015,
    happiness_2016,
    happiness_2017,
    happiness_2018,
    happiness_2019,
]

## Analysis
### Choropleth Plots

In [108]:
# Download the Natural Earth 1:110m country boundaries (zipped shapefile)
url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
response = requests.get(url, timeout=60)  # Do not hang forever if the server is unreachable
response.raise_for_status()  # Stop on an HTTP error instead of parsing an error page as a shapefile
world = gpd.read_file(io.BytesIO(response.content))

# Year selector for the choropleth; plot_choropleth looks up the selected
# entry's position in years_list to pick the matching table from tables_list
years_list = [str(report_year) for report_year in range(2015, 2020)]
dropdown = widgets.Dropdown(options=years_list, value="2015", description="Year")

def plot_choropleth(year):
    """Draw and save a world map shaded by the Happiness Score of ``year``.

    ``year`` must be one of the strings in ``years_list``; its position there
    selects the matching table from ``tables_list``. Map polygons without a
    matching row in that table are drawn in light grey. The figure is written
    to ``images/world-happiness-report-<year>.png`` and then shown.
    """
    yearly_table = tables_list[years_list.index(year)]

    # A left join keeps every map polygon, including those with no happiness row.
    # NOTE(review): the join key is the map's SOVEREIGNT field; names that are
    # spelled differently in the report will not match - verify coverage.
    merged_map = world.merge(yearly_table, how="left", left_on="SOVEREIGNT", right_on="Country")

    fig, ax = plt.subplots(figsize=(15, 7))

    merged_map.plot(
        column="Happiness Score",
        ax=ax,
        legend=True,
        legend_kwds={"label": "Happiness Score"},
        cmap="viridis",
        missing_kwds={"color": "lightgrey"},
    )

    # A map needs no axis frame or ticks
    ax.set_axis_off()
    ax.set_title(f"World Happiness Report - {year}", fontsize=16)

    fig.savefig(f"images/world-happiness-report-{year}.png")
    plt.show()

# Output area reserved for the choropleth. It has its own name because later
# cells create their own Output widgets; with one shared global name this
# callback would draw into whichever widget was created most recently.
output_choropleth = widgets.Output()

def update_plot(change):
    """Redraw the choropleth for the year found in ``change["new"]``."""
    with output_choropleth:
        output_choropleth.clear_output(wait=True)
        plot_choropleth(change["new"])

# Redraw whenever the dropdown value changes
dropdown.observe(update_plot, names="value")

# Show the dropdown and the initial plot. plot_choropleth already saves each
# figure, so nothing is saved here: after plt.show() there is no figure left
# to save and a savefig call would only write a blank image.
display(dropdown, output_choropleth)
update_plot({"new": "2015"})

Dropdown(description='Year', options=('2015', '2016', '2017', '2018', '2019'), value='2015')

Output()

<Figure size 640x480 with 0 Axes>

![Plot Name](images/world-happiness-report-2015.png)
![Plot Name](images/world-happiness-report-2016.png)
![Plot Name](images/world-happiness-report-2017.png)
![Plot Name](images/world-happiness-report-2018.png)
![Plot Name](images/world-happiness-report-2019.png)

### Correlations between happiness scores and other factors

In [113]:
# Year selector for the correlation heatmap
years_list = [str(report_year) for report_year in range(2015, 2020)]
dropdown_years_heatmap = widgets.Dropdown(
    options=years_list,
    value="2015",
    description="Year",
)

def update_heatmap(selected_year):
    """Plot and save the correlation heatmap of one year's table.

    ``selected_year`` must be one of the strings in ``years_list``; its
    position there selects the table from ``tables_list``. The pairwise
    correlations between the Happiness Score and five factor columns are drawn
    as an annotated heatmap, saved to
    ``images/correlation-factors-<year>.png`` and shown.
    """
    df = tables_list[years_list.index(selected_year)]

    columns = ["Happiness Score", "Economy (GDP per Capita)", "Health (Life Expectancy)",
               "Freedom", "Trust (Government Corruption)", "Generosity"]
    correlation_matrix = df[columns].corr()

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
    ax.set_title(f"Correlation Heatmap of Happiness Factors for {selected_year}")

    # bbox_inches="tight" grows the saved canvas to fit the long tick labels,
    # which a plain savefig can cut off at the figure edge
    fig.savefig(f"images/correlation-factors-{selected_year}.png", bbox_inches="tight")
    plt.show()

# Output area reserved for the heatmap. It has its own name because other
# cells create their own Output widgets; with one shared global name this
# callback would draw into whichever widget was created most recently.
output_heatmap = widgets.Output()

def update_plot_heatmap(change):
    """Redraw the heatmap for the year found in ``change["new"]``."""
    with output_heatmap:
        output_heatmap.clear_output(wait=True)
        update_heatmap(change["new"])

# Redraw whenever the dropdown value changes
dropdown_years_heatmap.observe(update_plot_heatmap, names="value")

# Show the dropdown and the initial plot
display(dropdown_years_heatmap, output_heatmap)
update_plot_heatmap({"new": "2015"})

Dropdown(description='Year', options=('2015', '2016', '2017', '2018', '2019'), value='2015')

Output()

![Plot Name](images/correlation-factors-2015.png)
![Plot Name](images/correlation-factors-2016.png)
![Plot Name](images/correlation-factors-2017.png)
![Plot Name](images/correlation-factors-2018.png)
![Plot Name](images/correlation-factors-2019.png)

### Happiness Score over time per country

In [35]:
# Stack the yearly tables into one long table. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every year would contribute its own
# 0-based labels and a label lookup such as .loc[0] would match several rows.
tables_concat = pd.concat(objs=tables_list, ignore_index=True)
tables_concat.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual,Year,Lower Confidence Interval,Upper Confidence Interval,Social Support
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015,,,
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015,,,
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015,,,
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015,,,
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015,,,


In [64]:
# One row per country, holding its scores and the matching years as lists.
# NOTE(review): a country whose name is spelled differently across the yearly
# files would end up in separate groups - verify the names are consistent.
score_columns = ["Country", "Happiness Score", "Year"]
country_group = (
    tables_concat[score_columns]
    .groupby("Country")
    .agg({"Happiness Score": list, "Year": list})
    .reset_index()
)

country_group.head()

Unnamed: 0,Country,Happiness Score,Year
0,Afghanistan,"[3.575, 3.36, 3.79399991035461, 3.632, 3.203]","[2015, 2016, 2017, 2018, 2019]"
1,Albania,"[4.959, 4.655, 4.64400005340576, 4.586, 4.719]","[2015, 2016, 2017, 2018, 2019]"
2,Algeria,"[5.605, 6.355, 5.87200021743774, 5.295, 5.211]","[2015, 2016, 2017, 2018, 2019]"
3,Angola,"[4.033, 3.866, 3.79500007629395, 3.795]","[2015, 2016, 2017, 2018]"
4,Argentina,"[6.574, 6.65, 6.59899997711182, 6.388, 6.086]","[2015, 2016, 2017, 2018, 2019]"


In [115]:
# Country selector for the happiness-over-time bar chart,
# listing the country names in alphabetical order
countries_list = sorted(country_group["Country"].unique())
dropdown_countries_happiness = widgets.Dropdown(
    options=countries_list,
    value="Switzerland",
    description="Country",
)

def plot_happiness_over_time(selected_country):
    """Plot and save one country's Happiness Score per year as a bar chart.

    The scores and years come from the ``country_group`` row whose "Country"
    equals ``selected_country`` (an IndexError is raised if there is none).
    The figure is saved under ``images/`` with spaces in the country name
    replaced by hyphens, and then shown.
    """
    # The single row holding this country's lists of years and scores
    country_row = country_group[country_group["Country"] == selected_country]
    years = country_row["Year"].values[0]
    scores = country_row["Happiness Score"].values[0]

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(x=years, height=scores, color="black")

    # One tick per year present, so the axis never shows fractional years
    ax.set_xticks(years)

    ax.set_title(f"Happiness Score Over Time For {selected_country}")
    ax.set_xlabel("Year")
    ax.set_ylabel("Happiness Score")

    # Write each bar's value just above it
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., bar_height, f"{bar_height:.2f}", ha="center", va="bottom")

    ax.set_ylim(0, 10)

    # Hyphenate the name so multi-word countries get a space-free file name
    # (e.g. happiness-over-time-South-Africa.png)
    file_stem = selected_country.replace(" ", "-")
    fig.savefig(f"images/happiness-over-time-{file_stem}.png")
    plt.show()

# Output area reserved for the bar chart. It has its own name because other
# cells create their own Output widgets; with one shared global name this
# callback would draw into whichever widget was created most recently.
output_happiness = widgets.Output()

def update_plot_happiness(change):
    """Redraw the bar chart for the country found in ``change["new"]``."""
    with output_happiness:
        output_happiness.clear_output(wait=True)
        plot_happiness_over_time(change["new"])

# Redraw whenever the dropdown value changes
dropdown_countries_happiness.observe(update_plot_happiness, names="value")

# Show the dropdown and the initial plot
display(dropdown_countries_happiness, output_happiness)
update_plot_happiness({"new": "Switzerland"})

Dropdown(description='Country', index=145, options=('Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina'…

Output()

![Plot Name](images/happiness-over-time-Japan.png)
![Plot Name](images/happiness-over-time-Netherlands.png)
![Plot Name](images/happiness-over-time-South-Africa.png)
![Plot Name](images/happiness-over-time-Switzerland.png)
![Plot Name](images/happiness-over-time-Zimbabwe.png)

### Most contributing factor to the Happiness Score

In [116]:
# Year selector for the average-factor bar chart
years_list = [str(report_year) for report_year in range(2015, 2020)]
dropdown_years_factors = widgets.Dropdown(
    options=years_list,
    value="2015",
    description="Year",
)

def update_contributing_factor(selected_year):
    """Plot and save the column means of one year's table as a bar chart.

    ``selected_year`` must be one of the strings in ``years_list``; its
    position there selects the table from ``tables_list``. The mean of the
    Happiness Score and of five factor columns is drawn in descending order,
    saved to ``images/av-factor-contribution-<year>.png`` and shown.
    """
    df = tables_list[years_list.index(selected_year)]

    columns = ["Happiness Score", "Economy (GDP per Capita)", "Health (Life Expectancy)",
               "Freedom", "Trust (Government Corruption)", "Generosity"]
    av_contribution = df[columns].mean().sort_values(ascending=False)

    # NOTE(review): five colours for six bars - the sixth bar presumably
    # reuses the first colour; confirm whether that is intended.
    colors = ["red", "green", "blue", "purple", "orange"]

    fig, ax = plt.subplots(figsize=(6, 4))
    av_contribution.plot(kind="bar", color=colors, ax=ax)

    ax.set_title("Average Contribution of Factors to Happiness Score")
    ax.set_xlabel("Factor")
    ax.set_ylabel("Average Contribution")

    # Write each bar's value just above it
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., bar_height, f"{bar_height:.2f}", ha="center", va="bottom")

    ax.set_ylim(0, 10)

    # bbox_inches="tight" grows the saved canvas to fit the long tick labels,
    # which a plain savefig can cut off at the figure edge
    fig.savefig(f"images/av-factor-contribution-{selected_year}.png", bbox_inches="tight")
    plt.show()

# Output area reserved for the factor chart. It has its own name because other
# cells create their own Output widgets; with one shared global name this
# callback would draw into whichever widget was created most recently.
output_factors = widgets.Output()

def update_plot_factors(change):
    """Redraw the factor chart for the year found in ``change["new"]``."""
    with output_factors:
        output_factors.clear_output(wait=True)
        update_contributing_factor(change["new"])

# Redraw whenever the dropdown value changes
dropdown_years_factors.observe(update_plot_factors, names="value")

# Show the dropdown and the initial plot
display(dropdown_years_factors, output_factors)
update_plot_factors({"new": "2015"})

Dropdown(description='Year', options=('2015', '2016', '2017', '2018', '2019'), value='2015')

Output()

![Plot Name](images/av-factor-contribution-2015.png)
![Plot Name](images/av-factor-contribution-2016.png)
![Plot Name](images/av-factor-contribution-2017.png)
![Plot Name](images/av-factor-contribution-2018.png)
![Plot Name](images/av-factor-contribution-2019.png)

### Unsupervised Learning
#### K-Means Clustering

In [117]:
# Year selector intended for the K-means cluster plot
years_list = [str(report_year) for report_year in range(2015, 2020)]
dropdown_years_kmeans = widgets.Dropdown(
    options=years_list,
    value="2015",
    description="Year",
)

def update_kmeans(selected_year):
    """Cluster one year's countries with K-means and plot the clusters.

    ``selected_year`` must be one of the strings in ``years_list``; its
    position there selects the table from ``tables_list``. Five factor columns
    are standardised and clustered with K-means, then projected onto their
    first two principal components for plotting only. A few countries are
    marked with diamonds and name labels. The figure is saved to
    ``images/kmeans-<year>.png`` and the share of variance captured by the two
    components is printed.

    This function has its own name so that it does not replace the bar-chart
    function ``update_contributing_factor`` defined earlier in the notebook.
    """
    df = tables_list[years_list.index(selected_year)]

    features = ["Economy (GDP per Capita)", "Health (Life Expectancy)", "Freedom",
                "Trust (Government Corruption)", "Generosity"]

    # Standardise so that no feature dominates the distance computation
    X_scaled = StandardScaler().fit_transform(df[features])

    # The number of clusters is a modelling choice, not tied to the feature count
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)

    # Reduce to two dimensions for visualisation
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    fig, ax = plt.subplots(figsize=(8, 8))

    # Draw each cluster and remember its colour for the highlighted countries.
    # Clusters get neutral labels: a cluster index says nothing about any
    # particular feature, so feature names would mislead in the legend.
    cluster_colors = {}
    for cluster_id in range(n_clusters):
        members = X_pca[cluster_labels == cluster_id]
        scatter = ax.scatter(members[:, 0], members[:, 1], label=f"Cluster {cluster_id + 1}")
        cluster_colors[cluster_id] = scatter.get_facecolor()[0]

    # Project the centroids into the same two-dimensional space
    centroids_pca = pca.transform(kmeans.cluster_centers_)
    ax.scatter(centroids_pca[:, 0], centroids_pca[:, 1], s=300, c="black", marker="*", label="Centroids")

    highlight_countries = ["Switzerland", "United States", "Japan", "Brazil", "India", "South Africa"]

    # enumerate yields row positions, which line up with X_pca and cluster_labels
    for position, country in enumerate(df["Country"]):
        if country not in highlight_countries:
            continue

        point_x, point_y = X_pca[position, 0], X_pca[position, 1]
        cluster_color = cluster_colors[cluster_labels[position]]

        # Diamond marker in the colour of the country's cluster, plus its name
        ax.scatter(point_x, point_y, color=cluster_color, marker="D", s=50)
        ax.annotate(country, (point_x, point_y), xytext=(5, 5), textcoords="offset points")

    ax.set_title("Clusters of Countries based on Happiness Factors", fontsize=16)
    ax.set_xlabel("First Principal Component", fontsize=12)
    ax.set_ylabel("Second Principal Component", fontsize=12)

    ax.legend()
    ax.grid(True, linestyle="--", alpha=0.7)
    fig.tight_layout()

    fig.savefig(f"images/kmeans-{selected_year}.png")
    plt.show()

    # Share of the total variance retained by the two plotted components
    print(f"Variance explained: {pca.explained_variance_ratio_.sum():.2f}")


# Output area reserved for the cluster plot. It has its own name because other
# cells create their own Output widgets; with one shared global name this
# callback would draw into whichever widget was created most recently.
output_kmeans = widgets.Output()

def update_plot_kmeans(change):
    """Redraw the cluster plot for the year found in ``change["new"]``."""
    with output_kmeans:
        output_kmeans.clear_output(wait=True)
        update_kmeans(change["new"])

# Attach the callback to this section's own dropdown, not to the dropdown of
# the factor bar chart
dropdown_years_kmeans.observe(update_plot_kmeans, names="value")

# Show the dropdown and the initial plot
display(dropdown_years_kmeans, output_kmeans)
update_plot_kmeans({"new": "2015"})

Dropdown(description='Year', index=4, options=('2015', '2016', '2017', '2018', '2019'), value='2019')

Output()

![Plot Name](images/kmeans-2015.png)
![Plot Name](images/kmeans-2016.png)
![Plot Name](images/kmeans-2017.png)
![Plot Name](images/kmeans-2018.png)
![Plot Name](images/kmeans-2019.png)

### Supervised Learning
#### Random Forest Regressor

In [118]:
# Year selector for the random forest feature-importance chart
years_list = [str(report_year) for report_year in range(2015, 2020)]
dropdown_years_rforest = widgets.Dropdown(
    options=years_list,
    value="2015",
    description="Year",
)

def update_random_forest(selected_year):
    """Fit a random forest on one year's table and plot its feature importances.

    ``selected_year`` must be one of the strings in ``years_list``; its
    position there selects the table from ``tables_list``. Five factor columns
    are used to predict the Happiness Score with an 80/20 train/test split.
    The feature importances are drawn as a bar chart, saved to
    ``images/random-forest-<year>.png`` and shown; the test-set mean squared
    error and R-squared score are printed afterwards.
    """
    df = tables_list[years_list.index(selected_year)]

    features = ["Economy (GDP per Capita)", "Health (Life Expectancy)", "Freedom",
                "Trust (Government Corruption)", "Generosity"]

    # Features and target
    X = df[features]
    y = df["Happiness Score"]

    # Fixed random_state keeps the split and the forest reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Importances, most important feature first
    importance = pd.DataFrame({"Feature": features, "Importance": model.feature_importances_})
    importance = importance.sort_values("Importance", ascending=False)

    fig, ax = plt.subplots(figsize=(6, 4))

    colors = ["red", "green", "blue", "purple", "orange"]
    sns.barplot(x="Feature", y="Importance", data=importance, palette=colors, hue="Feature", ax=ax)

    ax.set_title(f"Feature Importance in Predicting Happiness Score for {selected_year}")

    # Rotate x-axis labels
    plt.xticks(rotation=90, ha="right")

    ax.set_ylim(0, 1)

    # bbox_inches="tight" grows the saved canvas to fit the rotated tick
    # labels, which a plain savefig can cut off at the figure edge
    fig.savefig(f"images/random-forest-{selected_year}.png", bbox_inches="tight")
    plt.show()

    print("Mean Squared Error: %.3f"%(mse))
    print("R-squared Score: %.3f"%(r2))

# Output area reserved for the importance chart. It has its own name because
# other cells create their own Output widgets; with one shared global name,
# re-running any of them would redirect this callback's drawing.
output_rforest = widgets.Output()

def update_plot_rforest(change):
    """Refit and redraw for the year found in ``change["new"]``."""
    with output_rforest:
        output_rforest.clear_output(wait=True)
        update_random_forest(change["new"])

# Redraw whenever the dropdown value changes
dropdown_years_rforest.observe(update_plot_rforest, names="value")

# Show the dropdown and the initial plot
display(dropdown_years_rforest, output_rforest)
update_plot_rforest({"new": "2015"})

Dropdown(description='Year', options=('2015', '2016', '2017', '2018', '2019'), value='2015')

Output()

![Plot Name](images/random-forest-2015.png)
![Plot Name](images/random-forest-2016.png)
![Plot Name](images/random-forest-2017.png)
![Plot Name](images/random-forest-2018.png)
![Plot Name](images/random-forest-2019.png)