In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


# Input / output locations (relative to the notebook's working directory)
pathWorldBankIndex = r"world_bank_development_indicators.csv"
pathHappinessIndex = r"WHR25_Data_Figure_2.1v3.xlsx"
output_path = "A.csv"

# Load both raw sources: the World Bank indicators (CSV) and the
# World Happiness Report figure data (Excel).
print("Loading data")
try:
    df_wb = pd.read_csv(pathWorldBankIndex)
    df_wh = pd.read_excel(pathHappinessIndex)
except FileNotFoundError as e:
    # Report the missing file, then re-raise so execution stops here.
    # exit() is avoided on purpose: inside a notebook it shuts the whole
    # kernel down and discards every variable already computed.
    print(f"Error: {e}")
    raise
print("Data succesfully load.")

Loading data
Data succesfully load.


In [41]:
# Pre-processing: restrict both sources to a common year window, average every
# indicator per country, and join the two per-country tables.

# World Bank: parse the day/month/year date strings (unparseable values become
# NaT) and derive a numeric year column from the parsed dates.
parsed_dates = pd.to_datetime(df_wb['date'], format='%d/%m/%Y', errors='coerce')
df_wb = df_wb.assign(date_obj=parsed_dates, Year_Num=parsed_dates.dt.year)

# Happiness: force Year to a numeric dtype (non-numeric entries become NaN)
df_wh = df_wh.assign(Year=pd.to_numeric(df_wh['Year'], errors='coerce'))

# Keep only 2012..2024, both ends included; rows with a missing year drop out
year_range = (2012, 2024)
df_wb = df_wb.loc[df_wb['Year_Num'].between(*year_range)].copy()
df_wh = df_wh.loc[df_wh['Year'].between(*year_range)].copy()

# World Bank: per-country mean of every numeric indicator. The helper year
# column is numeric too, so it is excluded explicitly.
numeric_cols_wb = [
    col
    for col in df_wb.select_dtypes(include=[np.number]).columns
    if col != 'Year_Num'
]
wb_grouped = df_wb.groupby('country')[numeric_cols_wb].mean()

# Happiness: per-country mean of the life-evaluation score
target_col_original = 'Life evaluation (3-year average)'
target_col_new = 'Life_Evaluation_Avg'
wh_grouped = df_wh.groupby('Country name')[target_col_original].mean()

# Inner join on the country-name index: only countries whose name appears in
# both sources survive. The target column gets a shorter name afterwards.
final_df = (
    wb_grouped
    .merge(wh_grouped, left_index=True, right_index=True, how='inner')
    .rename(columns={target_col_original: target_col_new})
)

# Imputation: a column with no values at all becomes 0; otherwise gaps are
# filled with that column's mean over the merged countries.
for col in final_df.columns:
    column = final_df[col]
    final_df[col] = 0 if column.isnull().all() else column.fillna(column.mean())


In [42]:
# Persist the merged per-country table to output_path and report its size
country_count = len(final_df)
final_df.to_csv(output_path)
print(f"Merged dataset saved to {output_path}")
print(f"Countries analyzed: {country_count}\n")

Merged dataset saved to A.csv
Countries analyzed: 147



In [43]:
# Calculate and print the correlation of every indicator with the
# life-evaluation target, strongest (by |Spearman|) first.
print(f"{'Index Name':<55} | {'Pearson':<10} | {'Spearman':<10}")
print("-" * 80)

# We iterate through all columns except the target itself
indicators = [col for col in final_df.columns if col != target_col_new]

# One (indicator, pearson, spearman) tuple per indicator
results = []

for ind in indicators:
    # Pearson: linear association
    p_corr = final_df[ind].corr(final_df[target_col_new], method='pearson')
    # Spearman: rank (monotonic) association
    s_corr = final_df[ind].corr(final_df[target_col_new], method='spearman')

    results.append((ind, p_corr, s_corr))

# Sort by absolute SPEARMAN correlation (tuple position 2), descending.
# A correlation is NaN when a column is constant (for example an all-empty
# column that was filled with 0). NaN cannot be ordered, so such rows are
# pushed to the end instead of scrambling the ranking.
results.sort(
    key=lambda row: (pd.isna(row[2]), 0.0 if pd.isna(row[2]) else -abs(row[2]))
)

for ind, p, s in results:
    print(f"{ind:<55} | {p:>9.4f}  | {s:>9.4f}")

Index Name                                              | Pearson    | Spearman  
--------------------------------------------------------------------------------
life_expectancy_at_birth                                |    0.8220  |    0.8342
individuals_using_internet%                             |    0.8210  |    0.8239
access_to_electricity%                                  |    0.7166  |    0.8114
human_capital_index                                     |    0.8134  |    0.8050
goverment_effectiveness_estimate                        |    0.7783  |    0.7571
regulatory_quality_estimate                             |    0.7391  |    0.7353
government_health_expenditure%                          |    0.7263  |    0.7307
logistic_performance_index                              |    0.7367  |    0.7250
birth_rate                                              |   -0.7261  |   -0.7014
rule_of_law_estimate                                    |    0.7276  |    0.6794
control_of_corruption_estim

In [44]:
# Scatter plots (with linear trend) of hand-picked indicators against the
# life-evaluation score, for all merged countries. One PNG per indicator.

# Load dataset
input_file = "A.csv"
output_folder = "plots_selected"
df = pd.read_csv(input_file, index_col=0)

# Create output folder (no error if it already exists)
os.makedirs(output_folder, exist_ok=True)

# Target variable: each indicator is plotted and correlated against it
target = 'Life_Evaluation_Avg'

# List of specific indicators to plot (These are the things that i find interesting to look at)
indicators = [
    'life_expectancy_at_birth',
    'individuals_using_internet%',
    'access_to_electricity%',
    'human_capital_index',
    'goverment_effectiveness_estimate',
    'government_health_expenditure%',
    'birth_rate',
    'rule_of_law_estimate',
    'CO2_emisions',
    'renewvable_energy_consumption%',
    'government_expenditure_on_education%'
]

print(f"Generating plots for {len(indicators)}")

for ind in indicators:
    # Check if column exists in the dataset
    if ind not in df.columns:
        print(f"Skipping '{ind}': Column not found in A.csv") # I expect no error here
        continue

    # Use only rows where both the indicator and the target are present
    plot_data = df[[ind, target]].dropna()
    x = plot_data[ind]
    y = plot_data[target]

    # Calculate correlations
    pearson = x.corr(y, method='pearson')
    spearman = x.corr(y, method='spearman')

    # Create Plot (explicit figure/axes so each figure can be closed reliably)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.grid(True, linestyle='--', alpha=0.5)

    ax.scatter(x, y, alpha=0.7, c='teal', edgecolors='black', s=70)

    # Add Trend Line. A straight-line fit needs at least two distinct x
    # values; with a constant x the least-squares problem is degenerate.
    if len(x) > 1 and x.nunique() > 1:
        z = np.polyfit(x, y, 1)
        p = np.poly1d(z)
        x_trend = np.linspace(x.min(), x.max(), 100)
        ax.plot(x_trend, p(x_trend), "r--", linewidth=2, label='Linear Trend')
        # The label above is only visible once a legend is drawn; keep it
        # away from the statistics box in the upper-left corner.
        ax.legend(loc='lower right')

    # Labels and Title
    ax.set_xlabel(ind)
    ax.set_ylabel("Life Evaluation")
    ax.set_title(f"{ind} vs Life Evaluation")

    # Display Correlation Stats on Plot (axes-relative coordinates)
    stats = f"Pearson: {pearson:.4f}\nSpearman: {spearman:.4f}"
    ax.text(0.05, 0.95, stats, transform=ax.transAxes, fontsize=11, verticalalignment='top',
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.9))

    # SAVE ('%' and spaces are replaced to keep file names portable)
    safe_name = ind.replace("%", "pct").replace(" ", "_")
    fig.savefig(os.path.join(output_folder, f"{safe_name}.png"))
    plt.close(fig)

    print(f"Saved: {safe_name}.png | Pearson: {pearson:.4f} | Spearman: {spearman:.4f}")

print(f"\nFolder: '{output_folder}'")

Generating plots for 11
Saved: life_expectancy_at_birth.png | Pearson: 0.8220 | Spearman: 0.8342
Saved: individuals_using_internetpct.png | Pearson: 0.8210 | Spearman: 0.8239
Saved: access_to_electricitypct.png | Pearson: 0.7166 | Spearman: 0.8114
Saved: human_capital_index.png | Pearson: 0.8134 | Spearman: 0.8050
Saved: goverment_effectiveness_estimate.png | Pearson: 0.7783 | Spearman: 0.7571
Saved: government_health_expenditurepct.png | Pearson: 0.7263 | Spearman: 0.7307
Saved: birth_rate.png | Pearson: -0.7261 | Spearman: -0.7014
Saved: rule_of_law_estimate.png | Pearson: 0.7276 | Spearman: 0.6794
Saved: CO2_emisions.png | Pearson: 0.0797 | Spearman: 0.5080
Saved: renewvable_energy_consumptionpct.png | Pearson: -0.5186 | Spearman: -0.4965
Saved: government_expenditure_on_educationpct.png | Pearson: 0.3829 | Spearman: 0.4008

Folder: 'plots_selected'


In [45]:
# Split the merged dataset into the 30 countries with the highest and the
# 30 with the lowest average life evaluation, and save each subset.
input_file = "A.csv"
output_file_best = "B.csv"
output_file_worst = "C.csv"

print(f"Loading {input_file}")
try:
    df = pd.read_csv(input_file, index_col=0)
except FileNotFoundError:
    # Re-raise instead of exit(): inside a notebook exit() shuts the kernel
    # down and throws away all state.
    print(f"Error: {input_file} not found.")
    raise

# We sort in descending order (highest happiness first)
df_sorted_desc = df.sort_values(by='Life_Evaluation_Avg', ascending=False)
df_top30 = df_sorted_desc.head(30)

# Save to B.csv
df_top30.to_csv(output_file_best)
print(f"Top 30 best countries saved to '{output_file_best}'.")

# We sort in ascending order (lowest happiness first)
df_sorted_asc = df.sort_values(by='Life_Evaluation_Avg', ascending=True)
df_bottom30 = df_sorted_asc.head(30)

# Save to C.csv
df_bottom30.to_csv(output_file_worst)
print(f"\nTop 30 lowest countries saved to '{output_file_worst}'.")

Loading A.csv
Top 30 best countries saved to 'B.csv'.

Top 30 lowest countries saved to 'C.csv'.


In [46]:
# Correlation of every numeric indicator with life evaluation, restricted to
# the Top 30 dataset.
input_file = "B.csv"
target_col = "Life_Evaluation_Avg"

print(f"Loading {input_file}...")
try:
    df = pd.read_csv(input_file, index_col=0)
except FileNotFoundError:
    # Re-raise instead of exit(): inside a notebook exit() shuts the kernel
    # down and throws away all state.
    print(f"Error: {input_file} not found.")
    raise

# Every numeric column except the target is treated as an indicator
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

# One (indicator, pearson, spearman) tuple per indicator
results = []

print(f"\nCalculating correlations for {len(numeric_cols)} - '{target_col}'\n")

# Calculate Correlations
for col in numeric_cols:
    # Pearson: linear association
    pearson = df[col].corr(df[target_col], method='pearson')
    # Spearman: rank (monotonic) association
    spearman = df[col].corr(df[target_col], method='spearman')

    results.append((col, pearson, spearman))

# SORT by absolute Spearman correlation (tuple position 2), descending.
# NaN correlations (e.g. a column that is constant within this subset)
# cannot be ordered, so they are placed last instead of scrambling the ranking.
results.sort(
    key=lambda row: (pd.isna(row[2]), 0.0 if pd.isna(row[2]) else -abs(row[2]))
)

# PRINT
print(f"{'Index Name':<55} | {'Pearson':<10} | {'Spearman':<10}")
print("-" * 80)

for name, p, s in results:
    print(f"{name:<55} | {p:>9.4f}  | {s:>9.4f}")

Loading B.csv...

Calculating correlations for 48 - 'Life_Evaluation_Avg'

Index Name                                              | Pearson    | Spearman  
--------------------------------------------------------------------------------
voice_and_accountability_estimate                       |    0.6605  |    0.7944
rule_of_law_estimate                                    |    0.6809  |    0.7539
control_of_corruption_estimate                          |    0.6352  |    0.6752
goverment_effectiveness_estimate                        |    0.6010  |    0.6730
regulatory_quality_estimate                             |    0.6302  |    0.6240
government_expenditure_on_education%                    |    0.6623  |    0.6240
regulatory_quality_std                                  |    0.3543  |    0.5818
life_expectancy_at_birth                                |    0.6087  |    0.5764
individuals_using_internet%                             |    0.5707  |    0.5626
research_and_development_expendit

In [47]:
# Correlation of every numeric indicator with life evaluation, restricted to
# the Bottom 30 dataset.
input_file = "C.csv"
target_col = "Life_Evaluation_Avg"

print(f"Loading {input_file} (Bottom 30)...")
try:
    df = pd.read_csv(input_file, index_col=0)
except FileNotFoundError:
    # Re-raise instead of exit(): inside a notebook exit() shuts the kernel
    # down and throws away all state.
    print(f"Error: {input_file} not found.")
    raise

# Every numeric column except the target is treated as an indicator
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

# One (indicator, pearson, spearman) tuple per indicator
results = []

print(f"\nCalculating correlations for {len(numeric_cols)} - '{target_col}'\n")

for col in numeric_cols:
    # Pearson: linear association
    pearson = df[col].corr(df[target_col], method='pearson')
    # Spearman: rank (monotonic) association
    spearman = df[col].corr(df[target_col], method='spearman')

    results.append((col, pearson, spearman))

# Sort by absolute Spearman correlation (tuple position 2), descending.
# NaN correlations (e.g. a column that is constant within this subset)
# cannot be ordered, so they are placed last instead of scrambling the ranking.
results.sort(
    key=lambda row: (pd.isna(row[2]), 0.0 if pd.isna(row[2]) else -abs(row[2]))
)

# print
print(f"{'Index Name (Bottom 30)':<55} | {'Pearson':<10} | {'Spearman':<10}")
print("-" * 80)

for name, p, s in results:
    print(f"{name:<55} | {p:>9.4f}  | {s:>9.4f}")

Loading C.csv (Bottom 30)...

Calculating correlations for 48 - 'Life_Evaluation_Avg'

Index Name (Bottom 30)                                  | Pearson    | Spearman  
--------------------------------------------------------------------------------
CO2_emisions                                            |    0.1143  |    0.4403
GDP_current_US                                          |    0.1283  |    0.4198
gini_index                                              |   -0.2007  |   -0.3919
political_stability_std                                 |   -0.4083  |   -0.3789
other_greenhouse_emisions                               |    0.1231  |    0.3717
expense%                                                |   -0.3819  |   -0.3601
control_of_corruption_std                               |   -0.3542  |   -0.3517
voice_and_accountability_std                            |   -0.3505  |   -0.3444
population                                              |    0.1208  |    0.3353
time_to_get_operation

In [48]:
# Scatter plots (with linear trend and extreme-point labels) of selected
# indicators against life evaluation, for the Top 30 dataset (B.csv).
input_file = "B.csv"
output_folder = "Bplots"
target = 'Life_Evaluation_Avg'

# Create folder (no error if it already exists)
os.makedirs(output_folder, exist_ok=True)

print(f"Loading {input_file}")
try:
    df = pd.read_csv(input_file, index_col=0)
except FileNotFoundError:
    # Re-raise instead of exit(): inside a notebook exit() shuts the kernel
    # down and throws away all state.
    print(f"Error: {input_file} not found.")
    raise

indicators = [
    'life_expectancy_at_birth',
    'individuals_using_internet%',
    'access_to_electricity%',
    'human_capital_index',
    'goverment_effectiveness_estimate',
    'government_health_expenditure%',
    'birth_rate',
    'CO2_emisions',
    'renewvable_energy_consumption%',
    'voice_and_accountability_estimate',
    'control_of_corruption_estimate'
]

print(f"Generating plots for {len(indicators)} indices")

# Loop and Plot
for ind in indicators:
    if ind not in df.columns:
        print(f"'{ind}': Column not found in CSV.")
        continue

    # Use only rows where both the indicator and the target are present
    subset = df[[ind, target]].dropna()
    x = subset[ind]
    y = subset[target]

    if len(x) < 2:
        print(f"'{ind}': Not enough data points.")
        continue

    # Calculate Correlations
    pearson = x.corr(y, method='pearson')
    spearman = x.corr(y, method='spearman')

    # Setup Plot (explicit figure/axes so each figure can be closed reliably)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.grid(True, linestyle='--', alpha=0.5)

    ax.scatter(x, y, alpha=0.8, c='royalblue', edgecolors='black', s=80)

    # Trend Line (best effort: a failed fit is reported, the plot is still saved)
    try:
        z = np.polyfit(x, y, 1)
        p = np.poly1d(z)
        x_trend = np.linspace(x.min(), x.max(), 100)
        ax.plot(x_trend, p(x_trend), "r--", linewidth=2, label='Linear Trend')
        # The label is only visible once a legend is drawn; keep it away
        # from the statistics box in the upper-left corner.
        ax.legend(loc='lower right')
    except Exception as e:
        print(f"error {ind}: {e}")

    # Titles and Labels
    ax.set_xlabel(ind)
    ax.set_ylabel("Life Evaluation (Top 30)")
    ax.set_title(f"{ind} vs Happiness\n(Top 30 Countries Only)")

    stats_text = f"Pearson: {pearson:.4f}\nSpearman: {spearman:.4f}"
    ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
            fontsize=11, verticalalignment='top',
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.9))

    ### Label Outliers (Min/Max X) (This part Taken from Chat GPT)
    # The index labels of the rows with the largest / smallest x value are
    # written next to their points. Labelling stays best effort, but a failure
    # is reported instead of being silently swallowed by a bare except.
    try:
        max_idx = x.idxmax()
        min_idx = x.idxmin()
        ax.text(x.loc[max_idx], y.loc[max_idx], f" {max_idx}", fontsize=9, fontweight='bold')
        # Only label min if it's a different row, to avoid drawing the same
        # label twice on one point
        if min_idx != max_idx:
            ax.text(x.loc[min_idx], y.loc[min_idx], f" {min_idx}", fontsize=9, fontweight='bold')
    except Exception as e:
        print(f"label error {ind}: {e}")
    ### Chat gpt ends

    # SAVE ('%' and spaces are replaced to keep file names portable)
    safe_name = ind.replace("%", "pct").replace(" ", "_")
    save_path = os.path.join(output_folder, f"{safe_name}.png")
    fig.savefig(save_path)
    plt.close(fig)

    print(f"Saved: {safe_name}.png")

print(f"\nFinish ->'{output_folder}' folder.")

Loading B.csv
Generating plots for 11 indices
Saved: life_expectancy_at_birth.png
Saved: individuals_using_internetpct.png
Saved: access_to_electricitypct.png
Saved: human_capital_index.png
Saved: goverment_effectiveness_estimate.png
Saved: government_health_expenditurepct.png
Saved: birth_rate.png
Saved: CO2_emisions.png
Saved: renewvable_energy_consumptionpct.png
Saved: voice_and_accountability_estimate.png
Saved: control_of_corruption_estimate.png

Finish ->'Bplots' folder.


In [49]:
# Scatter plots (with linear trend and extreme-point labels) of selected
# indicators against life evaluation, for the Bottom 30 dataset (C.csv).
input_file = "C.csv"
output_folder = "Cplots"
target = 'Life_Evaluation_Avg'

# Create folder (no error if it already exists)
os.makedirs(output_folder, exist_ok=True)

print(f"Loading {input_file}")
try:
    df = pd.read_csv(input_file, index_col=0)
except FileNotFoundError:
    # Re-raise instead of exit(): inside a notebook exit() shuts the kernel
    # down and throws away all state.
    print(f"Error: {input_file} not found.")
    raise

indicators = [
    'life_expectancy_at_birth',
    'individuals_using_internet%',
    'access_to_electricity%',
    'human_capital_index',
    'goverment_effectiveness_estimate',
    'government_health_expenditure%',
    'birth_rate',
    'CO2_emisions',
    'renewvable_energy_consumption%',
    'voice_and_accountability_estimate',
    'control_of_corruption_estimate',
    'GDP_current_US'
]

print(f"Generating plots for {len(indicators)}")

# Loop and Plot
for ind in indicators:
    if ind not in df.columns:
        print(f"'{ind}': Column not found in CSV.")
        continue

    # Use only rows where both the indicator and the target are present
    subset = df[[ind, target]].dropna()
    x = subset[ind]
    y = subset[target]

    if len(x) < 2:
        print(f"'{ind}': Not enough data points.")
        continue

    # Calculate Correlations
    pearson = x.corr(y, method='pearson')
    spearman = x.corr(y, method='spearman')

    # Setup Plot (explicit figure/axes so each figure can be closed reliably)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.scatter(x, y, alpha=0.8, c='salmon', edgecolors='black', s=80)

    # Trend Line (best effort: a failed fit is reported, the plot is still saved)
    try:
        z = np.polyfit(x, y, 1)
        p = np.poly1d(z)
        x_trend = np.linspace(x.min(), x.max(), 100)
        ax.plot(x_trend, p(x_trend), "b--", linewidth=2, label='Linear Trend')
        # The label is only visible once a legend is drawn; keep it away
        # from the statistics box in the upper-left corner.
        ax.legend(loc='lower right')
    except Exception as e:
        print(f"error {ind}: {e}")

    # Titles and Labels
    ax.set_xlabel(ind)
    ax.set_ylabel("Life Evaluation (Bottom 30)")
    ax.set_title(f"{ind} vs Happiness\n(Bottom 30 Countries Only)")

    stats_text = f"Pearson: {pearson:.4f}\nSpearman: {spearman:.4f}"
    ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
            fontsize=11, verticalalignment='top',
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.9))

    ### Label Outliers (Min/Max X) (This is also taken from chat gpt)
    # The index labels of the rows with the largest / smallest x value are
    # written next to their points. Labelling stays best effort, but a failure
    # is reported instead of being silently swallowed by a bare except.
    try:
        max_idx = x.idxmax()
        min_idx = x.idxmin()
        ax.text(x.loc[max_idx], y.loc[max_idx], f" {max_idx}", fontsize=9, fontweight='bold')
        # Only label min if it's different country
        if min_idx != max_idx:
            ax.text(x.loc[min_idx], y.loc[min_idx], f" {min_idx}", fontsize=9, fontweight='bold')
    except Exception as e:
        print(f"label error {ind}: {e}")
    ### End of chat gpt

    # SAVE ('%' and spaces are replaced to keep file names portable)
    safe_name = ind.replace("%", "pct").replace(" ", "_")
    save_path = os.path.join(output_folder, f"{safe_name}.png")
    fig.savefig(save_path)
    plt.close(fig)

    print(f"Saved: {safe_name}.png")

print(f"\n Finished -> '{output_folder}' folder.")

Loading C.csv
Generating plots for 12
Saved: life_expectancy_at_birth.png
Saved: individuals_using_internetpct.png
Saved: access_to_electricitypct.png
Saved: human_capital_index.png
Saved: goverment_effectiveness_estimate.png
Saved: government_health_expenditurepct.png
Saved: birth_rate.png
Saved: CO2_emisions.png
Saved: renewvable_energy_consumptionpct.png
Saved: voice_and_accountability_estimate.png
Saved: control_of_corruption_estimate.png
Saved: GDP_current_US.png

 Finished -> 'Cplots' folder.
