# Countries, Flags and Happiness

This notebook explores the relationship between countries' flag features and their happiness scores.

In [None]:
# Libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Load datasets
# Both paths are relative, so the CSV files are looked up from the
# kernel's current working directory.
HAPPINESS_CSV = 'WorldHappiness.csv'
FLAGS_CSV = 'flags.csv'

happiness_df = pd.read_csv(HAPPINESS_CSV)
# The flags file is parsed with ';' as the field separator.
flags_df = pd.read_csv(FLAGS_CSV, sep=';')

In [None]:
# Standardize country names
def _normalize_names(names):
    """Return `names` with surrounding whitespace removed and text lower-cased."""
    trimmed = names.str.strip()
    return trimmed.str.lower()


# 'Country' and 'name' are the join keys of the merge further down, so both
# are brought to the same trimmed, lower-case form here.
happiness_df['Country'] = _normalize_names(happiness_df['Country'])
flags_df['name'] = _normalize_names(flags_df['name'])

In [None]:
# Merge datasets
# Strip stray whitespace from the column labels *before* joining. Doing it
# after the merge (as previously) can turn two labels that differ only by
# whitespace into duplicate column names; stripping first lets pandas
# disambiguate clashes with its usual _x / _y suffixes instead.
happiness_clean = happiness_df.rename(columns=str.strip)
flags_clean = flags_df.rename(columns=str.strip)

# Inner join on the normalised country name: only countries present in both
# tables survive.
merged_df = pd.merge(
    happiness_clean,
    flags_clean,
    left_on='Country',
    right_on='name',
    how='inner',
)
print(f'Merged shape: {merged_df.shape}')

# An inner join drops unmatched rows silently, so report which happiness
# countries found no flag record (typically spelling differences between the
# two sources) instead of losing them without a trace.
happiness_names = set(happiness_clean['Country'].dropna())
flag_names = set(flags_clean['name'].dropna())
unmatched = sorted(happiness_names - flag_names)
print(f'Happiness countries without a flag match ({len(unmatched)}): {unmatched}')

In [None]:
# Select features and target
# Flag columns used as model inputs, split into the colour-named columns and
# the remaining flag-design columns purely for readability.
colour_cols = ['red', 'green', 'blue', 'gold', 'white', 'black', 'orange']
design_cols = ['circles', 'crosses', 'saltires', 'quarters', 'sunstars',
               'crescent', 'triangle', 'icon', 'animate']
feature_cols = colour_cols + design_cols

# X holds the predictor columns, y the regression target.
X = merged_df.loc[:, feature_cols]
y = merged_df.loc[:, 'Happiness Score']

In [None]:
# 1. Histogram of Happiness Score
# 20-bin histogram of the target with a KDE curve drawn on top.
scores = merged_df['Happiness Score']
ax = sns.histplot(scores, bins=20, kde=True)
ax.set(
    title='Distribution of Happiness Score (Cleaned)',
    xlabel='Happiness Score',
    ylabel='Frequency',
)
plt.show()

In [None]:
# 2. Top 15 Happiest Countries
# Order the rows by score, highest first, and keep the leading 15.
ranked = merged_df.sort_values(by='Happiness Score', ascending=False)
top15 = ranked.head(15)

# Horizontal bars: country on the y axis, score on the x axis.
ax = sns.barplot(data=top15, x='Happiness Score', y='Country')
ax.set(
    title='Top 15 Happiest Countries',
    xlabel='Happiness Score',
    ylabel='Country',
)
plt.show()

In [None]:
# 3. Happiness Score by Region
# One box per region showing the spread of happiness scores.
ax = sns.boxplot(data=merged_df, x='Region', y='Happiness Score')
ax.set(
    title='Happiness Score by Region (Cleaned)',
    xlabel='Region',
    ylabel='Happiness Score',
)
# Turn the x tick labels vertical.
plt.xticks(rotation=90)
plt.show()

In [None]:
# 4. Correlation Heatmap
# Pairwise correlations between all numeric columns of the merged table.
numeric_df = merged_df.select_dtypes(include='number')
corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    annot_kws={'size': 6},  # small font so the per-cell values fit
    ax=ax,
)
ax.set_title('Correlation Heatmap (Cleaned Data)', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.yticks(fontsize=8)
fig.tight_layout()
plt.show()

In [None]:
# 5. Freedom vs Happiness Score
# Scatter of Freedom against the happiness score, coloured by region.
sns.scatterplot(data=merged_df, x='Freedom', y='Happiness Score', hue='Region')
plt.title('Freedom vs Happiness Score (Cleaned Data)')
plt.xlabel('Freedom')
plt.ylabel('Happiness Score')
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Pearson correlation test between Freedom and Happiness Score.
# `pearsonr` is imported with the other libraries at the top of the notebook.
# Keep only rows where both values are present: a missing value would
# otherwise propagate into the statistic.
freedom_pairs = merged_df[['Freedom', 'Happiness Score']].dropna()
r_value, p_value = pearsonr(freedom_pairs['Freedom'], freedom_pairs['Happiness Score'])
print('Hypothesis Testing: Freedom vs Happiness Score')
print(f'Correlation Coefficient (r): {r_value:.4f}')
# General format instead of fixed-point, so a very small p-value is shown in
# scientific notation rather than being rounded to 0.0000.
print(f'p-value: {p_value:.4g}')

In [None]:
# Train-test split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): if the happiness file has several rows per country, a random
# row-level split can place the same flag in both train and test — confirm.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [None]:
# Train Random Forest Regressor
# 100-tree forest with a fixed seed, fitted on the training rows.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the truth.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('R² Score:', r2)
print('MSE:', mse)

In [None]:
# Show sample predictions
# Attach the true score, the predicted score and the country name to each
# held-out feature row. `assign` works on a copy, so X_test is left untouched.
comparison_df = X_test.assign(**{
    'Actual Happiness': y_test,
    'Predicted Happiness': y_pred,
    # Look the country up by the test rows' index labels; .values drops the
    # index so the names are placed positionally.
    'Country': merged_df.loc[y_test.index, 'Country'].values,
})
residual = comparison_df['Actual Happiness'] - comparison_df['Predicted Happiness']
comparison_df['Error'] = residual.abs()

# The "sample" shown is the three rows with the largest absolute error.
report_cols = ['Country', 'Actual Happiness', 'Predicted Happiness', 'Error']
by_error = comparison_df[report_cols].sort_values(by='Error', ascending=False)
sample_results = by_error.head(3)
print(sample_results)

In [None]:
# 1. Histogram of Happiness Score
# NOTE(review): this cell repeats the histogram cell that appears before the
# modelling section of this notebook; consider deleting one of the two.
scores = merged_df['Happiness Score']
ax = sns.histplot(scores, bins=20, kde=True)
ax.set(
    title='Distribution of Happiness Score (Cleaned)',
    xlabel='Happiness Score',
    ylabel='Frequency',
)
plt.show()

In [None]:
# 2. Top 15 Happiest Countries
# NOTE(review): this cell repeats the top-15 cell that appears before the
# modelling section of this notebook; consider deleting one of the two.
ranked = merged_df.sort_values(by='Happiness Score', ascending=False)
top15 = ranked.head(15)

# Horizontal bars: country on the y axis, score on the x axis.
ax = sns.barplot(data=top15, x='Happiness Score', y='Country')
ax.set(
    title='Top 15 Happiest Countries',
    xlabel='Happiness Score',
    ylabel='Country',
)
plt.show()

In [None]:
# 3. Happiness Score by Region
# NOTE(review): this cell repeats the region box-plot cell that appears before
# the modelling section of this notebook; consider deleting one of the two.
ax = sns.boxplot(data=merged_df, x='Region', y='Happiness Score')
ax.set(
    title='Happiness Score by Region (Cleaned)',
    xlabel='Region',
    ylabel='Happiness Score',
)
# Turn the x tick labels vertical.
plt.xticks(rotation=90)
plt.show()

In [None]:
# 4. Correlation Heatmap
# NOTE(review): this cell repeats the heatmap cell that appears before the
# modelling section of this notebook; consider deleting one of the two.
numeric_df = merged_df.select_dtypes(include='number')
corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    annot_kws={'size': 6},  # small font so the per-cell values fit
    ax=ax,
)
ax.set_title('Correlation Heatmap (Cleaned Data)', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.yticks(fontsize=8)
fig.tight_layout()
plt.show()

In [None]:
# 5. Freedom vs Happiness Score
# NOTE(review): this cell repeats the Freedom-vs-Happiness cell that appears
# before the modelling section of this notebook; consider deleting one of them.
sns.scatterplot(data=merged_df, x='Freedom', y='Happiness Score', hue='Region')
plt.title('Freedom vs Happiness Score (Cleaned Data)')
plt.xlabel('Freedom')
plt.ylabel('Happiness Score')
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Pearson correlation test between Freedom and Happiness Score.
# `pearsonr` is imported with the other libraries at the top of the notebook.
# Keep only rows where both values are present: a missing value would
# otherwise propagate into the statistic.
freedom_pairs = merged_df[['Freedom', 'Happiness Score']].dropna()
r_value, p_value = pearsonr(freedom_pairs['Freedom'], freedom_pairs['Happiness Score'])
print('Hypothesis Testing: Freedom vs Happiness Score')
print(f'Correlation Coefficient (r): {r_value:.4f}')
# General format instead of fixed-point, so a very small p-value is shown in
# scientific notation rather than being rounded to 0.0000.
print(f'p-value: {p_value:.4g}')