# Countries, Flags and Happiness (Classifier Version)

This notebook trains a classifier to predict a country's happiness category — "high" (at or above the median happiness score) or "low" (below it) — from features of its flag.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [None]:
# Load the two source tables; the flags file is semicolon-delimited.
happiness_df = pd.read_csv('WorldHappiness.csv')
flags_df = pd.read_csv('flags.csv', sep=';')

# Strip stray whitespace from the headers BEFORE any column is looked up by
# name. Cleaning the headers only after the merge cannot protect the
# 'Country' / 'name' lookups below, which would raise KeyError on a padded header.
happiness_df.columns = happiness_df.columns.str.strip()
flags_df.columns = flags_df.columns.str.strip()

# Normalise the join keys (trim + lower-case) so that differences in case or
# padding between the two files do not prevent a match.
happiness_df['Country'] = happiness_df['Country'].str.strip().str.lower()
flags_df['name'] = flags_df['name'].str.strip().str.lower()

# Inner join: only countries whose normalised name appears in both tables are kept.
merged_df = pd.merge(happiness_df, flags_df, left_on='Country', right_on='name', how='inner')

print(f"Merged shape: {merged_df.shape}")


In [None]:
# Histogram (20 bins) of the happiness scores with a KDE curve overlaid.
happiness_scores = merged_df['Happiness Score']

fig, ax = plt.subplots()
sns.histplot(happiness_scores, bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Happiness Score')
ax.set_xlabel('Happiness Score')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Order the rows from highest to lowest happiness score and keep the first 15.
ranked_by_score = merged_df.sort_values(by='Happiness Score', ascending=False)
top15 = ranked_by_score.head(15)

# Horizontal bars: one per row of `top15`, bar length = happiness score.
fig, ax = plt.subplots()
sns.barplot(data=top15, x='Happiness Score', y='Country', ax=ax)
ax.set(
    title='Top 15 Happiest Countries',
    xlabel='Happiness Score',
    ylabel='Country',
)
plt.show()


In [None]:
# --- Happiness score per region -------------------------------------------
fig, ax = plt.subplots()
sns.boxplot(data=merged_df, x='Region', y='Happiness Score', ax=ax)
plt.xticks(rotation=90)  # vertical tick labels so the region names do not overlap
ax.set_title('Happiness Score by Region')
plt.tight_layout()
plt.show()

# --- Correlation heatmap ---------------------------------------------------
# Pairwise correlations between the numeric columns only.
numeric_columns = merged_df.select_dtypes(include='number')
corr_matrix = numeric_columns.corr()

# Keep the 8 columns with the largest absolute correlation to the happiness
# score. The score's own column is part of this ranking, so it normally takes
# one of the 8 slots.
score_corr_strength = corr_matrix['Happiness Score'].abs().sort_values(ascending=False)
top_corr_features = score_corr_strength.head(8).index
filtered_corr = corr_matrix.loc[top_corr_features, top_corr_features]

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(filtered_corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Top Correlated Features with Happiness Score')
plt.tight_layout()
plt.show()


In [None]:
# Scatter of Freedom against Happiness Score, one point per row, coloured by region.
sns.scatterplot(data=merged_df, x='Freedom', y='Happiness Score', hue='Region')
plt.title('Freedom vs Happiness Score')
plt.tight_layout()
plt.show()

# Compute the Pearson correlation on complete pairs only. A missing value in
# either column would otherwise flow straight into pearsonr and invalidate the
# statistic. When nothing is missing this selects exactly the same rows as before.
freedom_score_pairs = merged_df[['Freedom', 'Happiness Score']].dropna()
r_value, p_value = pearsonr(
    freedom_score_pairs['Freedom'],
    freedom_score_pairs['Happiness Score'],
)
print(f"Pearson r: {r_value:.4f}")
print(f"p-value: {p_value:.8f}")


In [None]:
# Build the binary target: a row is 'high' when its happiness score is at or
# above the median of all scores, and 'low' otherwise.
median_happiness = merged_df['Happiness Score'].median()
at_or_above_median = merged_df['Happiness Score'] >= median_happiness
merged_df['Happiness Class'] = at_or_above_median.map({True: 'high', False: 'low'})

# Flag columns used as model inputs: the colour columns first, then the
# remaining design columns. The order matches the column order of X.
colour_cols = ['red', 'green', 'blue', 'gold', 'white', 'black', 'orange']
design_cols = [
    'circles', 'crosses', 'saltires', 'quarters', 'sunstars',
    'crescent', 'triangle', 'icon', 'animate',
]
feature_cols = colour_cols + design_cols

# Feature matrix and target vector for the classifier.
X = merged_df.loc[:, feature_cols]
y = merged_df['Happiness Class']


In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on y so the
# 'high' / 'low' proportions are the same in the train and test sets; without
# it, a small test set can end up noticeably imbalanced by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random forest with 100 trees; the fixed seed makes the fit repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# Pin the label order to model.classes_ so the matrix layout is unambiguous:
# rows are true classes, columns are predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=model.classes_))


In [None]:
# 5-fold cross-validated accuracy over the full feature matrix X and target y,
# as a complement to the single train/test split.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())
