In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.impute import KNNImputer 

In [2]:
# Load one happiness table per year (2015-2021). The score column of these
# tables later becomes the regression target "y".
h2015, h2016, h2017, h2018, h2019, h2020, h2021 = map(
    pd.read_csv,
    ["2015.csv", "2016.csv", "2017.csv", "2018.csv", "2019.csv", "2020.csv", "2021.csv"],
)

# Relabel row 55 of the 2017 table as "Hong Kong".
# NOTE(review): presumably the raw label is a variant spelling - confirm against 2017.csv.
h2017.at[55, "Country"] = "Hong Kong"
# h2022=pd.read_csv("2022.csv").sort_values(by=['Country'])["Happiness score"]

# Positional collection of the yearly frames (index 0 is 2015, index 6 is 2021).
years = [h2015, h2016, h2017, h2018, h2019, h2020, h2021]

In [3]:
# Drop the 2015 columns that are not kept. This is done in place, so the entry
# held by the `years` list reflects the change as well.
h2015.drop(columns=["Happiness Rank", "Standard Error", "Family", "Region"], inplace=True)
# Then remove whichever column is last in the frame at this point.
h2015.drop(columns=h2015.columns[-1], inplace=True)

In [4]:
# Display the 2015 frame to check which columns remain after the drops.
h2015

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,Switzerland,7.587,1.39651,0.94143,0.66557,0.41978,0.29678
1,Iceland,7.561,1.30232,0.94784,0.62877,0.14145,0.43630
2,Denmark,7.527,1.32548,0.87464,0.64938,0.48357,0.34139
3,Norway,7.522,1.45900,0.88521,0.66973,0.36503,0.34699
4,Canada,7.427,1.32629,0.90563,0.63297,0.32957,0.45811
...,...,...,...,...,...,...,...
153,Rwanda,3.465,0.22208,0.42864,0.59201,0.55191,0.22628
154,Benin,3.340,0.28665,0.31910,0.48450,0.08010,0.18260
155,Syria,3.006,0.66320,0.72193,0.15684,0.18906,0.47179
156,Burundi,2.905,0.01530,0.22396,0.11850,0.10062,0.19727


In [5]:
# Drop the 2016 columns that are not kept (in place), then the trailing column.
columns_to_discard_2016 = [
    "Happiness Rank",
    "Family",
    "Region",
    "Upper Confidence Interval",
    "Lower Confidence Interval",
]
h2016.drop(columns=columns_to_discard_2016, inplace=True)
h2016.drop(columns=h2016.columns[-1], inplace=True)
# Display the result.
h2016

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,Denmark,7.526,1.44178,0.79504,0.57941,0.44453,0.36171
1,Switzerland,7.509,1.52733,0.86303,0.58557,0.41203,0.28083
2,Iceland,7.501,1.42666,0.86733,0.56624,0.14975,0.47678
3,Norway,7.498,1.57744,0.79579,0.59609,0.35776,0.37895
4,Finland,7.413,1.40598,0.81091,0.57104,0.41004,0.25492
...,...,...,...,...,...,...,...
152,Benin,3.484,0.39499,0.21028,0.39747,0.06681,0.20180
153,Afghanistan,3.360,0.38227,0.17344,0.16430,0.07112,0.31268
154,Togo,3.303,0.28123,0.24811,0.34678,0.11587,0.17517
155,Syria,3.069,0.74719,0.62994,0.06912,0.17233,0.48397


In [6]:
# Drop the 2017 columns that are not kept (in place).
columns_to_discard_2017 = [
    "Happiness.Rank",
    "Whisker.high",
    "Whisker.low",
    "Family",
    "Dystopia.Residual",
]
h2017.drop(columns=columns_to_discard_2017, inplace=True)
                    

In [7]:
# Translate the dotted 2017 headers into the column names of the 2015 frame.
# `rename` returns a new DataFrame, so `h2017` is rebound here.
column_names_2017 = {
    "Happiness.Score": "Happiness Score",
    "Economy..GDP.per.Capita.": "Economy (GDP per Capita)",
    "Health..Life.Expectancy.": "Health (Life Expectancy)",
    "Trust..Government.Corruption.": "Trust (Government Corruption)",
}
h2017 = h2017.rename(columns=column_names_2017)

In [8]:
# Drop the 2018 columns that are not kept (in place).
h2018.drop(columns=["Overall rank", "Social support"], inplace=True)

# Translate the 2018 headers into the column names of the 2015 frame.
# `rename` returns a new DataFrame, so `h2018` is rebound here.
column_names_2018 = {
    "Country or region": "Country",
    "Score": "Happiness Score",
    "GDP per capita": "Economy (GDP per Capita)",
    "Healthy life expectancy": "Health (Life Expectancy)",
    "Freedom to make life choices": "Freedom",
    "Perceptions of corruption": "Trust (Government Corruption)",
}
h2018 = h2018.rename(columns=column_names_2018)

In [9]:
# Drop the 2019 columns that are not kept (in place).
h2019.drop(columns=["Overall rank", "Social support"], inplace=True)

# Translate the 2019 headers into the column names of the 2015 frame.
# `rename` returns a new DataFrame, so `h2019` is rebound here.
column_names_2019 = {
    "Country or region": "Country",
    "Score": "Happiness Score",
    "GDP per capita": "Economy (GDP per Capita)",
    "Healthy life expectancy": "Health (Life Expectancy)",
    "Freedom to make life choices": "Freedom",
    "Perceptions of corruption": "Trust (Government Corruption)",
}
h2019 = h2019.rename(columns=column_names_2019)

In [10]:
# Display the 2019 frame after the drop and rename.
h2019

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,Finland,7.769,1.340,0.986,0.596,0.153,0.393
1,Denmark,7.600,1.383,0.996,0.592,0.252,0.410
2,Norway,7.554,1.488,1.028,0.603,0.271,0.341
3,Iceland,7.494,1.380,1.026,0.591,0.354,0.118
4,Netherlands,7.488,1.396,0.999,0.557,0.322,0.298
...,...,...,...,...,...,...,...
151,Rwanda,3.334,0.359,0.614,0.555,0.217,0.411
152,Tanzania,3.231,0.476,0.499,0.417,0.276,0.147
153,Afghanistan,3.203,0.350,0.361,0.000,0.158,0.025
154,Central African Republic,3.083,0.026,0.105,0.225,0.235,0.035


In [11]:
# Drop the 2020 columns that are not kept (in place). The raw factor columns go;
# the "Explained by: ..." columns stay and are renamed afterwards.
columns_to_discard_2020 = [
    "Regional indicator",
    "Standard error of ladder score",
    "upperwhisker",
    "lowerwhisker",
    "Social support",
    "Logged GDP per capita",
    "Explained by: Social support",
    "Ladder score in Dystopia",
    "Dystopia + residual",
    "Freedom to make life choices",
    "Healthy life expectancy",
    "Generosity",
    "Perceptions of corruption",
]
h2020.drop(columns=columns_to_discard_2020, inplace=True)

In [12]:
# Translate the 2020 headers into the column names of the 2015 frame.
# `rename` returns a new DataFrame, so `h2020` is rebound here.
column_names_2020 = {
    "Country name": "Country",
    "Ladder score": "Happiness Score",
    "Explained by: Log GDP per capita": "Economy (GDP per Capita)",
    "Explained by: Healthy life expectancy": "Health (Life Expectancy)",
    "Explained by: Freedom to make life choices": "Freedom",
    "Explained by: Perceptions of corruption": "Trust (Government Corruption)",
    "Explained by: Generosity": "Generosity",
}
h2020 = h2020.rename(columns=column_names_2020)

In [13]:
# Display the 2020 frame after the drop and rename.
h2020

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,Finland,7.8087,1.285190,0.961271,0.662317,0.159670,0.477857
1,Denmark,7.6456,1.326949,0.979333,0.665040,0.242793,0.495260
2,Switzerland,7.5599,1.390774,1.040533,0.628954,0.269056,0.407946
3,Iceland,7.5045,1.326502,1.000843,0.661981,0.362330,0.144541
4,Norway,7.4880,1.424207,1.008072,0.670201,0.287985,0.434101
...,...,...,...,...,...,...,...
148,Central African Republic,3.4759,0.041072,0.000000,0.292814,0.253513,0.028265
149,Rwanda,3.3123,0.343243,0.572383,0.604088,0.235705,0.485542
150,Zimbabwe,3.2992,0.425564,0.375038,0.377405,0.151349,0.080929
151,South Sudan,2.8166,0.289083,0.208809,0.065609,0.209935,0.111157


In [14]:
# Drop the 2021 columns that are not kept (in place). The raw factor columns go;
# the "Explained by: ..." columns stay and are renamed below.
columns_to_discard_2021 = [
    "Regional indicator",
    "Standard error of ladder score",
    "upperwhisker",
    "lowerwhisker",
    "Social support",
    "Logged GDP per capita",
    "Explained by: Social support",
    "Ladder score in Dystopia",
    "Dystopia + residual",
    "Freedom to make life choices",
    "Healthy life expectancy",
    "Generosity",
    "Perceptions of corruption",
]
h2021.drop(columns=columns_to_discard_2021, inplace=True)

# Translate the 2021 headers into the column names of the 2015 frame.
# `rename` returns a new DataFrame, so `h2021` is rebound here.
column_names_2021 = {
    "Country name": "Country",
    "Ladder score": "Happiness Score",
    "Explained by: Log GDP per capita": "Economy (GDP per Capita)",
    "Explained by: Healthy life expectancy": "Health (Life Expectancy)",
    "Explained by: Freedom to make life choices": "Freedom",
    "Explained by: Perceptions of corruption": "Trust (Government Corruption)",
    "Explained by: Generosity": "Generosity",
}
h2021 = h2021.rename(columns=column_names_2021)

In [15]:
# Display the 2021 frame after the drop and rename.
h2021

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,Finland,7.842,1.446,0.741,0.691,0.124,0.481
1,Denmark,7.620,1.502,0.763,0.686,0.208,0.485
2,Switzerland,7.571,1.566,0.816,0.653,0.204,0.413
3,Iceland,7.554,1.482,0.772,0.698,0.293,0.170
4,Netherlands,7.464,1.501,0.753,0.647,0.302,0.384
...,...,...,...,...,...,...,...
144,Lesotho,3.512,0.451,0.007,0.405,0.103,0.015
145,Botswana,3.467,1.099,0.340,0.539,0.027,0.088
146,Rwanda,3.415,0.364,0.407,0.627,0.227,0.493
147,Zimbabwe,3.145,0.457,0.243,0.359,0.157,0.075


In [16]:
# Print the (rows, columns) shape of every entry in `years`, labelled by its
# position in the list.
for position, frame in enumerate(years):
    print(position, ":", frame.shape)

0 : (158, 7)
1 : (157, 7)
2 : (155, 7)
3 : (156, 7)
4 : (156, 7)
5 : (153, 7)
6 : (149, 7)


In [17]:
# Align every yearly frame to the column order of the 2015 frame.
#
# The frames are rebound by name, and `years` is rebuilt from them, because the
# `years` list created at load time still points at the pre-`rename` objects for
# 2017-2021. Reindexing only those list entries never reached the h20xx
# variables that are concatenated afterwards, and it replaced the un-renamed
# columns of the list entries with NaN.
column_order = h2015.columns

h2015, h2016, h2017, h2018, h2019, h2020, h2021 = (
    frame.reindex(columns=column_order)
    for frame in (h2015, h2016, h2017, h2018, h2019, h2020, h2021)
)

# Keep the positional list in sync with the aligned frames.
years = [h2015, h2016, h2017, h2018, h2019, h2020, h2021]



In [18]:
# Display the 2018 frame.
h2018

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Generosity,Trust (Government Corruption)
0,Finland,7.632,1.305,0.874,0.681,0.202,0.393
1,Norway,7.594,1.456,0.861,0.686,0.286,0.340
2,Denmark,7.555,1.351,0.868,0.683,0.284,0.408
3,Iceland,7.495,1.343,0.914,0.677,0.353,0.138
4,Switzerland,7.487,1.420,0.927,0.660,0.256,0.357
...,...,...,...,...,...,...,...
151,Yemen,3.355,0.442,0.343,0.244,0.083,0.064
152,Tanzania,3.303,0.455,0.381,0.481,0.270,0.097
153,South Sudan,3.254,0.337,0.177,0.112,0.224,0.106
154,Central African Republic,3.083,0.024,0.010,0.305,0.218,0.038


In [19]:
# Stack the seven yearly frames into a single table in one call.
# ignore_index=True gives the result a fresh 0..n-1 index, which replaces the
# separate reset_index(drop=True) after every pairwise concat.
concatenated_df = pd.concat(
    [h2015, h2016, h2017, h2018, h2019, h2020, h2021],
    ignore_index=True,
)


In [20]:
# Turn every cell equal to 0 into a missing value, then discard each row that
# contains at least one missing value.
concatenated_df = concatenated_df.where(concatenated_df != 0).dropna()

In [21]:
# Renumber the rows 0..n-1 (the old index is discarded, not kept as a column).
concatenated_df=concatenated_df.reset_index(drop=True)


In [22]:
# Pairwise correlations between the numeric columns only. "Country" holds
# strings, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# instead of silently skipping them, so the numeric subset is selected explicitly.
concatenated_df.select_dtypes(include="number").corr()

Unnamed: 0,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
Happiness Score,1.0,0.79002,0.734426,0.55877,0.407588,0.099994
Economy (GDP per Capita),0.79002,1.0,0.768292,0.378027,0.333797,-0.0416
Health (Life Expectancy),0.734426,0.768292,1.0,0.353753,0.292482,-0.003778
Freedom,0.55877,0.378027,0.353753,1.0,0.433425,0.216602
Trust (Government Corruption),0.407588,0.333797,0.292482,0.433425,1.0,0.284519
Generosity,0.099994,-0.0416,-0.003778,0.216602,0.284519,1.0


In [23]:
# `full_data` is a second name for the same DataFrame object; no copy is made here.
full_data=concatenated_df

In [24]:
# Display the combined, filtered table.
full_data

Unnamed: 0,Country,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,Switzerland,7.587,1.39651,0.94143,0.66557,0.41978,0.29678
1,Iceland,7.561,1.30232,0.94784,0.62877,0.14145,0.43630
2,Denmark,7.527,1.32548,0.87464,0.64938,0.48357,0.34139
3,Norway,7.522,1.45900,0.88521,0.66973,0.36503,0.34699
4,Canada,7.427,1.32629,0.90563,0.63297,0.32957,0.45811
...,...,...,...,...,...,...,...
1042,Malawi,3.600,0.11300,0.29800,0.48400,0.13400,0.21300
1043,Lesotho,3.512,0.45100,0.00700,0.40500,0.01500,0.10300
1044,Botswana,3.467,1.09900,0.34000,0.53900,0.08800,0.02700
1045,Rwanda,3.415,0.36400,0.40700,0.62700,0.49300,0.22700


In [25]:
# Exchange the positions of the "Happiness Score" and "Generosity" columns;
# every other column keeps its place.
first_col, last_col = "Happiness Score", "Generosity"

column_names = list(full_data.columns)
first_col_idx, last_col_idx = (
    column_names.index(label) for label in (first_col, last_col)
)
column_names[first_col_idx] = last_col
column_names[last_col_idx] = first_col

# reindex returns a new frame with the columns in the swapped order.
full_data = full_data.reindex(columns=column_names)

In [26]:
# Separate the predictors from the target. "Country" is a label column, not a feature.
X = full_data.drop(columns=["Happiness Score", "Country"])
y = full_data["Happiness Score"]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaled feature matrices used by the later model cells. They were referenced
# there but defined nowhere, so the notebook failed on a fresh kernel.
# The scaler is fitted on the training rows only, so nothing from the test set
# leaks into the transformation.
# NOTE(review): the original scaling step is missing from the notebook;
# RobustScaler is used because it is the only scaler imported - confirm.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline: ElasticNet with default hyperparameters on the unscaled features.
enet = ElasticNet()
enet.fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred = enet.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)


Mean Squared Error (MSE): 1.2328206808209534


In [27]:
# Candidate ElasticNet settings; the grid search tries every combination.
param_grid = {
    'alpha': [0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    'max_iter': [100, 500, 1000, 10000],
}

# 5-fold cross-validated search over the grid, scored by negative MSE.
elasticnet_cv = GridSearchCV(
    estimator=enet,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
elasticnet_cv.fit(X_train_scaled, y_train)

# Report the winning combination.
best_params = elasticnet_cv.best_params_
print("Best Hyperparameters:", best_params)

# Score the selected model on the held-out rows.
y_pred = elasticnet_cv.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("MSE =",mse)


Best Hyperparameters: {'alpha': 0.01, 'l1_ratio': 0.1, 'max_iter': 100}
MSE = 0.307748331613569


In [28]:
# Refit a standalone ElasticNet with the hyperparameters chosen by the grid
# search. They are read from `elasticnet_cv.best_params_` rather than copied by
# hand from its printed output, so this model cannot go stale when the data or
# the grid changes.
elasticNet_model = ElasticNet(**elasticnet_cv.best_params_)
elasticNet_model.fit(X_train_scaled, y_train)

ElasticNet(alpha=0.01, l1_ratio=0.1, max_iter=100)

In [29]:
# Ordinary least-squares fit on the scaled training features (fit returns the
# estimator itself, so construction and training are chained).
linear_regression = LinearRegression().fit(X_train_scaled, y_train)

# Predict the held-out rows and score them with mean squared error.
y_pred = linear_regression.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error (MSE) For Linear Regression:", mse)

Mean Squared Error (MSE) For Linear Regression: 0.3058543499752488


In [30]:
# Ridge regression with a fixed penalty of 1.0.
ridge_model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

# Lasso regression with a fixed penalty of 1.0.
lasso_model = Lasso(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# Report the held-out error of both models.
print("Mean Squared Error (MSE) - Ridge Regression:", mse_ridge)
print("Mean Squared Error (MSE) - Lasso Regression:", mse_lasso)

Mean Squared Error (MSE) - Ridge Regression: 0.30608058725221804
Mean Squared Error (MSE) - Lasso Regression: 1.2328206808209534


In [31]:
# Candidate ridge penalties 0.001, 0.002, ..., 10.0 (10,000 values).
# Each value is computed from an integer step, so no floating-point error
# accumulates. Repeatedly adding 0.001 drifts, e.g. to 2.858999999999796
# instead of 2.859.
listofalphas = [step / 1000 for step in range(1, 10001)]
    
# Ridge: let RidgeCV pick the penalty from the candidate list, then score the
# chosen model on the held-out rows.
ridge_model_cv = RidgeCV(alphas=listofalphas).fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model_cv.predict(X_test_scaled)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print("Best alpha for Ridge regression:", ridge_model_cv.alpha_)
print("Mean Squared Error (MSE) - Ridge Regression:", mse_ridge)

# Lasso: same procedure over a coarse, logarithmically spaced set of penalties.
lasso_candidate_alphas = [0.001, 0.01, 0.1, 1, 10]
lasso_model_cv = LassoCV(alphas=lasso_candidate_alphas).fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model_cv.predict(X_test_scaled)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print("Best alpha for Lasso regression:", lasso_model_cv.alpha_)
print("Mean Squared Error (MSE) - Lasso Regression:", mse_lasso)

Best alpha for Ridge regression: 2.858999999999796
Mean Squared Error (MSE) - Ridge Regression: 0.306506332499195
Best alpha for Lasso regression: 0.001
Mean Squared Error (MSE) - Lasso Regression: 0.3059685409534556


In [32]:
# Display the fitted ElasticNet coefficients (one value per feature column).
elasticNet_model.coef_

array([0.07622539, 0.78398298, 0.46622185, 0.39683408, 0.05798746])

In [33]:
# Remove the label column. Rebinding with errors="ignore", instead of an
# in-place drop, keeps this cell re-runnable once "Country" is already gone.
full_data = full_data.drop(columns=["Country"], errors="ignore")


def rank_coefficients(model, feature_names):
    """Rank a fitted linear model's features by absolute coefficient size.

    Parameters
    ----------
    model : fitted estimator
        Must expose a one-dimensional ``coef_`` array.
    feature_names : pandas.Index
        Names of the feature columns, in the order the model was fitted on.

    Returns
    -------
    tuple
        ``(indices, magnitudes, names)``, each ordered from the largest to the
        smallest absolute coefficient.
    """
    magnitudes = np.abs(model.coef_)
    indices = np.argsort(magnitudes)[::-1]
    return indices, magnitudes[indices], feature_names[indices]


# Take the names from the design matrix itself. `full_data.columns` also holds
# the target column and only lined up with the coefficients by coincidence of
# column order.
feature_names = X.columns

sorted_linear_indices, sorted_linear_magnitudes, sorted_linear_variable_names = (
    rank_coefficients(linear_regression, feature_names)
)
sorted_ridge_indices, sorted_ridge_magnitudes, sorted_ridge_variable_names = (
    rank_coefficients(ridge_model_cv, feature_names)
)
sorted_lasso_indices, sorted_lasso_magnitudes, sorted_lasso_variable_names = (
    rank_coefficients(lasso_model_cv, feature_names)
)
sorted_elasticnet_indices, sorted_elasticnet_magnitudes, sorted_elasticnet_variable_names = (
    rank_coefficients(elasticNet_model, feature_names)
)

# Print the importance ranking for each model; a blank line separates the sections.
importance_report = [
    ("Linear Regression", sorted_linear_variable_names, sorted_linear_magnitudes),
    ("Ridge Regression", sorted_ridge_variable_names, sorted_ridge_magnitudes),
    ("Lasso Regression", sorted_lasso_variable_names, sorted_lasso_magnitudes),
    ("ElasticNet Regression", sorted_elasticnet_variable_names, sorted_elasticnet_magnitudes),
]
for section, (model_label, names, magnitudes) in enumerate(importance_report):
    header = f"Variable Importance - {model_label}:"
    print(header if section == 0 else "\n" + header)
    for i, (name, magnitude) in enumerate(zip(names, magnitudes), start=1):
        print(f"{i}. {name}: {magnitude}")

Variable Importance - Linear Regression:
1. Economy (GDP per Capita): 0.8102950755853906
2. Health (Life Expectancy): 0.459164589610854
3. Freedom: 0.4023890814956888
4. Generosity: 0.08046939250608638
5. Trust (Government Corruption): 0.052449715638837824

Variable Importance - Ridge Regression:
1. Economy (GDP per Capita): 0.8004715921498305
2. Health (Life Expectancy): 0.46245006333700744
3. Freedom: 0.40067684128492664
4. Generosity: 0.07932581081897815
5. Trust (Government Corruption): 0.05454104415322549

Variable Importance - Lasso Regression:
1. Economy (GDP per Capita): 0.8088001497074949
2. Health (Life Expectancy): 0.4583600223985168
3. Freedom: 0.4013031972367628
4. Generosity: 0.07912029485616824
5. Trust (Government Corruption): 0.052621809681851824

Variable Importance - ElasticNet Regression:
1. Economy (GDP per Capita): 0.7839829821529005
2. Health (Life Expectancy): 0.4662218474586902
3. Freedom: 0.3968340773524773
4. Generosity: 0.07622538677102672
5. Trust (Governme

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
full_data.describe()

Unnamed: 0,Generosity,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Trust (Government Corruption),Happiness Score
count,1047.0,1047.0,1047.0,1047.0,1047.0,1047.0
mean,0.210441,0.926686,0.617915,0.437448,0.129309,5.441526
std,0.117279,0.395947,0.241831,0.147077,0.108314,1.113597
min,0.00199,0.0153,0.005565,0.00589,0.00083,2.8166
25%,0.12394,0.6323,0.443,0.34743,0.056669,4.5835
50%,0.197,0.986,0.6458,0.452764,0.094596,5.43
75%,0.266854,1.244305,0.801,0.553509,0.161968,6.2563
max,0.838075,1.870766,1.141,0.724,0.55191,7.842
