In [518]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score

In [519]:
def _load_pickled_frame(year):
    """Return the object pickled at ``dataframes/df<year>``.

    The ``with`` block closes the file on exit, so no explicit ``close()``
    call is needed afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file;
    only load pickles that were produced by a trusted step of this project.
    """
    with open('dataframes/df{}'.format(year), 'rb') as handle:
        return pickle.load(handle)


# One frame per survey year; later cells index these like pandas DataFrames.
df2015 = _load_pickled_frame(2015)
df2016 = _load_pickled_frame(2016)
df2017 = _load_pickled_frame(2017)
df2018 = _load_pickled_frame(2018)
df2019 = _load_pickled_frame(2019)
df2020 = _load_pickled_frame(2020)

# Linear Regression
- The data in this set lends itself to linear regression since the happiness score is positively correlated with all its features as seen in the EDA steps. 
- GDP is the most highly correlated feature. 
- Since there are only 7 features, it's likely that we won't need to perform PCA on the dataset since there is no worthy feature to remove. These have all been engineered to be linearly correlated with our target (happiness). 
- Here, we will explore different ways regression can be applied to this dataset to make predictions.

### Question 1: What is the happiness score of each country for 2020 given the 2020 features?
- We train on MOST countries (80%) and predict the score for the held-out REST (20%)

In [520]:
# Question 1 data: the 2020 happiness score is the target; every remaining
# column after the first one (minus 'Score' itself) is a candidate feature.
Y = df2020['Score']  # target
X = df2020.iloc[:, 1:].drop(['Score'], axis='columns')  # features
# 80/20 split with a fixed seed so the split, and therefore the metrics
# printed for this question, are the same on every Restart & Run All.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=.2, random_state=42
)
# print(train_x.shape)
# print(test_y.shape) # ok!

In [521]:
# Standardise the numeric features. The first remaining column is skipped
# (presumably the country name — confirm against df2020's layout).
# The scaler learns its mean/std from the training split only and reuses
# them for the test split.
scaler = StandardScaler()
train_features = train_x.iloc[:, 1:]
test_features = test_x.iloc[:, 1:]
scaler.fit(train_features)
train_x = scaler.transform(train_features)
test_x = scaler.transform(test_features)

In [522]:
# Fit ordinary least squares on the scaled training split (country column
# already removed), then predict the held-out split.
lr_reg = LinearRegression().fit(train_x, train_y)
hyp = lr_reg.predict(test_x)

In [523]:
# Score the held-out predictions: residuals, R^2, explained variance and MSE.
res = test_y - hyp
r2, var, mse = (
    metric(test_y, hyp)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...          \t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...          	 0.9677902169395654
Variance explained...	 0.9679045293192847
Mean squared error...	 0.024234444966055584


### Question 2: What is the happiness score of each country for 2020 given the data from past years (2015-2019)?
- This is kind of a hot mess
- See how Ariana does this?

In [524]:
# Question 2 data: target and features taken from `df_feats`.
# NOTE(review): `df_feats` is not assigned anywhere in the visible part of this
# notebook; presumably it is a combined 2015-2019 feature table — confirm it is
# defined before this cell, otherwise a fresh Restart & Run All stops here with
# a NameError.
Y = df_feats['Score'] # target 
X = df_feats.iloc[:, 1:].drop(['Score'],axis='columns') # features 
# 80/20 split; no random_state is passed, so the split differs on every run.
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=.2)
# print(train_x.shape)
# print(test_y.shape) # ok! 

In [525]:
# Disabled Question 2 experiment, kept for reference.
# TODO(review): if this is re-enabled, the last line must call
# scaler.transform(...) — scaler.fit(...) would refit on the test data and
# return the scaler object instead of the scaled test features.
# scaler = StandardScaler()
# train_x = scaler.fit_transform(train_x.iloc[:, 1:])
# test_x = scaler.fit(test_x.iloc[:, 1:])

In [526]:
# Disabled Question 2 experiment, kept for reference.
# TODO(review): `lr` passed to GridSearchCV is not assigned in any cell above
# this one, and param_grid={} searches nothing; fix both before re-enabling.
# lr_reg = LinearRegression()
# lr_reg = GridSearchCV(lr, param_grid={}, n_jobs = 5)
# lr_reg.fit(train_x.iloc[:, 2:], train_y) # keep the countries # index without it 
# hyp = lr_reg.predict(test_x.iloc[:, 2:])

In [527]:
# Disabled Question 2 experiment, kept for reference.
# TODO(review): the first line would rebind the imported `r2_score` function to
# a float, breaking every later r2_score(...) call; assign to `r2` instead if
# this is re-enabled.
# r2_score = r2_score(test_y, hyp)
# var = explained_variance_score(test_y, hyp)
# mse = mean_squared_error(test_y, hyp)
# print('R2 score...\t', r2_score)
# print('Variance explained...\t', var)
# print('Mean squared error...\t', mse)

### Question 3: Given that we predict each feature for 2020, can we obtain a valid happiness score?
- We will predict the GDP for 2020 given 2015-2019, Health Life Expectancy for 2020 given 2015-2019, etc. 
- Once those features are predicted, we will predict the happiness score! 

### Preliminary preprocessing
- Fill na
- Encode countries

In [528]:
# put all features (2015-2019) together
df_all = pd.concat([df2015, df2016, df2017, df2018, df2019])
df_all[df_all['Perceptions of corruption'].isna()] # one null value 
mean = df_all[df_all['Country or region'] == 'United Arab Emirates']['Perceptions of corruption'].mean() # get avg 
df_all = df_all.fillna(mean) # fill na with mean

In [529]:
# one hot encode countries
# Encoders intended for the country column.
# NOTE(review): neither object is fitted or applied in the visible cells.
ohe, le = OneHotEncoder(), LabelEncoder()

In [530]:
# Display the combined 2015-2019 frame (pandas truncates the middle rows in
# the rendered output).
df_all

Unnamed: 0,year,Country or region,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom,Perceptions of corruption,Generosity
0,2015,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678
1,2015,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630
2,2015,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139
3,2015,Norway,4,7.522,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699
4,2015,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811
...,...,...,...,...,...,...,...,...,...,...
149,2019,Malawi,150,3.410,0.19100,0.56000,0.49500,0.44300,0.08900,0.21800
150,2019,Yemen,151,3.380,0.28700,1.16300,0.46300,0.14300,0.07700,0.10800
151,2019,Rwanda,152,3.334,0.35900,0.71100,0.61400,0.55500,0.41100,0.21700
152,2019,Tanzania,153,3.231,0.47600,0.88500,0.49900,0.41700,0.14700,0.27600


### GDP

In [531]:
# Predict 2020 'GDP per capita' from the other five happiness factors.
# Columns are selected by name rather than by position, so the 2015-2019
# training frame and the 2020 frame always feed the same features, in the
# same order, to the scaler and the model even if their layouts differ.
target = 'GDP per capita'
predictors = ['Social support', 'Healthy life expectancy', 'Freedom',
              'Perceptions of corruption', 'Generosity']
train_y = df_all[target].to_numpy()
train_x = df_all[predictors]
test_y = df2020[target].to_numpy()
test_x = df2020[predictors]
# Scaling statistics come from 2015-2019 only and are reused for 2020.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [532]:
# Ordinary least squares: learn GDP per capita from the scaled 2015-2019
# features, then predict it for the scaled 2020 rows.
lr = LinearRegression().fit(train_x, train_y)
hyp_gdp = lr.predict(test_x)

In [533]:
# Compare the predicted 2020 GDP per capita with the reported 2020 values.
r2, var, mse = (
    metric(test_y, hyp_gdp)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...\t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...	 0.3945308607083984
Variance explained...	 0.7075677218115671
Mean squared error...	 0.07888287353495015


### Healthy Life Expectancy

In [534]:
# Predict 2020 'Healthy life expectancy' from the other five happiness factors.
# Columns are selected by name rather than by position, so the 2015-2019
# training frame and the 2020 frame always feed the same features, in the
# same order, to the scaler and the model even if their layouts differ.
target = 'Healthy life expectancy'
predictors = ['GDP per capita', 'Social support', 'Freedom',
              'Perceptions of corruption', 'Generosity']
train_y = df_all[target].to_numpy()
train_x = df_all[predictors]
test_y = df2020[target].to_numpy()
test_x = df2020[predictors]
# Scaling statistics come from 2015-2019 only and are reused for 2020.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [535]:
# Ordinary least squares: learn healthy life expectancy from the scaled
# 2015-2019 features, then predict it for the scaled 2020 rows.
lr = LinearRegression().fit(train_x, train_y)
hyp_hle = lr.predict(test_x)

In [536]:
# Compare the predicted 2020 healthy life expectancy with the reported values.
r2, var, mse = (
    metric(test_y, hyp_hle)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...\t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...	 0.6010243371199526
Variance explained...	 0.7332283890756466
Mean squared error...	 0.022183520464343288


### Social support

In [537]:
# Predict 2020 'Social support' from the other five happiness factors.
# Columns are selected by name rather than by position, so the 2015-2019
# training frame and the 2020 frame always feed the same features, in the
# same order, to the scaler and the model even if their layouts differ.
target = 'Social support'
predictors = ['GDP per capita', 'Healthy life expectancy', 'Freedom',
              'Perceptions of corruption', 'Generosity']
train_y = df_all[target].to_numpy()
train_x = df_all[predictors]
test_y = df2020[target].to_numpy()
test_x = df2020[predictors]
# Scaling statistics come from 2015-2019 only and are reused for 2020.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [538]:
# Ordinary least squares: learn social support from the scaled 2015-2019
# features, then predict it for the scaled 2020 rows.
lr = LinearRegression().fit(train_x, train_y)
hyp_ss = lr.predict(test_x)

In [539]:
# Compare the predicted 2020 social support with the reported values.
r2, var, mse = (
    metric(test_y, hyp_ss)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...\t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...	 0.571364047938336
Variance explained...	 0.5909787714362185
Mean squared error...	 0.031910509966876936


### Freedom to make life choices

In [540]:
# Predict 2020 'Freedom' from the other five happiness factors.
# Columns are selected by name rather than by position, so the 2015-2019
# training frame and the 2020 frame always feed the same features, in the
# same order, to the scaler and the model even if their layouts differ.
target = 'Freedom'
predictors = ['GDP per capita', 'Social support', 'Healthy life expectancy',
              'Perceptions of corruption', 'Generosity']
train_y = df_all[target].to_numpy()
train_x = df_all[predictors]
test_y = df2020[target].to_numpy()
test_x = df2020[predictors]
# Scaling statistics come from 2015-2019 only and are reused for 2020.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [541]:
# Ordinary least squares: learn freedom from the scaled 2015-2019 features,
# then predict it for the scaled 2020 rows.
lr = LinearRegression().fit(train_x, train_y)
hyp_fre = lr.predict(test_x)

In [542]:
# Compare the predicted 2020 freedom values with the reported values.
r2, var, mse = (
    metric(test_y, hyp_fre)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...\t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...	 0.3020329165732277
Variance explained...	 0.34587174347434957
Mean squared error...	 0.013600444882096191


### Perceptions of corruption

In [543]:
# Predict 2020 'Perceptions of corruption' from the other five happiness factors.
# Columns are selected by name rather than by position, so the 2015-2019
# training frame and the 2020 frame always feed the same features, in the
# same order, to the scaler and the model even if their layouts differ.
target = 'Perceptions of corruption'
predictors = ['GDP per capita', 'Social support', 'Healthy life expectancy',
              'Freedom', 'Generosity']
train_y = df_all[target].to_numpy()
train_x = df_all[predictors]
test_y = df2020[target].to_numpy()
test_x = df2020[predictors]
# Scaling statistics come from 2015-2019 only and are reused for 2020.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [544]:
# Ordinary least squares: learn perceptions of corruption from the scaled
# 2015-2019 features, then predict it for the scaled 2020 rows.
lr = LinearRegression().fit(train_x, train_y)
hyp_per = lr.predict(test_x)

In [545]:
# Compare the predicted 2020 perceptions of corruption with the reported values.
r2, var, mse = (
    metric(test_y, hyp_per)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...\t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...	 0.2765555424876336
Variance explained...	 0.28050303180713676
Mean squared error...	 0.009715280050987927


### Generosity

In [546]:
# Predict 2020 'Generosity' from the other five happiness factors.
# Columns are selected by name rather than by position, so the 2015-2019
# training frame and the 2020 frame always feed the same features, in the
# same order, to the scaler and the model even if their layouts differ.
target = 'Generosity'
predictors = ['GDP per capita', 'Social support', 'Healthy life expectancy',
              'Freedom', 'Perceptions of corruption']
train_y = df_all[target].to_numpy()
train_x = df_all[predictors]
test_y = df2020[target].to_numpy()
test_x = df2020[predictors]
# Scaling statistics come from 2015-2019 only and are reused for 2020.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [547]:
# Ordinary least squares: learn generosity from the scaled 2015-2019 features,
# then predict it for the scaled 2020 rows.
lr = LinearRegression().fit(train_x, train_y)
hyp_gen = lr.predict(test_x)

In [548]:
# Compare the predicted 2020 generosity with the reported values.
r2, var, mse = (
    metric(test_y, hyp_gen)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...\t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...	 -0.008795808338402544
Variance explained...	 0.16812774065605385
Mean squared error...	 0.010294877011567371


## Predicting happiness score/rank

In [549]:
# Collect the six per-feature 2020 predictions into one frame, one column per
# feature; these predicted values become the inputs for the score model.
data = dict(zip(
    ['GDP per capita', 'Social support', 'Healthy life expectancy',
     'Freedom', 'Perceptions of corruption', 'Generosity'],
    [hyp_gdp, hyp_ss, hyp_hle, hyp_fre, hyp_per, hyp_gen],
))
df_hyp = pd.DataFrame(data)

In [550]:
# Features are the predicted 2020 factor values; the target is the reported
# 2020 happiness score.
X = df_hyp.to_numpy()
Y = df2020['Score']
# 80/20 split with a fixed seed so the reported metrics are reproducible
# across runs instead of changing with every random split.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=.2, random_state=42
)

In [551]:
# Fit ordinary least squares from the predicted factors to the happiness
# score, then predict the held-out countries.
lr = LinearRegression().fit(train_x, train_y)
hyp = lr.predict(test_x)

In [552]:
# Evaluate the happiness-score predictions on the held-out countries.
r2, var, mse = (
    metric(test_y, hyp)
    for metric in (r2_score, explained_variance_score, mean_squared_error)
)
for caption, value in zip(
    ('R2 score...\t', 'Variance explained...\t', 'Mean squared error...\t'),
    (r2, var, mse),
):
    print(caption, value)

R2 score...	 0.7396240576013514
Variance explained...	 0.7478980210683996
Mean squared error...	 0.28719154667616514


## Viewing Results

# References
- [Happiness Data Analysis 2015-2019](https://www.kaggle.com/kojisera/happiness-data-analysis-2015-2019) - See top happy 30 countries scatter plot for 2015-2019