In [185]:
import pandas as pd
import statsmodels.api as sm

In [172]:
suicide = pd.read_csv("src/suicide.csv", usecols=("country", "year", "suicides_no", "population"))
suicide = suicide[(suicide["year"] == 2015) | (suicide["year"] == 2016)]
country_change = {"Russian Federation": "Russia", "Republic of Korea": "South Korea"}
suicide.loc[:, "country"] = suicide["country"].apply(
    lambda x: x if x not in country_change else country_change[x]
)
suicide = suicide.groupby(["country", "year"]).apply(
    lambda s: s["suicides_no"].sum()/s["population"].sum()
).to_frame().rename(columns={0: "suicides"})

In [173]:
suicide

Unnamed: 0_level_0,Unnamed: 1_level_0,suicides
country,year,Unnamed: 2_level_1
Antigua and Barbuda,2015,0.000011
Argentina,2015,0.000077
Armenia,2015,0.000026
Armenia,2016,0.000028
Australia,2015,0.000136
...,...,...
Turkmenistan,2015,0.000027
Ukraine,2015,0.000188
United Kingdom,2015,0.000080
United States,2015,0.000147


In [174]:
def wh_read(year: int):
    wh_year = pd.read_csv(f"src/whr_{year}.csv", usecols=("Country", "Happiness Score"))
    wh_year.rename(columns={"Country": "country"}, inplace=True)
    wh_year.loc[:, "year"] = year
    wh_year.set_index(["country", "year"], inplace=True)
    
    # Normalize happiness score by year
    wh_year.loc[:, "Happiness Score"] = wh_year["Happiness Score"].apply(
        lambda x: x/10
    )
#     h_max = wh_year["Happiness Score"].max()
#     h_min = wh_year["Happiness Score"].min()
#     h_diff = h_max - h_min
    
#     wh_year.loc[:, "Happiness Score"] = wh_year["Happiness Score"].apply(
#         lambda x: (x - h_min)/ (h_diff)
#     )
    
    return wh_year

In [175]:
wh = pd.concat([wh_read(2015), wh_read(2016)])
wh

Unnamed: 0_level_0,Unnamed: 1_level_0,Happiness Score
country,year,Unnamed: 2_level_1
Switzerland,2015,0.7587
Iceland,2015,0.7561
Denmark,2015,0.7527
Norway,2015,0.7522
Canada,2015,0.7427
...,...,...
Benin,2016,0.3484
Afghanistan,2016,0.3360
Togo,2016,0.3303
Syria,2016,0.3069


In [179]:
# for country in suicide.groupby(level=[0]).first().index.unique():
#     try:
#         x = wh.loc[country]
#     except KeyError:
#         print(country, "not found")

Antigua and Barbuda not found
Cuba not found
Grenada not found
Saint Vincent and Grenadines not found
Seychelles not found


In [184]:
df = pd.merge(suicide, wh, left_index=True, right_index=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,suicides,Happiness Score
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,2015,0.000077,0.6574
Armenia,2015,0.000026,0.4350
Armenia,2016,0.000028,0.4360
Australia,2015,0.000136,0.7284
Austria,2015,0.000152,0.7200
...,...,...,...
Turkmenistan,2015,0.000027,0.5548
Ukraine,2015,0.000188,0.4681
United Kingdom,2015,0.000080,0.6867
United States,2015,0.000147,0.7119


In [186]:
Y = df["suicides"]
X = df["Happiness Score"]
model = sm.OLS(Y, X)
result = model.fit()
print(result.pvalues)
result.summary()

Happiness Score    1.727355e-21
dtype: float64


0,1,2,3
Dep. Variable:,suicides,R-squared (uncentered):,0.733
Model:,OLS,Adj. R-squared (uncentered):,0.73
Method:,Least Squares,F-statistic:,189.8
Date:,"Thu, 06 May 2021",Prob (F-statistic):,1.73e-21
Time:,11:58:38,Log-Likelihood:,568.9
No. Observations:,70,AIC:,-1136.0
Df Residuals:,69,BIC:,-1134.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Happiness Score,0.0002,1.38e-05,13.778,0.000,0.000,0.000

0,1,2,3
Omnibus:,12.456,Durbin-Watson:,1.719
Prob(Omnibus):,0.002,Jarque-Bera (JB):,13.013
Skew:,0.984,Prob(JB):,0.00149
Kurtosis:,3.767,Cond. No.,1.0


In [189]:
results.params

Happiness Score    0.00019
dtype: float64