In [62]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from pathlib import Path

np.random.seed(42)

plt.rc('font', size=12)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=12)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

pd.set_option('display.max_columns', None)

In [63]:
datapath = Path() / "data" / "lifesat"

oced_bli = pd.read_csv(datapath / "oecd_bli.csv")
gdp_er_capita = pd.read_csv(datapath / "gdp_per_capita.csv")

In [64]:
gdp_er_capita_2020=gdp_er_capita[gdp_er_capita["Year"]==2020]
gdp_er_capita_2020=gdp_er_capita.rename(columns={"GDP per capita, PPP (constant 2017 international $)": "GDP per capita (USD)"})
gdp_er_capita_2020

Unnamed: 0,Entity,Code,Year,GDP per capita (USD)
0,Afghanistan,AFG,2002,1189.784668
1,Afghanistan,AFG,2003,1235.810063
2,Afghanistan,AFG,2004,1200.278013
3,Afghanistan,AFG,2005,1286.793659
4,Afghanistan,AFG,2006,1315.789117
...,...,...,...,...
7104,Zimbabwe,ZWE,2016,3173.610829
7105,Zimbabwe,ZWE,2017,3274.611198
7106,Zimbabwe,ZWE,2018,3341.665418
7107,Zimbabwe,ZWE,2019,3027.656038


In [65]:
oced_bli["Indicator"].value_counts()
oced_bli = oced_bli[oced_bli["INEQUALITY"]=="TOT"]
oced_bli = oced_bli.pivot(
    index = "Country",
    columns = "Indicator",
    values = "Value"
)
oced_bli=oced_bli.reset_index()

In [66]:
gdp_er_capita_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7109 entries, 0 to 7108
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entity                7109 non-null   object 
 1   Code                  5729 non-null   object 
 2   Year                  7109 non-null   int64  
 3   GDP per capita (USD)  7109 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 222.3+ KB


In [67]:
full_country_stats = oced_bli.merge(
    gdp_er_capita_2020,
    left_on = "Country",
    right_on = "Entity",
    how = "inner"
)

In [68]:
full_country_stats = full_country_stats[["Country", "Life satisfaction", "GDP per capita (USD)"]]
full_country_stats = full_country_stats.sort_values(by = "GDP per capita (USD)")

In [69]:
full_country_stats.to_csv(datapath / "lifesat_full.csv", index=False)

In [73]:
min_gdp = 23_500
max_gdp = 62_500

country_stats = full_country_stats[
    (full_country_stats["GDP per capita (USD)"] >= min_gdp)
    & (full_country_stats["GDP per capita (USD)"] <= max_gdp)
]
country_stats.to_csv(datapath / "lifesat.cvs", index=False)