In [1]:
import os
import pandas as pd

def preprocess_header(df):
    """Normalise the column labels of ``df`` to lower snake_case, in place.

    Every space in a column label becomes an underscore and the label is
    lower-cased. The frame that was passed in is modified and then returned,
    so the call can be written as ``df = preprocess_header(df)``.
    """
    snake_case_labels = df.columns.str.replace(" ", "_").str.lower()
    df.columns = snake_case_labels
    return df

def fix_happiness_index(df):
    """Replace spaces in the column labels of ``df`` with underscores, in place.

    Unlike ``preprocess_header`` this does not lower-case the labels.

    Returns:
        The same frame that was passed in, so the call can be used as
        ``df = fix_happiness_index(df)`` without replacing ``df`` by ``None``.
    """
    df.columns = df.columns.str.replace(" ", "_")
    return df


# Load each source table and normalise its header in one step.
energy_df = preprocess_header(pd.read_csv("Energy Consumption.csv"))

happiness_df = preprocess_header(pd.read_csv("Happiness Index.csv"))
# The score column is handled as text: commas are swapped for dots before the
# cast to float (presumably the file uses a decimal comma — verify against the CSV).
happiness_score_text = happiness_df["happiness_score"].str.replace(",", ".")
happiness_df["happiness_score"] = happiness_score_text.astype(float)

# Alternative source file, currently not used: "Air Pollutants.csv"
air_pollution_df = preprocess_header(pd.read_csv("Air Quality.csv"))

gdp_df = preprocess_header(pd.read_csv("GDP.csv"))

unemployed_rate_df = preprocess_header(pd.read_csv("Unemployed Rate.csv"))

cpi_df = preprocess_header(pd.read_csv("CPI.csv"))

hdi_df = preprocess_header(pd.read_csv("Human Development Index.csv"))

# Keep only energy / happiness rows whose year does not exceed the latest year
# present in the air-quality table.
max_year = air_pollution_df["year"].max()
energy_df_filtered = energy_df.loc[energy_df["year"] <= max_year]
happiness_df_filtered = happiness_df.loc[happiness_df["year"] <= max_year]


In [2]:
# A bare expression that is not the last line of a cell is evaluated and then
# discarded, so the list of years is printed explicitly.
print(energy_df_filtered["year"].unique())
energy_df_filtered.head()

Unnamed: 0,country,year,biofuel_consumption,coal_consumption,fossil_fuel_consumption,gas_consumption,hydro_consumption,low_carbon_consumption,nuclear_consumption,oil_consumption,other_renewable_consumption,renewables_consumption,solar_consumption,wind_consumption,total_consumption
0,Australia,2015,2.387,540.411,1500.658,389.95,37.935,100.613,0.0,570.297,11.741,100.613,16.684,31.866,3303.155
1,Bangladesh,2015,0.0,30.633,381.858,258.541,2.419,3.16,0.0,92.684,0.011,3.16,0.716,0.014,773.196
2,Brazil,2015,221.571,204.981,2070.909,429.365,968.777,1446.048,38.075,1436.562,159.228,1407.973,0.159,58.237,8441.885
3,Canada,2015,20.805,214.41,2595.714,1103.324,1029.236,1423.599,261.261,1277.979,31.881,1162.338,7.796,72.62,9200.963
4,China,2015,27.299,22329.041,30885.832,1946.895,3001.359,4250.154,442.87,6609.896,172.514,3807.284,106.32,499.793,74079.257


In [3]:
# A bare expression that is not the last line of a cell is evaluated and then
# discarded, so the list of years is printed explicitly.
print(happiness_df_filtered["year"].unique())
happiness_df_filtered.head()

Unnamed: 0,country,happiness_score,year
0,Canada,7.427,2015
1,Netherlands,7.378,2015
2,Australia,7.284,2015
3,Mexico,7.187,2015
4,United States,7.119,2015


In [4]:
def calculate_aqi(concentration, breakpoints):
    """Convert a concentration to an AQI value by piecewise-linear interpolation.

    Args:
        concentration: Measured concentration. ``None`` and NaN yield ``None``.
        breakpoints: Sequence of bands. Each band is indexable, with ``[0]`` and
            ``[1]`` the inclusive lower/upper concentration bounds and ``[2]``
            and ``[3]`` the AQI values at those bounds.

    Returns:
        The interpolated AQI as a float, or ``None`` when the concentration is
        missing, below the lowest band or above the highest band.

    A concentration that falls strictly between the upper bound of one band and
    the lower bound of the next (e.g. 9.05 with bands ending at 9 and starting
    at 9.1) is assigned the AQI at the upper bound of the lower band. This
    mirrors truncating the reading to the table's precision and keeps such
    readings from silently becoming missing values.
    """
    if concentration is None:
        return None

    # First pass: exact band match, evaluated in the caller's band order.
    for band in breakpoints:
        c_low, c_high = band[0], band[1]
        if c_low <= concentration <= c_high:
            aqi_low, aqi_high = band[2], band[3]
            if c_high == c_low:
                # Zero-width band: nothing to interpolate (avoids division by zero).
                return float(aqi_low)
            return ((aqi_high - aqi_low) / (c_high - c_low)) * (concentration - c_low) + aqi_low

    # Second pass: the value may sit in the gap between two adjacent bands.
    ordered = sorted(breakpoints, key=lambda band: band[0])
    for lower, upper in zip(ordered, ordered[1:]):
        if lower[1] < concentration < upper[0]:
            return float(lower[3])

    # NaN, or outside the range covered by the table.
    return None

def aqi_category(aqi, breakpoints):
    """Return the category label of the band whose AQI range contains ``aqi``.

    Args:
        aqi: AQI value. ``None`` and NaN yield ``None``.
        breakpoints: Sequence of bands. Each band is indexable, with ``[2]`` and
            ``[3]`` the inclusive lower/upper AQI bounds and ``[4]`` the label.

    Returns:
        The label of the first matching band, or ``None`` when ``aqi`` is
        missing or no band contains it.
    """
    # calculate_aqi signals "no value" with None; comparing None to a number
    # would raise TypeError, so treat it as "no category".
    if aqi is None:
        return None

    for band in breakpoints:
        if band[2] <= aqi <= band[3]:
            return band[4]
    return None

# PM2.5 breakpoint table. Each row is
# (concentration_low, concentration_high, aqi_low, aqi_high, category_label),
# which is the layout calculate_aqi and aqi_category index into.
pm25_breakpoints = [
    (0, 9, 0, 50, "GOOD"),
    (9.1, 35.4, 51, 100, "MODERATE"),
    (35.5, 55.4, 101, 150, "UNHEALTHY FOR SENSITIVE"),
    (55.5, 125.4, 151, 200, "UNHEALTHY"),
    (125.5, 225.4, 201, 300, "VERY UNHEALTHY"),
    (225.5, 325.4, 301, 500, "HAZARDOUS"),
    (325.5, 99999.9, 501, 999, "VERY HAZARDOUS"),
]

# Derive the AQI from the average PM2.5 concentration of each row, then map
# that AQI to its category label.
air_pollution_df['aqi_pm25_concentration'] = air_pollution_df['average_pm25_concentration'].apply(
    calculate_aqi, args=(pm25_breakpoints,)
)
air_pollution_df['aqi_pm25_category'] = air_pollution_df['aqi_pm25_concentration'].apply(
    aqi_category, args=(pm25_breakpoints,)
)

In [5]:
# Preview the air-quality table, including the AQI value and category columns.
air_pollution_df.head()

Unnamed: 0,country,pm10_concentration,pm25_concentration,no2_concentration,total_air_pollutants_concentration,average_pm10_concentration,average_pm25_concentration,average_no2_concentration,average_air_pollutants_concentration,year,aqi_pm25_concentration,aqi_pm25_category
0,Australia,530.684,126.687,155.421,812.792,17.118839,4.086677,5.013581,26.219097,2015,22.703763,GOOD
1,Bangladesh,1130.275,703.252,280.955,2114.482,125.586111,78.139111,31.217222,234.942444,2015,166.870049,UNHEALTHY
2,Brazil,1755.658,172.35,745.426,2673.434,30.269966,2.971552,12.852172,46.09369,2015,16.508621,GOOD
3,Canada,329.438,1033.378,1228.758,2591.574,2.139208,6.710247,7.978948,16.828403,2015,37.279149,GOOD
4,China,10768.633,24332.529,3728.533,38829.695,21.932043,49.557086,7.593754,79.082882,2015,135.612924,UNHEALTHY FOR SENSITIVE


In [6]:
# Preview the GDP table.
gdp_df.head()

Unnamed: 0,country,gdp,gdp_per_capita,year
0,Australia,1140000000000.0,47858.459864,2015
1,Bangladesh,552000000000.0,3497.433948,2015
2,Brazil,3020000000000.0,14718.194722,2015
3,Canada,1570000000000.0,43938.059091,2015
4,China,18300000000000.0,13130.370278,2015


In [7]:
# Preview the unemployment-rate table.
unemployed_rate_df.head()

Unnamed: 0,country,unemployed_rate,year
0,United Arab Emirates,1.792,2015
1,Australia,6.055,2015
2,Bangladesh,4.382,2015
3,Brazil,8.538,2015
4,Canada,6.945,2015


In [8]:
# Preview the CPI table.
cpi_df.head()

Unnamed: 0,country,cpi,year
0,United Arab Emirates,4.069966,2015
1,Australia,1.508367,2015
2,Bangladesh,6.19428,2015
3,Brazil,9.029901,2015
4,Canada,1.125241,2015


In [9]:
# Preview the Human Development Index table.
hdi_df.head()

Unnamed: 0,country,human_development_index,year
0,United Arab Emirates,0.86,2015
1,Australia,0.933,2015
2,Bangladesh,0.604,2015
3,Brazil,0.752,2015
4,Canada,0.927,2015


## Task 3

In [10]:
# Columns each source contributes to the Task 3 table.
energy_features_for_task_three = ["country", "year", "total_consumption"]
happiness_features_for_task_three = ["country", "year", "happiness_score"]
air_pollution_features_for_task_three = ["country", "year", "aqi_pm25_concentration", "aqi_pm25_category"]

# Join energy -> happiness -> air quality on (country, year). No `how` is
# given, so pandas' default inner join applies and only pairs present in all
# three sources are kept.
task_three_df = (
    energy_df_filtered[energy_features_for_task_three]
    .merge(happiness_df_filtered[happiness_features_for_task_three], on=["country", "year"])
    .merge(air_pollution_df[air_pollution_features_for_task_three], on=["country", "year"])
)
task_three_df.head()

Unnamed: 0,country,year,total_consumption,happiness_score,aqi_pm25_concentration,aqi_pm25_category
0,Australia,2015,3303.155,7.284,22.703763,GOOD
1,Bangladesh,2015,773.196,4.694,166.870049,UNHEALTHY
2,Brazil,2015,8441.885,6.983,16.508621,GOOD
3,Canada,2015,9200.963,7.427,37.279149,GOOD
4,China,2015,74079.257,5.14,135.612924,UNHEALTHY FOR SENSITIVE


In [11]:
# Count the distinct countries present in the merged Task 3 table.
task_three_df_countries = task_three_df["country"].unique()
print(len(task_three_df_countries))

29


In [12]:
# Check row count, dtypes and non-null counts of the merged Task 3 table.
task_three_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 147 non-null    object 
 1   year                    147 non-null    int64  
 2   total_consumption       147 non-null    float64
 3   happiness_score         147 non-null    float64
 4   aqi_pm25_concentration  145 non-null    float64
 5   aqi_pm25_category       145 non-null    object 
dtypes: float64(3), int64(1), object(2)
memory usage: 7.0+ KB


In [13]:
def scale_series(series):
    """Express every value of ``series`` as a percentage of the series maximum.

    The largest value maps to 100; all other values are scaled proportionally.
    A new series is returned and the input is left unchanged.
    """
    peak = series.max()
    return series.div(peak).mul(100)

# For every float64 column of the Task 3 table, add an "indexed_<column>"
# companion holding the value as a percentage of that column's maximum.
float_columns = task_three_df.select_dtypes(include=['float64']).columns
scaled_df = task_three_df.assign(
    **{f"indexed_{name}": scale_series(task_three_df[name]) for name in float_columns}
)

# Save the scaled table without the positional index.
scaled_df.to_csv("task_three_scaled.csv", index=False)

scaled_df.head()

Unnamed: 0,country,year,total_consumption,happiness_score,aqi_pm25_concentration,aqi_pm25_category,indexed_total_consumption,indexed_happiness_score,indexed_aqi_pm25_concentration
0,Australia,2015,3303.155,7.284,22.703763,GOOD,3.718519,97.275641,12.947259
1,Bangladesh,2015,773.196,4.694,166.870049,UNHEALTHY,0.870424,62.686966,95.160865
2,Brazil,2015,8441.885,6.983,16.508621,GOOD,9.503432,93.255876,9.414359
3,Canada,2015,9200.963,7.427,37.279149,GOOD,10.357962,99.185363,21.259154
4,China,2015,74079.257,5.14,135.612924,UNHEALTHY FOR SENSITIVE,83.394548,68.643162,77.335886


In [14]:
# Inspect the scaled rows of a single country across all years.
scaled_df[scaled_df["country"] == "United States"]

Unnamed: 0,country,year,total_consumption,happiness_score,aqi_pm25_concentration,aqi_pm25_category,indexed_total_consumption,indexed_happiness_score,indexed_aqi_pm25_concentration
23,United States,2015,53417.831,7.119,40.200137,GOOD,60.134997,95.072115,22.924904
47,United States,2016,53585.287,7.104,36.144582,GOOD,60.32351,94.871795,20.612145
69,United States,2017,53992.588,6.993,38.212483,GOOD,60.782028,93.389423,21.791405
92,United States,2018,55937.215,6.886,38.951303,GOOD,62.971187,91.96047,22.212732
116,United States,2019,55632.207,6.892,36.073097,GOOD,62.627825,92.040598,20.571379
133,United States,2020,51835.538,6.9396,37.469787,GOOD,58.353734,92.676282,21.367869
144,United States,2021,54639.668,6.951,38.54976,GOOD,61.510477,92.828526,21.983744


## Task 6

In [16]:
# Columns each source contributes to the Task 6 table.
energy_features_for_task_six = ["country", "year", "total_consumption"]
happiness_features_for_task_six = ["country", "year", "happiness_score"]
air_pollution_features_for_task_six = ["country", "year", "aqi_pm25_concentration", "aqi_pm25_category"]
gdp_df_features_for_task_six = ["country", "year", "gdp_per_capita", "gdp"]

# Start from the energy table and join every other source on (country, year),
# in this order. No `how` is given, so pandas' default inner join applies.
# The unemployment, CPI and HDI tables are merged with all of their columns.
task_six_df = energy_df_filtered[energy_features_for_task_six]
for source_df in (
    happiness_df_filtered[happiness_features_for_task_six],
    air_pollution_df[air_pollution_features_for_task_six],
    gdp_df[gdp_df_features_for_task_six],
    unemployed_rate_df,
    cpi_df,
    hdi_df,
):
    task_six_df = pd.merge(task_six_df, source_df, on=["country", "year"])

task_six_df.head()

Unnamed: 0,country,year,total_consumption,happiness_score,aqi_pm25_concentration,aqi_pm25_category,gdp_per_capita,gdp,unemployed_rate,cpi,human_development_index
0,Australia,2015,3303.155,7.284,22.703763,GOOD,47858.459864,1140000000000.0,6.055,1.508367,0.933
1,Bangladesh,2015,773.196,4.694,166.870049,UNHEALTHY,3497.433948,552000000000.0,4.382,6.19428,0.604
2,Brazil,2015,8441.885,6.983,16.508621,GOOD,14718.194722,3020000000000.0,8.538,9.029901,0.752
3,Canada,2015,9200.963,7.427,37.279149,GOOD,43938.059091,1570000000000.0,6.945,1.125241,0.927
4,China,2015,74079.257,5.14,135.612924,UNHEALTHY FOR SENSITIVE,13130.370278,18300000000000.0,4.65,1.437024,0.741


In [17]:
# Persist the merged Task 6 table without the positional index.
# NOTE(review): the file name says "scaled", but no scaling has been applied to
# task_six_df at this point, and rows with missing values are only dropped in
# the following cell — confirm both are intended.
task_six_df.to_csv("task_six_scaled.csv", index=False)

In [18]:
# Drop every row that has a missing value in any column; the correlation cell
# below passes these columns straight to pearsonr.
task_six_df = task_six_df.dropna()

In [19]:
# Display the Task 6 table after the incomplete rows were dropped.
task_six_df

Unnamed: 0,country,year,total_consumption,happiness_score,aqi_pm25_concentration,aqi_pm25_category,gdp_per_capita,gdp,unemployed_rate,cpi,human_development_index
0,Australia,2015,3303.155,7.284,22.703763,GOOD,47858.459864,1.140000e+12,6.055,1.508367,0.933
1,Bangladesh,2015,773.196,4.694,166.870049,UNHEALTHY,3497.433948,5.520000e+11,4.382,6.194280,0.604
2,Brazil,2015,8441.885,6.983,16.508621,GOOD,14718.194722,3.020000e+12,8.538,9.029901,0.752
3,Canada,2015,9200.963,7.427,37.279149,GOOD,43938.059091,1.570000e+12,6.945,1.125241,0.927
4,China,2015,74079.257,5.140,135.612924,UNHEALTHY FOR SENSITIVE,13130.370278,1.830000e+13,4.650,1.437024,0.741
...,...,...,...,...,...,...,...,...,...,...,...
142,Thailand,2021,2873.876,5.985,72.554127,MODERATE,15362.891611,1.100000e+12,1.215,1.230395,0.797
143,United Arab Emirates,2021,2449.781,6.561,95.733460,MODERATE,77521.457480,7.260000e+11,3.105,-0.013860,0.931
144,United States,2021,54639.668,6.951,38.549760,GOOD,56676.956116,1.910000e+13,5.349,4.697859,0.921
145,Italy,2022,3720.476,6.467,0.000000,GOOD,36248.164556,2.140000e+12,8.069,8.201290,0.906


In [20]:
from scipy.stats import pearsonr
# Variables whose relationship with each sub-factor is examined.
background = ["total_consumption", "happiness_score"]
# Individual indicators correlated against each background variable.
sub_factor = ["aqi_pm25_concentration", "gdp_per_capita", "gdp", "unemployed_rate", "cpi", "human_development_index"]
# Names of the composite factors (not used in this cell).
factor = ["environment", "finance", "living_quality"]

# Pearson correlation of every background variable with every sub-factor,
# rounded to three decimals. The rows are collected first and the frame is
# built once, so the result columns are derived from `sub_factor` and cannot
# drift from it.
# NOTE: pearsonr is given the columns as-is, so task_six_df should already be
# free of missing values when this cell runs.
pearson_rows = []
for bg in background:
    row = {"background": bg}
    for sf in sub_factor:
        corr, _ = pearsonr(task_six_df[bg], task_six_df[sf])
        corr = round(corr, 3)
        row[sf] = corr
        print(f"{bg} and {sf} correlation: {corr:.3f}")
    pearson_rows.append(row)

# One row per background variable, labelled by the "background" index.
pearsonr_df = pd.DataFrame(pearson_rows, columns=["background"] + sub_factor).set_index("background")



total_consumption and aqi_pm25_concentration correlation: 0.201
total_consumption and gdp_per_capita correlation: 0.004
total_consumption and gdp correlation: 0.979
total_consumption and unemployed_rate correlation: -0.088
total_consumption and cpi correlation: -0.070
total_consumption and human_development_index correlation: -0.006
happiness_score and aqi_pm25_concentration correlation: -0.635
happiness_score and gdp_per_capita correlation: 0.751
happiness_score and gdp correlation: -0.116
happiness_score and unemployed_rate correlation: -0.043
happiness_score and cpi correlation: -0.436
happiness_score and human_development_index correlation: 0.797


In [21]:
# Show the correlation table (rows: background variables, columns: sub-factors).
pearsonr_df

Unnamed: 0_level_0,aqi_pm25_concentration,gdp_per_capita,gdp,unemployed_rate,cpi,human_development_index
background,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
total_consumption,0.201,0.004,0.979,-0.088,-0.07,-0.006
happiness_score,-0.635,0.751,-0.116,-0.043,-0.436,0.797


In [35]:
from sklearn.preprocessing import StandardScaler
# Columns backing the single-indicator composite factors. The names must match
# columns of pearsonr_df / task_six_df.
environment = ["aqi_pm25_concentration"]
living_quality = ["human_development_index"]

# "finance" is a composite: its three indicators are standardised and averaged
# per row, and that average is correlated with the background variable.
# NOTE(review): the z-scores are averaged without aligning their direction
# (GDP vs. CPI / unemployment) — confirm this is the intended composite.
for bg in background:
    task_six_df_cp = task_six_df.copy()
    # Total GDP is used for total consumption, GDP per capita for happiness.
    if bg == "total_consumption":
        finance = ["gdp", "cpi", "unemployed_rate"]
    else:
        finance = ["gdp_per_capita", "cpi", "unemployed_rate"]
    scaler = StandardScaler()
    task_six_df_cp[finance] = scaler.fit_transform(task_six_df_cp[finance])
    task_six_df_cp['finance'] = task_six_df_cp[finance].mean(axis=1)
    corr, _ = pearsonr(task_six_df_cp['finance'], task_six_df_cp[bg])
    pearsonr_df.loc[bg, "finance"] = round(corr, 3)

# The remaining two factors each consist of one indicator, so their
# correlations are copied from the column named in the lists above.
pearsonr_df['environment'] = pearsonr_df[environment[0]]
pearsonr_df['living_quality'] = pearsonr_df[living_quality[0]]

In [36]:
# Show the correlation table with the finance / environment / living_quality columns added.
pearsonr_df

Unnamed: 0_level_0,aqi_pm25_concentration,gdp_per_capita,gdp,unemployed_rate,cpi,human_development_index,finance,environment,living_quality
background,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
total_consumption,0.201,0.004,0.979,-0.088,-0.07,-0.006,0.457,0.201,-0.006
happiness_score,-0.635,0.751,-0.116,-0.043,-0.436,0.797,0.17,-0.635,0.797


In [27]:
# The row labels (total_consumption / happiness_score) live in the "background"
# index, so the index must be written; with index=False the CSV rows would be
# unlabelled.
pearsonr_df.to_csv("pearsonr_df.csv")