In [1]:
import os
import pandas as pd
import numpy as np
cwd = os.getcwd()
parentwd = os.path.dirname(cwd)

In [2]:
parentwd


'/Users/boraberke/drive/kth/kthgit/studyability/data'

In [3]:
dataset_names = [
    "temperature_cities",
    "cost_of_living_cities",
    "ef_education_index",
]
paths = dict()
for dataset in dataset_names:
    paths[dataset] = os.path.join(parentwd,f"{dataset}.csv")

In [4]:
paths

{'temperature_cities': '/Users/boraberke/drive/kth/kthgit/studyability/data/temperature_cities.csv',
 'cost_of_living_cities': '/Users/boraberke/drive/kth/kthgit/studyability/data/cost_of_living_cities.csv',
 'ef_education_index': '/Users/boraberke/drive/kth/kthgit/studyability/data/ef_education_index.csv'}

In [5]:
# read datasets
dfs = {}
for name,path in paths.items():
    dfs[name] = pd.read_csv(path)

In [6]:
dfs["temperature_cities"].head()

Unnamed: 0,country,city,jan,feb,mar,apr,jun,jul,aug,sep,oct,nov,dec,year_avg,alpha_2,alpha_3,region,sub_region
0,Algeria,Algiers,11.2,11.9,12.8,14.7,21.3,24.6,25.2,23.2,19.4,15.2,12.1,17.4,DZ,DZA,Africa,Northern Africa
1,Algeria,Tamanrasset,12.8,15.0,18.1,22.2,28.9,28.7,28.2,26.5,22.4,17.3,13.9,21.7,DZ,DZA,Africa,Northern Africa
2,Algeria,Reggane,16.0,18.2,23.1,27.9,36.4,39.8,38.4,35.5,29.2,22.0,17.8,28.3,DZ,DZA,Africa,Northern Africa
3,Angola,Luanda,26.7,28.5,28.6,28.2,23.9,22.1,22.1,23.5,25.2,26.7,26.9,25.8,AO,AGO,Africa,Sub-Saharan Africa
4,Benin,Cotonou,27.3,28.5,28.9,28.6,26.5,25.8,25.6,26.0,26.7,27.6,27.3,27.2,BJ,BEN,Africa,Sub-Saharan Africa


In [7]:
dfs["cost_of_living_cities"].head()

Unnamed: 0,rank,cost_of_living_index,rent_index,cost_of_living_plus_rent_index,groceries_index,restaurant_price_index,local_purchasing_power_index,year,country,city,alpha_2,alpha_3,region,sub_region
0,1,140.4,93.6,118.1,144.1,141.9,74.3,2023-06-01,Bermuda,Hamilton,BM,BMU,Americas,Northern America
1,2,130.0,46.4,90.2,137.4,121.7,113.2,2023-06-01,Switzerland,Basel,CH,CHE,Europe,Western Europe
2,3,128.5,69.1,100.2,120.4,129.7,104.0,2023-06-01,Switzerland,Zurich,CH,CHE,Europe,Western Europe
3,4,120.6,53.4,88.6,116.1,114.7,99.4,2023-06-01,Switzerland,Lausanne,CH,CHE,Europe,Western Europe
4,5,119.1,66.7,94.1,112.6,120.7,121.4,2023-06-01,Switzerland,Zug,CH,CHE,Europe,Western Europe


In [8]:
dfs["ef_education_index"].head()

Unnamed: 0,rank,country,ef_score,level,year,alpha_2,alpha_3,region,sub_region
0,1,Norway,69.09,Very High Proficiency,2011,NO,NOR,Europe,Northern Europe
1,2,Netherlands,67.93,Very High Proficiency,2011,NL,NLD,Europe,Western Europe
2,3,Denmark,66.91,Very High Proficiency,2011,DK,DNK,Europe,Northern Europe
3,4,Sweden,66.26,Very High Proficiency,2011,SE,SWE,Europe,Northern Europe
4,5,Finland,65.38,Very High Proficiency,2011,FI,FIN,Europe,Northern Europe


In [None]:
# use 2024-mid year data for cost of living
dfs["cost_of_living_cities"] = dfs["cost_of_living_cities"].loc[dfs["cost_of_living_cities"].year == "2024-06-01"]

# # use 2022 data for temperature
# dfs["temperature_countries"] = dfs["temperature_countries"].loc[dfs["temperature_countries"].year == 2022]

# use 2024 data for ef_education_index
dfs["ef_education_index"] = dfs["ef_education_index"].loc[dfs["ef_education_index"].year == 2024]


native_speaking = pd.DataFrame({"rank":[0, 0, 0,0,0,0],
                    "country": ["United States", "Australia","Canada","United Kingdom","Ireland","New Zealand"],
                    "ef_score":[650.0, 650.0, 650.0, 650.0, 650.0, 650.0],
                    "level":["Native", "Native", "Native","Native","Native","Native"],
                    "alpha_2":["US", "AU", "CA","GB","IE","NZ"],
                     "region":["Americas", "Oceania", "Americas","Europe","Europe","Oceania"],
                     "sub_region":["Northern America", "Australia and New Zealand", "Northern America","Northern Europe","Northern Europe","Australia and New Zealand"]
                               })



dfs["ef_education_index"] = pd.concat([dfs["ef_education_index"], native_speaking], ignore_index=True)

In [None]:
merged = dfs["cost_of_living_cities"].merge(dfs["temperature_cities"][['year_avg','city']], how="left",on="city")
merged_with_ef = merged.merge(dfs["ef_education_index"][['ef_score',"level","alpha_2"]], how="left",on="alpha_2")

In [None]:
merged_with_ef.head()

In [None]:
# there should not be any null values
# these values are not found in city temperature dataset
nan_rows = merged_with_ef[merged_with_ef['year_avg'].isnull()]
nan_rows

In [None]:
 merged_with_ef[merged_with_ef['ef_score'].isnull()]

In [None]:
# post process add missing
missing_countries = pd.DataFrame({"rank":[117, 118, 119],
                    "country": ["Brunei", "Malta","Puerto Rico"],
                    "ef_score":[393.0, 392.0, 391.0],
                    "level":["Very Low Proficiency", "Very Low Proficiency", "Very Low Proficiency"],
                    "alpha_2":["BN", "MT", "PR"],
                     "region":["Asia", "Europe", "Americas"],
                     "sub_region":["South-eastern Asia", "Southern Europe", "Americas and the Caribbean"]
                               })
merged_with_ef = pd.concat([merged_with_ef, missing_countries], ignore_index=True)

In [None]:
# observe that the post processed countries are added
merged_with_ef.tail()

In [15]:
# observe that the post processed countries are added
merged_with_ef.tail()

Unnamed: 0,rank,cost_of_living_index,rent_index,cost_of_living_plus_rent_index,groceries_index,restaurant_price_index,local_purchasing_power_index,year,country,city,alpha_2,alpha_3,region,sub_region,year_avg,ef_score,level
216,217,19.1,3.3,11.5,16.8,15.3,27.3,2024-06-01,Pakistan,Lahore,PK,PAK,Asia,Southern Asia,24.3,493.0,Low Proficiency
217,218,19.0,2.9,11.3,17.6,13.1,28.0,2024-06-01,Pakistan,Karachi,PK,PAK,Asia,Southern Asia,26.0,493.0,Low Proficiency
218,117,,,,,,,,Brunei,,BN,,Asia,South-eastern Asia,,393.0,Very Low Proficiency
219,118,,,,,,,,Malta,,MT,,Europe,Southern Europe,,392.0,Very Low Proficiency
220,119,,,,,,,,Puerto Rico,,PR,,Americas,Americas and the Caribbean,,391.0,Very Low Proficiency


In [16]:
merged_with_ef = merged_with_ef[['cost_of_living_index',
                                         'rent_index',
                                         'cost_of_living_plus_rent_index',
                                         'groceries_index',
                                         'restaurant_price_index',
                                         'local_purchasing_power_index',
                                         'country',
                                         'city',
                                         'alpha_2',
                                         'year_avg',
                                         'ef_score',
                                         'level']]
                                         

In [17]:
merged_with_ef.to_csv(os.path.join(cwd,"city_level.csv"),index=False, header=['cost_of_living_index',
                                         'rent_index',
                                         'cost_of_living_plus_rent_index',
                                         'groceries_index',
                                         'restaurant_price_index',
                                         'local_purchasing_power_index',
                                         'country',
                                         'city',
                                         'country_code',
                                         'temp_year_avg',
                                         'ef_score',
                                         'ef_level'])