In [89]:
import os
import pandas as pd
import numpy as np
# Directory the notebook runs from, and the directory that contains it
# (the input CSV paths are built from the latter).
cwd = os.getcwd()
parentwd = os.path.split(cwd)[0]

In [90]:
# Display the parent directory of the working directory.
parentwd


'/Users/boraberke/drive/kth/kthgit/studyability/data'

In [91]:
# Base names (without extension) of the CSV files to load.
dataset_names = [
    "temperature_cities",
    "cost_of_living_cities",
    "ef_education_index",
]
# Map every dataset name to "<parentwd>/<name>.csv".
paths = {
    dataset_name: os.path.join(parentwd, f"{dataset_name}.csv")
    for dataset_name in dataset_names
}

In [92]:
# Display the resolved CSV path of each dataset.
paths

{'temperature_cities': '/Users/boraberke/drive/kth/kthgit/studyability/data/temperature_cities.csv',
 'cost_of_living_cities': '/Users/boraberke/drive/kth/kthgit/studyability/data/cost_of_living_cities.csv',
 'ef_education_index': '/Users/boraberke/drive/kth/kthgit/studyability/data/ef_education_index.csv'}

In [93]:
# Load every CSV into a DataFrame, keyed by its dataset name.
dfs = {name: pd.read_csv(csv_path) for name, csv_path in paths.items()}

In [94]:
# Preview the first rows of the city temperature table.
dfs["temperature_cities"].head()

Unnamed: 0,country,city,jan,feb,mar,apr,jun,jul,aug,sep,oct,nov,dec,year_avg,alpha_2,alpha_3,region,sub_region
0,Algeria,Algiers,11.2,11.9,12.8,14.7,21.3,24.6,25.2,23.2,19.4,15.2,12.1,17.4,DZ,DZA,Africa,Northern Africa
1,Algeria,Tamanrasset,12.8,15.0,18.1,22.2,28.9,28.7,28.2,26.5,22.4,17.3,13.9,21.7,DZ,DZA,Africa,Northern Africa
2,Algeria,Reggane,16.0,18.2,23.1,27.9,36.4,39.8,38.4,35.5,29.2,22.0,17.8,28.3,DZ,DZA,Africa,Northern Africa
3,Angola,Luanda,26.7,28.5,28.6,28.2,23.9,22.1,22.1,23.5,25.2,26.7,26.9,25.8,AO,AGO,Africa,Sub-Saharan Africa
4,Benin,Cotonou,27.3,28.5,28.9,28.6,26.5,25.8,25.6,26.0,26.7,27.6,27.3,27.2,BJ,BEN,Africa,Sub-Saharan Africa


In [95]:
# Preview the first rows of the city cost-of-living table.
dfs["cost_of_living_cities"].head()

Unnamed: 0,rank,cost_of_living_index,rent_index,cost_of_living_plus_rent_index,groceries_index,restaurant_price_index,local_purchasing_power_index,year,country,city,alpha_2,alpha_3,region,sub_region
0,1,140.4,93.6,118.1,144.1,141.9,74.3,2023-06-01,Bermuda,Hamilton,BM,BMU,Americas,Northern America
1,2,130.0,46.4,90.2,137.4,121.7,113.2,2023-06-01,Switzerland,Basel,CH,CHE,Europe,Western Europe
2,3,128.5,69.1,100.2,120.4,129.7,104.0,2023-06-01,Switzerland,Zurich,CH,CHE,Europe,Western Europe
3,4,120.6,53.4,88.6,116.1,114.7,99.4,2023-06-01,Switzerland,Lausanne,CH,CHE,Europe,Western Europe
4,5,119.1,66.7,94.1,112.6,120.7,121.4,2023-06-01,Switzerland,Zug,CH,CHE,Europe,Western Europe


In [96]:
# Preview the first rows of the EF education index table.
dfs["ef_education_index"].head()

Unnamed: 0,rank,country,ef_score,level,year,alpha_2,alpha_3,region,sub_region
0,1,Norway,69.09,Very High Proficiency,2011,NO,NOR,Europe,Northern Europe
1,2,Netherlands,67.93,Very High Proficiency,2011,NL,NLD,Europe,Western Europe
2,3,Denmark,66.91,Very High Proficiency,2011,DK,DNK,Europe,Northern Europe
3,4,Sweden,66.26,Very High Proficiency,2011,SE,SWE,Europe,Northern Europe
4,5,Finland,65.38,Very High Proficiency,2011,FI,FIN,Europe,Northern Europe


In [103]:
# Cost of living: keep only the mid-2024 snapshot.
cost_of_living = dfs["cost_of_living_cities"]
dfs["cost_of_living_cities"] = cost_of_living[cost_of_living["year"] == "2024-06-01"]

# EF education index: keep only the 2024 rows.
ef_index = dfs["ef_education_index"]
dfs["ef_education_index"] = ef_index[ef_index["year"] == 2024]

# Countries appended to the EF index with level "Native", rank 0 and a fixed
# score of 650.0, as (country, alpha_2, region, sub_region).
# NOTE(review): presumably these are added because the EF index does not list
# them -- confirm against the source data.
native_countries = [
    ("United States", "US", "Americas", "Northern America"),
    ("Australia", "AU", "Oceania", "Australia and New Zealand"),
    ("Canada", "CA", "Americas", "Northern America"),
    ("United Kingdom", "GB", "Europe", "Northern Europe"),
    ("Ireland", "IE", "Europe", "Northern Europe"),
    ("New Zealand", "NZ", "Oceania", "Australia and New Zealand"),
]
native_count = len(native_countries)
native_speaking = pd.DataFrame(
    {
        "rank": [0] * native_count,
        "country": [entry[0] for entry in native_countries],
        "ef_score": [650.0] * native_count,
        "level": ["Native"] * native_count,
        "alpha_2": [entry[1] for entry in native_countries],
        "region": [entry[2] for entry in native_countries],
        "sub_region": [entry[3] for entry in native_countries],
    }
)

# The appended rows define no "year" or "alpha_3", so those columns are NaN
# for them after the concat.
dfs["ef_education_index"] = pd.concat(
    [dfs["ef_education_index"], native_speaking],
    ignore_index=True,
)

In [104]:
# Attach the yearly average temperature to every cost-of-living city.
# The join key is city *and* country code: a city name alone is not unique
# across countries, so joining on "city" only can attach another country's
# temperature to a row or duplicate rows.
# NOTE(review): assumes both tables carry consistent alpha_2 codes for the
# same country -- confirm, otherwise some former matches will turn into NaN.
temperature_by_city = dfs["temperature_cities"][
    ["city", "alpha_2", "year_avg"]
].drop_duplicates(subset=["city", "alpha_2"])  # one temperature row per key
merged = dfs["cost_of_living_cities"].merge(
    temperature_by_city,
    how="left",
    on=["city", "alpha_2"],
)

# Attach the country-level EF score and proficiency level.
# pandas matches null keys against each other, so EF rows without a country
# code are dropped first; otherwise they would be joined onto cities whose
# own alpha_2 is missing. Duplicate codes are collapsed (first row wins) so
# that the left merge cannot multiply city rows.
ef_by_country = (
    dfs["ef_education_index"][["ef_score", "level", "alpha_2"]]
    .dropna(subset=["alpha_2"])
    .drop_duplicates(subset="alpha_2")
)
merged_with_ef = merged.merge(ef_by_country, how="left", on="alpha_2")

# Both right-hand sides have unique keys, so the left merges must preserve
# the number of city rows.
assert len(merged_with_ef) == len(dfs["cost_of_living_cities"])

In [105]:
# Preview the first rows of the merged city-level table.
merged_with_ef.head()

Unnamed: 0,rank,cost_of_living_index,rent_index,cost_of_living_plus_rent_index,groceries_index,restaurant_price_index,local_purchasing_power_index,year,country,city,alpha_2,alpha_3,region,sub_region,year_avg,ef_score,level
0,1,101.7,59.6,81.5,111.4,101.5,152.2,2024-06-01,Switzerland,Geneva,CH,CHE,Europe,Western Europe,,550.0,High Proficiency
1,2,100.4,53.5,77.9,105.9,102.8,181.2,2024-06-01,Switzerland,Zurich,CH,CHE,Europe,Western Europe,,550.0,High Proficiency
2,3,100.0,100.0,100.0,100.0,100.0,100.0,2024-06-01,United States,New York,US,USA,Americas,Northern America,,650.0,Native
3,4,90.5,77.9,84.5,101.8,88.2,162.1,2024-06-01,United States,San Francisco,US,USA,Americas,Northern America,,650.0,Native
4,5,85.8,71.2,78.8,96.9,82.5,128.3,2024-06-01,United States,Boston,US,USA,Americas,Northern America,10.9,650.0,Native


In [106]:
# Rows whose yearly average temperature is missing after the merge.
# Ideally this would be empty; the rows listed here presumably had no
# matching entry in the city temperature dataset.
missing_temperature = merged_with_ef["year_avg"].isna()
nan_rows = merged_with_ef.loc[missing_temperature]
nan_rows

Unnamed: 0,rank,cost_of_living_index,rent_index,cost_of_living_plus_rent_index,groceries_index,restaurant_price_index,local_purchasing_power_index,year,country,city,alpha_2,alpha_3,region,sub_region,year_avg,ef_score,level
0,1,101.7,59.6,81.5,111.4,101.5,152.2,2024-06-01,Switzerland,Geneva,CH,CHE,Europe,Western Europe,,550.0,High Proficiency
1,2,100.4,53.5,77.9,105.9,102.8,181.2,2024-06-01,Switzerland,Zurich,CH,CHE,Europe,Western Europe,,550.0,High Proficiency
2,3,100.0,100.0,100.0,100.0,100.0,100.0,2024-06-01,United States,New York,US,USA,Americas,Northern America,,650.0,Native
3,4,90.5,77.9,84.5,101.8,88.2,162.1,2024-06-01,United States,San Francisco,US,USA,Americas,Northern America,,650.0,Native
5,6,83.9,42.1,63.8,91.1,89.3,104.9,2024-06-01,Iceland,Reykjavik,IS,ISL,Europe,Northern Europe,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,210,23.9,7.4,16.0,24.6,19.7,121.7,2024-06-01,India,Delhi,IN,IND,Asia,Southern Asia,,490.0,Low Proficiency
210,211,22.8,6.4,15.0,25.6,16.4,127.2,2024-06-01,India,Hyderabad,IN,IND,Asia,Southern Asia,,490.0,Low Proficiency
211,212,22.8,4.8,14.2,26.3,16.9,63.4,2024-06-01,India,Ahmedabad,IN,IND,Asia,Southern Asia,,490.0,Low Proficiency
213,214,21.8,5.5,14.0,26.9,12.3,101.6,2024-06-01,India,Chennai,IN,IND,Asia,Southern Asia,,490.0,Low Proficiency


In [107]:
 # Rows that received no EF score from the country-level merge.
 merged_with_ef.loc[merged_with_ef["ef_score"].isna()]

Unnamed: 0,rank,cost_of_living_index,rent_index,cost_of_living_plus_rent_index,groceries_index,restaurant_price_index,local_purchasing_power_index,year,country,city,alpha_2,alpha_3,region,sub_region,year_avg,ef_score,level
5,6,83.9,42.1,63.8,91.1,89.3,104.9,2024-06-01,Iceland,Reykjavik,IS,ISL,Europe,Northern Europe,,,
82,83,56.4,15.9,36.9,59.4,50.3,41.4,2024-06-01,Trinidad and Tobago,Port of Spain,TT,TTO,Americas,Latin America and the Caribbean,,,
95,96,51.5,22.4,37.6,49.9,48.4,80.2,2024-06-01,Slovenia,Ljubljana,SI,SVN,Europe,Southern Europe,10.9,,
96,97,50.9,11.9,32.2,43.3,48.8,68.3,2024-06-01,Latvia,Riga,LV,LVA,Europe,Northern Europe,6.2,,
105,106,47.0,18.2,33.2,62.2,26.8,99.3,2024-06-01,Taiwan,Taipei,TW,TWN,,,23.0,,
142,143,37.9,13.3,26.1,36.7,30.1,58.8,2024-06-01,Montenegro,Podgorica,ME,MNE,Europe,Southern Europe,15.3,,
147,148,36.4,7.8,22.7,35.5,23.6,64.3,2024-06-01,Bosnia and Herzegovina,Sarajevo,BA,BIH,Europe,Southern Europe,10.1,,
159,160,34.6,7.6,21.6,33.4,24.3,53.3,2024-06-01,North Macedonia,Skopje,MK,MKD,Europe,Southern Europe,12.4,,
194,195,29.4,8.3,19.3,28.6,21.9,50.7,2024-06-01,Kosovo,Pristina,,,,,,,


In [110]:
# Columns kept for the city-level export, in output order.
city_level_columns = [
    "cost_of_living_index",
    "rent_index",
    "cost_of_living_plus_rent_index",
    "groceries_index",
    "restaurant_price_index",
    "local_purchasing_power_index",
    "country",
    "city",
    "alpha_2",
    "year_avg",
    "ef_score",
    "level",
]
merged_with_ef = merged_with_ef.loc[:, city_level_columns]
                                         

In [111]:
# Write the city-level table to "city_level.csv" in the working directory.
# Columns are renamed by name instead of through a positional `header=` list:
# positional aliases have to repeat every column and would silently mislabel
# the file if the column order of `merged_with_ef` ever changed.
export_column_names = {
    "alpha_2": "country_code",
    "year_avg": "temp_year_avg",
    "level": "ef_level",
}
merged_with_ef.rename(columns=export_column_names).to_csv(
    os.path.join(cwd, "city_level.csv"),
    index=False,
)