In [20]:
import os
import pandas as pd
import numpy as np
cwd = os.getcwd()
parentwd = os.path.dirname(cwd)

In [21]:
parentwd


'/Users/boraberke/drive/kth/kthgit/studyability/data'

In [22]:
dataset_names = [
    "country_city_universities",
    "university_tuition_fees",
    "university_rankings",
]
paths = dict()
for dataset in dataset_names:
    paths[dataset] = os.path.join(parentwd,f"{dataset}.csv")

In [23]:
paths

{'country_city_universities': '/Users/boraberke/drive/kth/kthgit/studyability/data/country_city_universities.csv',
 'university_tuition_fees': '/Users/boraberke/drive/kth/kthgit/studyability/data/university_tuition_fees.csv',
 'university_rankings': '/Users/boraberke/drive/kth/kthgit/studyability/data/university_rankings.csv'}

In [24]:
# read datasets
dfs = {}
for name,path in paths.items():
    dfs[name] = pd.read_csv(path)

In [25]:
dfs["country_city_universities"].head()

Unnamed: 0,university,country,city,alpha_2,alpha_3,region,sub_region
0,University of South Africa,South Africa,Pretoria,ZA,ZAF,Africa,Sub-Saharan Africa
1,Cairo University,Egypt,Cairo,EG,EGY,Africa,Northern Africa
2,National Autonomous University of Mexico,Mexico,Mexico,MX,MEX,Americas,Latin America and the Caribbean
3,Alexandria University,Egypt,Alexandria,EG,EGY,Africa,Northern Africa
4,Ain Shams University,Egypt,Cairo,EG,EGY,Africa,Northern Africa


In [26]:
dfs["university_tuition_fees"].head()

Unnamed: 0,2025_rank,university,tuition_fee_yearly_usd,method
0,1,Massachusetts Institute of Technology (MIT),53790,official
1,2,Imperial College London,44083,scrape-topuniversities
2,3,University of Oxford,40000,estimated
3,4,Harvard University,50000,estimated
4,5,University of Cambridge,40000,estimated


In [27]:
dfs["university_rankings"].head()

Unnamed: 0,2025_rank,2024_rank,university,alpha_2,academic_reputation,employer_reputation,international_students,employment_outcomes,sustainability,qs_overall_score
0,1,1,Massachusetts Institute of Technology (MIT),US,100.0,100.0,86.8,100.0,99.0,100.0
1,2,6,Imperial College London,UK,98.5,99.5,99.6,93.4,99.7,98.5
2,3,3,University of Oxford,UK,100.0,100.0,97.7,100.0,85.0,96.9
3,4,4,Harvard University,US,100.0,100.0,69.0,100.0,84.4,96.8
4,5,2,University of Cambridge,UK,100.0,100.0,94.8,100.0,84.8,96.7


In [28]:
dfs["university_rankings"].university = dfs["university_rankings"].university.str.title()
dfs["university_rankings"].university = dfs["university_rankings"].university.str.strip()
dfs["university_tuition_fees"].university = dfs["university_tuition_fees"].university.str.title()
dfs["university_tuition_fees"].university = dfs["university_tuition_fees"].university.str.strip()
dfs["country_city_universities"].university = dfs["country_city_universities"].university.str.title()
dfs["country_city_universities"].university = dfs["country_city_universities"].university.str.strip()

In [29]:
#custom fix for uni names
dfs["university_rankings"] = dfs["university_rankings"].replace("Ucl", "University College London")
dfs["university_tuition_fees"] = dfs["university_tuition_fees"].replace("Ucl", "University College London")

In [30]:
# not all of the names are matching perfectly. User rapidfuzz to find the closest strings.
from rapidfuzz import process

uni_rankings_uni = dfs["university_rankings"].university.values.tolist()
dfs["university_tuition_fees"]['university'] = dfs["university_tuition_fees"].university.apply(lambda x: process.extractOne(x, uni_rankings_uni)[0])

In [31]:
country_city_unis = dfs["country_city_universities"].university.values.tolist()
dfs["university_rankings"]['university_closest'] = dfs["university_rankings"].university.apply(lambda x: process.extractOne(x, country_city_unis)[0])

In [32]:
merged = dfs["university_rankings"].merge(dfs["university_tuition_fees"][['university','tuition_fee_yearly_usd','method']], how="left",on="university")
merged_with_cities = merged.merge(dfs["country_city_universities"][['university',"city",'region','sub_region']], how="left",left_on="university_closest", right_on="university")

In [33]:
merged_with_cities.head()

Unnamed: 0,2025_rank,2024_rank,university_x,alpha_2,academic_reputation,employer_reputation,international_students,employment_outcomes,sustainability,qs_overall_score,university_closest,tuition_fee_yearly_usd,method,university_y,city,region,sub_region
0,1,1,Massachusetts Institute Of Technology (Mit),US,100.0,100.0,86.8,100.0,99.0,100.0,Massachusetts Institute Of Technology,53790.0,official,Massachusetts Institute Of Technology,Cambridge,Americas,Northern America
1,2,6,Imperial College London,UK,98.5,99.5,99.6,93.4,99.7,98.5,Imperial College London,44083.0,scrape-topuniversities,Imperial College London,London,Europe,Northern Europe
2,3,3,University Of Oxford,UK,100.0,100.0,97.7,100.0,85.0,96.9,University Of Oxford,40000.0,estimated,University Of Oxford,Oxford,Europe,Northern Europe
3,4,4,Harvard University,US,100.0,100.0,69.0,100.0,84.4,96.8,Harvard University,50000.0,estimated,Harvard University,Cambridge,Americas,Northern America
4,5,2,University Of Cambridge,UK,100.0,100.0,94.8,100.0,84.8,96.7,University Of Cambridge,40000.0,estimated,University Of Cambridge,Cambridge,Europe,Northern Europe


In [34]:
# there should not be any null values
# these values are not found in tuition fee dataset
nan_rows = merged_with_cities[merged_with_cities['tuition_fee_yearly_usd'].isnull()]
nan_rows

Unnamed: 0,2025_rank,2024_rank,university_x,alpha_2,academic_reputation,employer_reputation,international_students,employment_outcomes,sustainability,qs_overall_score,university_closest,tuition_fee_yearly_usd,method,university_y,city,region,sub_region
897,851-900,951-1000,"Vietnam National University, Hanoi",VN,15.3,18.7,1.2,60.1,6.4,-,"Vietnam National University, Hanoi",,,"Vietnam National University, Hanoi",Hanoi,Asia,South-eastern Asia
898,851-900,801-850,Vilnius Gediminas Technical University,LT,8.7,13.8,32.9,6.9,2.0,-,Cairo University,,,Cairo University,Cairo,Africa,Northern Africa
899,851-900,761-770,Virginia Commonwealth University,US,7.1,8.5,3.2,8.2,8.5,-,Virginia Commonwealth University,,,Virginia Commonwealth University,Richmond,Americas,Northern America
900,851-900,951-1000,Yildiz Technical University,TR,9.7,24.7,18.5,19.9,35.1,-,Yıldız Technical University,,,Yıldız Technical University,Istanbul,Asia,Western Asia
902,771-780,801-850,China University Of Geosciences,CN,5.1,1.4,2.7,1.4,4.0,-,China University Of Geosciences (Wuhan),,,China University Of Geosciences (Wuhan),Wuhan,Asia,Eastern Asia
903,1001-1200,1001-1200,Harbin Engineering University,CN,5.4,2.4,2.2,3.8,1.3,-,Harbin Engineering University,,,Harbin Engineering University,Harbin,Asia,Eastern Asia
904,901-950,851-900,Catania University,IT,11.6,3.0,4.5,5.7,3.6,-,University Of Catania,,,University Of Catania,Catania,Europe,Southern Europe
908,901-950,1001-1200,German Jordanian University,JO,8.6,15.1,16.3,8.5,1.5,-,Cairo University,,,Cairo University,Cairo,Africa,Northern Africa
909,901-950,1001-1200,Imam Mohammad Ibn Saud Islamic University – Imsiu,SA,14.8,7.6,4.2,26.5,1.0,-,University Of South Africa,,,University Of South Africa,Pretoria,Africa,Sub-Saharan Africa
910,901-950,851-900,Kanazawa University,JP,10.5,3.9,5.8,1.8,1.6,-,Kanazawa University,,,Kanazawa University,Kanazawa,Asia,Eastern Asia


In [35]:
# there should not be any null values
# these values are not found in tuition fee dataset
nan_rows = merged_with_cities[merged_with_cities['city'].isnull()]
nan_rows

Unnamed: 0,2025_rank,2024_rank,university_x,alpha_2,academic_reputation,employer_reputation,international_students,employment_outcomes,sustainability,qs_overall_score,university_closest,tuition_fee_yearly_usd,method,university_y,city,region,sub_region
218,219,217,National Yang Ming Chiao Tung University,TW,32.9,62.7,19.2,58.3,38.8,43.9,National Yang Ming Chiao Tung University,10000.0,estimated,National Yang Ming Chiao Tung University,,Asia,Eastern Asia
485,485,505,National Sun Yat-Sen University,TW,26.5,24.9,10.8,11.4,48.9,24.8,National Sun Yat-Sen University,8000.0,estimated,National Sun Yat-Sen University,,Asia,Eastern Asia
618,611-620,577,Taipei Medical University (Tmu),TW,10.6,12.5,13.7,8.1,8.3,-,Taipei Medical University,1800.0,scrape-topuniversities,Taipei Medical University,,Asia,Eastern Asia


In [36]:
# add university ids
merged_with_cities['university_id'] = merged_with_cities.index

In [37]:
merged_with_cities = merged_with_cities[['2025_rank',
                                         '2024_rank',
                                         'university_x',
                                         'alpha_2',
                                         'academic_reputation',
                                         'employer_reputation',
                                         'international_students',
                                         'employment_outcomes',
                                         'sustainability',
                                         'qs_overall_score',
                                         'tuition_fee_yearly_usd',
                                         'method',
                                         'city',
                                         'region',
                                         'sub_region',
                                         'university_id']]

In [38]:
merged_with_cities.to_csv(os.path.join(cwd,"university_level.csv"),index=False, header=['2025_rank',
                                                      '2024_rank',
                                                      'university',
                                                      'country_code',
                                                      'academic_reputation',
                                                      'employer_reputation',
                                                      'international_students',
                                                      'employment_outcomes',
                                                      'sustainability',
                                                      'qs_overall_score',
                                                      'tuition_fee_yearly_usd',
                                                      'method',
                                                      'city',
                                                      'region',
                                                      'sub_region',
                                                      'university_id'])