In [36]:
import os
import pandas as pd
import numpy as np
cwd = os.getcwd()
parentwd = os.path.dirname(cwd)

In [37]:
parentwd


'/Users/ujanjan/Documents/KTH/DH2321-information-visualization/studyability/data'

In [38]:
dataset_names = [
    "country_city_universities",
    "university_tuition_fees",
    "university_rankings",
]
paths = dict()
for dataset in dataset_names:
    paths[dataset] = os.path.join(parentwd,f"{dataset}.csv")

In [39]:
paths

{'country_city_universities': '/Users/ujanjan/Documents/KTH/DH2321-information-visualization/studyability/data/country_city_universities.csv',
 'university_tuition_fees': '/Users/ujanjan/Documents/KTH/DH2321-information-visualization/studyability/data/university_tuition_fees.csv',
 'university_rankings': '/Users/ujanjan/Documents/KTH/DH2321-information-visualization/studyability/data/university_rankings.csv'}

In [40]:
# read datasets
dfs = {}
for name,path in paths.items():
    dfs[name] = pd.read_csv(path)

In [41]:
dfs["country_city_universities"].head()

Unnamed: 0,university,country,city,alpha_2,alpha_3,region,sub_region
0,University of South Africa,South Africa,Pretoria,ZA,ZAF,Africa,Sub-Saharan Africa
1,Cairo University,Egypt,Cairo,EG,EGY,Africa,Northern Africa
2,National Autonomous University of Mexico,Mexico,Mexico,MX,MEX,Americas,Latin America and the Caribbean
3,Alexandria University,Egypt,Alexandria,EG,EGY,Africa,Northern Africa
4,Ain Shams University,Egypt,Cairo,EG,EGY,Africa,Northern Africa


In [42]:
dfs["university_tuition_fees"].head()

Unnamed: 0,2025_rank,university,tuition_fee_yearly_usd,method
0,1,Massachusetts Institute of Technology (MIT),53790,official
1,2,Imperial College London,44083,scrape-topuniversities
2,3,University of Oxford,40000,estimated
3,4,Harvard University,50000,estimated
4,5,University of Cambridge,40000,estimated


In [43]:
dfs["university_rankings"].head()

Unnamed: 0,2025_rank,2024_rank,university,alpha_2,academic_reputation,employer_reputation,faculty_student,citations_per_faculty,international_faculty,international_students,international_research_network,employment_outcomes,sustainability,qs_overall_score
0,1,1,Massachusetts Institute of Technology (MIT),US,100.0,100.0,100.0,100.0,99.3,86.8,96.0,100.0,99.0,100.0
1,2,6,Imperial College London,UK,98.5,99.5,98.2,93.9,100.0,99.6,97.4,93.4,99.7,98.5
2,3,3,University of Oxford,UK,100.0,100.0,100.0,84.8,98.1,97.7,100.0,100.0,85.0,96.9
3,4,4,Harvard University,US,100.0,100.0,96.3,100.0,74.1,69.0,99.6,100.0,84.4,96.8
4,5,2,University of Cambridge,UK,100.0,100.0,100.0,84.6,100.0,94.8,99.3,100.0,84.8,96.7


In [44]:
dfs["university_rankings"].university = dfs["university_rankings"].university.str.title()
dfs["university_rankings"].university = dfs["university_rankings"].university.str.strip()
dfs["university_tuition_fees"].university = dfs["university_tuition_fees"].university.str.title()
dfs["university_tuition_fees"].university = dfs["university_tuition_fees"].university.str.strip()
dfs["country_city_universities"].university = dfs["country_city_universities"].university.str.title()
dfs["country_city_universities"].university = dfs["country_city_universities"].university.str.strip()

In [45]:
#custom fix for uni names
dfs["university_rankings"] = dfs["university_rankings"].replace("Ucl", "University College London")
dfs["university_tuition_fees"] = dfs["university_tuition_fees"].replace("Ucl", "University College London")

In [46]:
# not all of the names are matching perfectly. User rapidfuzz to find the closest strings.
from rapidfuzz import process

uni_rankings_uni = dfs["university_rankings"].university.values.tolist()
dfs["university_tuition_fees"]['university'] = dfs["university_tuition_fees"].university.apply(lambda x: process.extractOne(x, uni_rankings_uni)[0])

In [47]:
country_city_unis = dfs["country_city_universities"].university.values.tolist()
dfs["university_rankings"]['university_closest'] = dfs["university_rankings"].university.apply(lambda x: process.extractOne(x, country_city_unis)[0])

In [48]:
merged = dfs["university_rankings"].merge(dfs["university_tuition_fees"][['university','tuition_fee_yearly_usd','method']], how="left",on="university")
merged_with_cities = merged.merge(dfs["country_city_universities"][['university',"city",'region','sub_region']], how="left",left_on="university_closest", right_on="university")

In [49]:
merged_with_cities.head()

Unnamed: 0,2025_rank,2024_rank,university_x,alpha_2,academic_reputation,employer_reputation,faculty_student,citations_per_faculty,international_faculty,international_students,...,employment_outcomes,sustainability,qs_overall_score,university_closest,tuition_fee_yearly_usd,method,university_y,city,region,sub_region
0,1,1,Massachusetts Institute Of Technology (Mit),US,100.0,100.0,100.0,100.0,99.3,86.8,...,100.0,99.0,100.0,Massachusetts Institute Of Technology,53790,official,Massachusetts Institute Of Technology,Cambridge,Americas,Northern America
1,2,6,Imperial College London,UK,98.5,99.5,98.2,93.9,100.0,99.6,...,93.4,99.7,98.5,Imperial College London,44083,scrape-topuniversities,Imperial College London,London,Europe,Northern Europe
2,3,3,University Of Oxford,UK,100.0,100.0,100.0,84.8,98.1,97.7,...,100.0,85.0,96.9,University Of Oxford,40000,estimated,University Of Oxford,Oxford,Europe,Northern Europe
3,4,4,Harvard University,US,100.0,100.0,96.3,100.0,74.1,69.0,...,100.0,84.4,96.8,Harvard University,50000,estimated,Harvard University,Cambridge,Americas,Northern America
4,5,2,University Of Cambridge,UK,100.0,100.0,100.0,84.6,100.0,94.8,...,100.0,84.8,96.7,University Of Cambridge,40000,estimated,University Of Cambridge,Cambridge,Europe,Northern Europe


In [50]:
# there should not be any null values
# these values are not found in tuition fee dataset
nan_rows = merged_with_cities[merged_with_cities['tuition_fee_yearly_usd'].isnull()]
nan_rows

Unnamed: 0,2025_rank,2024_rank,university_x,alpha_2,academic_reputation,employer_reputation,faculty_student,citations_per_faculty,international_faculty,international_students,...,employment_outcomes,sustainability,qs_overall_score,university_closest,tuition_fee_yearly_usd,method,university_y,city,region,sub_region


In [51]:
# there should not be any null values
# these values are not found in tuition fee dataset
nan_rows = merged_with_cities[merged_with_cities['city'].isnull()]
nan_rows

Unnamed: 0,2025_rank,2024_rank,university_x,alpha_2,academic_reputation,employer_reputation,faculty_student,citations_per_faculty,international_faculty,international_students,...,employment_outcomes,sustainability,qs_overall_score,university_closest,tuition_fee_yearly_usd,method,university_y,city,region,sub_region


In [52]:
# add university ids
merged_with_cities['university_id'] = merged_with_cities.index

In [18]:
merged_with_cities = merged_with_cities[['2025_rank',
                                         '2024_rank',
                                         'university_x',
                                         'alpha_2',
                                         'academic_reputation',
                                         'employer_reputation',
                                         'faculty_student',
                                         'citations_per_faculty',
                                         'international_faculty',
                                         'international_students',
                                         'international_research_network',
                                         'employment_outcomes',
                                         'sustainability',
                                         'qs_overall_score',
                                         'tuition_fee_yearly_usd',
                                         'method',
                                         'city',
                                         'region',
                                         'sub_region',
                                         'university_id']]

In [19]:
merged_with_cities.to_csv(os.path.join(cwd,"university_level.csv"),index=False, header=['2025_rank',
                                                      '2024_rank',
                                                      'university',
                                                      'country_code',
                                                      'academic_reputation',                            
                                                      'employer_reputation',
                                                      'faculty_student',
                                                      'citations_per_faculty',
                                                      'international_faculty',                                  
                                                      'international_students',
                                                      'international_research_network',                                  
                                                      'employment_outcomes',
                                                      'sustainability',
                                                      'qs_overall_score',
                                                      'tuition_fee_yearly_usd',
                                                      'method',
                                                      'city',
                                                      'region',
                                                      'sub_region',
                                                      'university_id'])