In [35]:
# Libraries
import numpy as np
import pandas as pd
import csv
from bs4 import BeautifulSoup
import requests
import re
import os
from urllib.request import urlopen
import time
from concurrent.futures import ThreadPoolExecutor

import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from collections import Counter
from functools import reduce
import json
import heapq

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_geocoder")
import plotly.express as px
import plotly.graph_objects as go

# 1. Data collection

## 1.1. Get the list of master's degree courses
We created a file named 'urls.txt' that contains all the urls associated with the url of each master page.
for reaching such purpose, we iterate over all 400 pages and took the link for every 15 urls of each page.
we stored all urls in 'urls.txt' file.

In [None]:
f = open("urls.txt","w") # First we create a txt file where we can write the URLs  #  w means writing mode
for i in range(1, 401): #from first page to page 400
    url = f"https://www.findamasters.com/masters-degrees/msc-degrees/?PG={i}" #pages can be scrolled by changing the number after PG
    result = requests.get(url) # as we have done in class
    soup = BeautifulSoup(result.text, 'html.parser') # to get the html of each page

    for link in soup.find_all(class_ = re.compile('courseLink')): #as in class to get each tag of the page which belongs to class courseLink
        c = (link.get("href"))  # url of each page in the i-th page
        f.write("https://www.findamasters.com/"+c) #writing the rows
        f.write("\n")
f.close()
print('The "urls.txt" file is generated!')

### 1.2. Crawl master's degree pages

We wrote a function named 'download_url'.
Since the FindMaster website blocks us for 70 seconds for every (20 to 22) requests we send, we use 'time.sleep(70)' to wait and then resend the http get request. 
we also omit to download the http files that their directory are already existed.

for sending http get requests asynchronously, we can use async and await methods and take the advantage of using "aiohttp" library. the other way is to use ThreadPoolExecutor function executer. 
It means that we store the executer command in a variable named 'future_to_url' that we are able to call in the future.
The ThreadPoolExecutor is a built-in Python module that provides managing a pool of worker threads. It allows us to submit tasks to the pool, which are then executed by one of the worker threads in the pool.

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Function to download and save HTML for a given URL
def download_url(url, folder_path, page_number):
    # Create a folder for each page if it doesn't exist
    page_folder = os.path.join(folder_path, f"page_{page_number}")
    if os.path.exists(page_folder):
        # uncomment the below code to see which pages are skiped, cause they have already been downloaded.
        # print(f"Skipping Page: {page_number} - Folder already exists.")
        return

    try:
        response = requests.get(url) # Send a GET request to the URL
        response.raise_for_status()  # Raise an exception for bad responses 

        # Create a folder for each page if it doesn't exist
        os.makedirs(page_folder, exist_ok=True)

        # Save the HTML content to a file
        file_path = os.path.join(page_folder, f"html_{page_number}.html")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print(f"Downloaded page {page_number}: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download page {page_number}: {url}")
        print(f"Error: {e}")
        print("Retrying in 70 seconds...")
        time.sleep(70)  # Wait for 10 seconds before retrying
        download_url(url, folder_path, page_number)  # Retry the download

# Read all URLs one by one
with open('urls.txt', 'r') as urls_file:
    urls = urls_file.read().splitlines()

output_folder = 'HTML_folders' # Store all HTML files into this directory.

# We can use ThreadPoolExecutor for sending http requests asynchronously. 
# However, Since the FindMaster website blocks us for 70 seconds for every (20 to 22) requests we send, 
# the max_workers in below code assigned to number 1. So it sends requests synchronously.
with ThreadPoolExecutor(max_workers=1) as executor:
    # Enumerate through each URL and submit download tasks to the executor
    future_to_url = {executor.submit(download_url, url, output_folder, page_number): url for page_number, url in enumerate(urls, start=1)}

print("All HTML files are stored in the HTML_folders directory.")

### 1.3 Parse downloaded pages
Here we create a '.tsv' file including the following columns for each of the HTML files.

1. Course Name (to save as ```courseName```): string;
2. University (to save as ```universityName```): string;
3. Faculty (to save as ```facultyName```): string
4. Full or Part Time (to save as ```isItFullTime```): string;
5. Short Description (to save as ```description```): string;
6. Start Date (to save as ```startDate```): string;
7. Fees (to save as ```fees```): string;
8. Modality (to save as ```modality```):string;
9. Duration (to save as ```duration```):string;
10. City (to save as ```city```): string;
11. Country (to save as ```country```): string;
12. Presence or online modality (to save as ```administration```): string;
13. Link to the page (to save as ```url```): string.

Then, we merge all those files together to generate our final dataset.

In [None]:
current_path = os.getcwd()
# '/Users/armanfeili/Arman/Sapienza Courses/ADM/Homeworks/HW3/phase-2/ADM-HW3/HTML_folders'

for i in range(1,6001):
    # os.chdir(r'C:\Users\susan\Documents\DS\ADM\HW3\ADM-HW3\HTML_folders\page_'+str(i)) #change directories
    os.chdir(r'/Users/armanfeili/Arman/Sapienza Courses/ADM/Homeworks/HW3/phase-3/ADM-HW3/HTML_folders/page_'+str(i)) #change directories
    
    for filename in os.listdir(os.getcwd()): # get all the files in a folder
        if filename.endswith(".tsv"): continue # tsv file is already generated.
        elif filename.endswith(".html"): # if file extension is .html
            with open(os.path.join(os.getcwd(), filename), 'r',encoding='utf-8') as f: # open each file into a folder
                soup = BeautifulSoup(f,'html.parser') # get the html file by each file 
                out=[] # initialize a list where we append all the informations parsed from each html file

                # 1  Course Name
                courseName = soup.find_all(class_=re.compile("course-header__course-title"))
                out.append(courseName[0].text.strip() if courseName else "") #text.strip to eliminate strange simbols for the space
                # 2  University
                universityName = soup.find_all(class_=re.compile("course-header__institution"))
                out.append(universityName[0].text if universityName else "")
                # 3  Faculty
                facultyName = soup.find_all(class_=re.compile("course-header__department"))
                out.append(facultyName[0].text if facultyName else "")
                # 4  Full or Part Time
                isItFullTime = soup.find_all(class_=re.compile("concealLink"))
                out.append(isItFullTime[0].text if isItFullTime else "")
                # 5  Short Description
                description = soup.find_all(class_=re.compile("course-sections__content"))
                out.append(description[0].text.replace('\n', '') if description else "")
                # 6  Start Date
                startDate = soup.find_all(class_=re.compile("key-info__start-date"))
                out.append(startDate[0].text if startDate else "")
                # 7  Fees 
                fees_elements = soup.find_all(class_=re.compile("course-sections__fees")) # taking the fee
                fees_text = fees_elements[0].text.replace('\n', '') if fees_elements else "" 
                cleaned_fees = re.sub(r'Fees', '', fees_text)  # To not "Fees" at the beginning 
                out.append(cleaned_fees.strip() if cleaned_fees else "")
                # 8  Modality
                modality = soup.find_all(class_=re.compile("key-info__qualification"))
                out.append(modality[0].text if modality else "")
                # 9  Duration
                duration = soup.find_all(class_=re.compile("key-info__duration"))
                out.append(duration[0].text if duration else "")
                # 10  City
                city = soup.find_all(class_=re.compile("course-data__city"))
                out.append(city[0].text if city else "")
                # 11  Country
                country = soup.find_all(class_=re.compile("course-data__country"))
                out.append(country[0].text if country else "")
                # 12  Presence or online modality
                # We have seen that some courses has both online or oncampus modality, one of them is "Master of Business Administration"
                on_campus_elements = soup.find_all(class_=re.compile("course-data__on-campus"))
                online_elements = soup.find_all(class_=re.compile("course-data__online"))
                if on_campus_elements and online_elements:
                    out.append("both")
                else:
                    out.append(on_campus_elements[0].text if on_campus_elements else online_elements[0].text if online_elements else "Nan")
                # 13  Link to the page
                out.append(soup.find('link', {'rel': 'canonical'}).get('href') if soup.find('link', {'rel': 'canonical'}) else "Nan")
                f.close()
                
                # Creating file .tsv
                l = ['courseName','universityName','facultyName','isItFullTime','description','startDate','fees','modality','duration',
                    'city','country','administration','url']
                with open(filename+'.tsv','w',encoding='utf-8') as tsv:
                    tsv_output = csv.writer(tsv, delimiter='\t')
                    tsv_output.writerow(l)
                    tsv_output.writerow(out)
    os.chdir('..')  

print("All HTML files have been read and all .tsv files have been generated.")

In [None]:
data=[]
# to merge all the .tsv files
for i in range(1,6001):
    # os.chdir(r'./HTML_folders/page_'+str(i)) #change directories
    os.chdir(r'/Users/armanfeili/Arman/Sapienza Courses/ADM/Homeworks/HW3/phase-3/ADM-HW3/HTML_folders/page_'+str(i)) #change directories
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".tsv"):
            a = pd.read_csv(filename,sep='\t')
            data.append(a)
    os.chdir('..')
data=pd.concat(data,ignore_index=True)   
data.to_csv('../dataset.tsv',sep='\t',index=False) # Saving the big one
print("dataset.tsv file has been generated as the main dataset.")

In [None]:
# An illustration to the dataset:
data.head(5)

## 2. Search Engine

In [3]:
# Load the dataset we are going to work with
data = pd.read_table(r"dataset.tsv")
# Drop NAs
data = data.dropna()

## 2.0 Preprocessing

### 2.0.0) Preprocessing the text


In [7]:
# This function takes a text, removes special cases, punctuations and stop-words then
# it applies stemming and finally returns the preprocessed words separated by commas.

def preprocess_description(description_text):

    # Handle the cases of float type (there are 24)
    if type(description_text) != str:
        return ""

    # Remove all special chars and punctuations
    description_text = re.sub("[^a-z A-Z ]+","", description_text)

    # Convert everything in lowercase
    description_text = description_text.lower()

    # Remove stopwords using nltk package
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(description_text)
    words = [word for word in words if word not in stop_words]

    # Apply stemming using ntlk package
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Separate words with commas
    words = ','.join(words)

    return words

In [8]:
# Create and fill new column where the preprocessed descriptions wil be stored:
data["clean_description"] = data["description"].apply(preprocess_description)

In [9]:
# Show different fields
data[["description", "clean_description"]].head(10)

Unnamed: 0,description,clean_description
0,3D visualisation and animation play a role in ...,"visualis,anim,play,role,mani,area,popular,medi..."
1,Businesses and governments rely on sound finan...,"busi,govern,reli,sound,financi,knowledg,underp..."
2,"Our Accounting, Accountability & Financial Man...","account,account,financi,manag,msc,cours,provid..."
3,Embark on a professional accounting career wit...,"embark,profession,account,career,academ,ground..."
4,Join us for an online session for prospective ...,"join,us,onlin,session,prospect,student,find,ms..."
5,The Advanced Chemical Engineering MSc at Leeds...,"advanc,chemic,engin,msc,leed,build,core,founda..."
6,Programme overviewThe Advanced Master in Finan...,"programm,overviewth,advanc,master,financi,mark..."
7,Programme overviewThe Advanced Master in Innov...,"programm,overviewth,advanc,master,innov,strate..."
8,Progress your career as a physiotherapist with...,"progress,career,physiotherapist,within,nh,priv..."
9,Goal of the pro­grammeWould you like to be inv...,"goal,programmewould,like,involv,find,solut,fut..."


### 2.0.1) Preprocessing the fees column

In [11]:
# This function was provided by ChatGPT.
# It downloads the latest exchange rates (wrt United States Dollars) from openexchangerates.org.
# It returns a dictionary of the form {exchange_code : exchange_rate}
def get_latest_exchange_rates(app_id):

    base_url = "https://openexchangerates.org/api/latest.json"
    params = {"app_id": app_id}
    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        return response.json().get("rates")
    else:
        print(f"Failed to fetch exchange rates. Status code: {response.status_code}")
        return None

In [14]:
# Load latest exchange rates
my_app_id = "1457fcd3d536441baad3ce7918b5025b"
exchange_rates_dict = get_latest_exchange_rates(my_app_id)

# Show exchange_rates_dict
count = 0
print("{")
for key, value in exchange_rates_dict.items():
    if count < 5:
        print(f'{key}: {value}')
        count += 1
    else:
        break
print("}")

{
AED: 3.672625
AFN: 70.5
ALL: 93.718087
AMD: 402.18
ANG: 1.801068
}


In [15]:
# Dictionary provided by ChatGPT, I changed some symbols that were not present.
# It will be used to convert a currency symbol into currency code.

currency_symbol_to_code = {
    '$': 'USD',  # United States Dollar
    '€': 'EUR',  # Euro
    "EURO": "EUR", # Euro
    '¥': 'JPY',  # Japanese Yen
    '£': 'GBP',  # British Pound Sterling
    'A$': 'AUD',  # Australian Dollar
    'C$': 'CAD',  # Canadian Dollar
    'CHF': 'CHF',  # Swiss Franc
    'KR': 'SEK',  # Swedish Krona
    'NZ$': 'NZD',  # New Zealand Dollar
    # Add more symbols and codes as needed
}

# This function handles the preprocessing of the "fees" field
def preprocess_fees(fees_text):

    # Handles the case of "nan"
    if type(fees_text) != str:
        return None

    # Preallocate all the fees found in text
    total_fees = []

    # Symbols we are looking for
    symbols = "GBP|USD|ISK|£|\$|₹|¥|₪|₽|₩|₦|₴|﷼|€|Euro"

    # Match the (symbol, number) case
    left_symbol_matches = re.findall(fr'(?:{symbols})+\s*[0-9]+[,.]?[0-9]*', fees_text, flags=re.IGNORECASE)

    # Match the (number,  symbol) case
    right_symbol_matches = re.findall(fr'[0-9]+[,.]?[0-9]*\s*(?:{symbols})+', fees_text, flags=re.IGNORECASE)

    # Merge them
    matches = left_symbol_matches + right_symbol_matches

    # If we got no matches returns None
    if len(matches) == 0 :
        return None

    for match in matches:
        # Remove "," or "." and change to the right type
        number = re.findall('([0-9]+)[.,]*([0-9]*)', match)[0]
        number = float(number[0] + number[1])

        # Isolate symbol and upper case it (to match exchange_rates_dict codes)
        symbol = re.findall(fr'(?i)({symbols})', match)[0].upper()

        # Transform symbol into code (if not already a code)
        if symbol in currency_symbol_to_code.keys():
            symbol = currency_symbol_to_code[symbol]

        # Change into USD using the exchange_rates dictionary and append to fees
        total_fees.append(number / exchange_rates_dict[symbol])

    # Take the max fee and return it
    max_fee = round(max(total_fees))
    return max_fee


In [16]:
# Save preprocessed fees into a new column, and show them
data["fees_USD"] = data["fees"].apply(preprocess_fees)
data[["fees", "fees_USD"]].head(10)

Unnamed: 0,fees,fees_USD
0,Please see the university website for further ...,
1,"UK: £18,000 (Total) International: £34,750 (To...",43810.0
2,Please see the university website for further ...,
3,Please see the university website for further ...,
4,Please see the university website for further ...,
5,"UK: £13,750 (Total) International: £31,000 (To...",39082.0
6,18.000 €,19693.0
7,18.000 €,19693.0
8,Please see the university website for further ...,
9,Tuition fee per year (non-EU/EEA students): 15...,16411.0


## 2.1. Conjunctive query

### 2.1.1) Create your index!


In [17]:
# Store all terms contained in the "clean_description" as a set
terms_set = set(','.join(data["clean_description"]).split(","))

# Create a dict that associate each term to a unique id
terms_id_dict = {key: value for value, key in enumerate(terms_set)}

In [18]:
# Here we create the inverted index dictionary.

# Preallocate a dictionary with the form: {term_id : []}
inverted_index = {i : [] for i in range(len(terms_id_dict))}

# Iterating over all terms and texts in the "clean_description" field
for i,text in enumerate(data["clean_description"]):

    text_set = set(text.split(","))

    for term in text_set:
        # Get term id
        term_id = terms_id_dict[term]

        # Add document id "i" to the term_id list
        inverted_index[term_id].append(i)

In [19]:
# Show inverted_index_dict structure
print("{")
count = 0
for key, value in inverted_index.items():
    if count < 5:
        print(f'{key}: {value}')
        count += 1
    else:
        break
print("}")

{
0: [28, 33, 38, 40, 81, 115, 187, 266, 302, 317, 344, 359, 417, 471, 472, 477, 503, 541, 552, 564, 579, 586, 587, 588, 755, 757, 759, 817, 845, 847, 921, 945, 961, 972, 996, 1087, 1107, 1117, 1118, 1141, 1142, 1148, 1163, 1173, 1183, 1215, 1248, 1250, 1313, 1395, 1435, 1437, 1476, 1486, 1487, 1489, 1491, 1676, 1677, 1678, 1692, 1700, 1701, 1705, 1754, 1771, 1783, 1785, 1831, 1834, 1855, 1862, 1872, 1893, 1934, 1946, 1959, 1960, 2015, 2029, 2067, 2108, 2112, 2129, 2142, 2168, 2169, 2195, 2218, 2224, 2232, 2243, 2249, 2255, 2257, 2268, 2300, 2336, 2354, 2370, 2386, 2479, 2484, 2524, 2560, 2629, 2648, 2649, 2650, 2652, 2654, 2655, 2674, 2678, 2679, 2685, 2693, 2694, 2718, 2724, 2743, 2747, 2751, 2769, 2815, 2912, 2913, 2951, 2963, 2970, 2978, 2995, 3013, 3017, 3018, 3044, 3046, 3109, 3122, 3133, 3157, 3161, 3198, 3254, 3264, 3276, 3300, 3308, 3313, 3342, 3396, 3427, 3443, 3445, 3450, 3461, 3487, 3521, 3526, 3531, 3552, 3574, 3577, 3586, 3588, 3592, 3601, 3627, 3640, 3662, 3665, 3687, 36

### 2.1.2) Execute the query

In [20]:
# This function takes a query as a input and returns the most affine docs:

def naive_search_engine(query):
    # Apply same preprocessing done for descriptions and split wrt ","
    query = preprocess_description(query).split(",")

    # For each term in query get all the docs ids that contain it as a set
    query_docs = [set(inverted_index[terms_id_dict[term]]) for term in query]

    # Select the docs ids that contain all the query term, and sort them
    query_docs = set.intersection(*query_docs)
    query_docs = list(sorted(query_docs))

    # Return selected columns of those docs
    result  = data.iloc[query_docs, [0,1,4,12]]

    return result

In [21]:
naive_search_engine("advanced knowledge")

Unnamed: 0,courseName,universityName,description,url
1,Accounting and Finance - MSc,University of Leeds,Businesses and governments rely on sound finan...,https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,Join us for an online session for prospective ...,https://www.findamasters.com/masters-degrees/c...
12,Analytical Toxicology MSc,King’s College London,The Analytical Toxicology MSc is a unique stud...,https://www.findamasters.com/masters-degrees/c...
48,Civil Engineering MSc,University of Greenwich,Meet the future demands of the construction in...,https://www.findamasters.com/masters-degrees/c...
86,Economics - MSc,University of Leeds,Our MSc Economics allows you to apply economic...,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...
5909,Master of Science/Postgraduate Diploma in Envi...,The Hong Kong University of Science and Techno...,The program is meant to meet the needs of prac...,https://www.findamasters.com/masters-degrees/c...
5937,Master Sociology – Social and Economic Psychology,University of Cologne,This programme provides you with:a solid found...,https://www.findamasters.com/masters-degrees/c...
5957,Masters in Economics,University of Lisbon,OBJECTIVESThe MSc in Economics aims to provide...,https://www.findamasters.com/masters-degrees/c...
5963,Master's in Global and European Politics,European School of Political and Social Scienc...,Europe and the EU in a changing worldOur inter...,https://www.findamasters.com/masters-degrees/c...


## 2.2) Conjunctive query & Ranking score

### 2.2.1) Inverted index


In [22]:
# Here we create the second inverted index dictionary.

# Preallocate a dictionary with the form: {term_id : [(doc_id, tfidf_score)]}
new_inverted_index = {i : [] for i in range(len(terms_id_dict))}

# Iterating over all terms and texts in the "clean_description" field
for i, text in enumerate(data["clean_description"]):

    text_list = text.split(",")

    # "set" here has the purpose of selecting unique terms only.
    for term in set(text_list):

        # Get term id
        term_id = terms_id_dict[term]

        # Get idf score: log( total number of documents / number of documents term is in )
        term_idf = np.log(len(data["clean_description"]) / len(inverted_index[term_id]))

        # Get tf score: number of times term appears in text / total number of terms in text
        term_tf =  text_list.count(term) / len(text_list)

        # Compute tfidf score
        term_tfidf = term_tf * term_idf

        # Update new_inverted_index list with a tuple
        new_inverted_index[term_id].append((i , term_tfidf))

In [23]:
# Show new_inverted_index structure
print("{")
count = 0
for key, value in new_inverted_index.items():
    if count < 5:
        print(f'{key}: {value}')
        count += 1
    else:
        break
print("}")

{
0: [(28, 0.06393913335324654), (33, 0.05201827798230227), (38, 0.05902073847991989), (40, 0.06820174224346298), (81, 0.09590870002986981), (115, 0.12787826670649308), (187, 0.05790713964067611), (266, 0.046501187893270214), (302, 0.06671909567295291), (317, 0.05683478520288581), (344, 0.061381568019116685), (359, 0.05580142547192425), (417, 0.05201827798230227), (471, 0.10960994289127979), (472, 0.10960994289127979), (477, 0.054804971445639894), (503, 0.07307329526085318), (541, 0.03300084302103048), (552, 0.07137391630129847), (564, 0.051151306682597236), (579, 0.06820174224346298), (586, 0.06671909567295291), (587, 0.050312760671407115), (588, 0.05902073847991989), (755, 0.09026701179281865), (757, 0.0538434807185234), (759, 0.06975178183990532), (817, 0.0538434807185234), (845, 0.061381568019116685), (847, 0.04580714031277364), (921, 0.05902073847991989), (945, 0.044479397115301945), (961, 0.0652995404458688), (972, 0.0626342530807313), (996, 0.042042169876107316), (1087, 0.060178

In the execution of queries  we will use the inverted index dictionary defined below, which transforms the list of tuples contained in new_inverted_index into a dictionary.
With this structure **tfidf(term_id, doc_id) = new_inverted_index_as_dict[term_id][doc_id]**

In [24]:
# Convert new_inverted_index values to dictionaries
new_inverted_index_as_dict = {key : dict(new_inverted_index[key]) for key in new_inverted_index.keys()}

In [25]:
# Show new_inverted_index_as_dict structure
print("{")
count = 0
for key, value in new_inverted_index_as_dict.items():
    if count < 5:
        print(f'{key}: {value}')
        count += 1
    else:
        break
print("}")

{
0: {28: 0.06393913335324654, 33: 0.05201827798230227, 38: 0.05902073847991989, 40: 0.06820174224346298, 81: 0.09590870002986981, 115: 0.12787826670649308, 187: 0.05790713964067611, 266: 0.046501187893270214, 302: 0.06671909567295291, 317: 0.05683478520288581, 344: 0.061381568019116685, 359: 0.05580142547192425, 417: 0.05201827798230227, 471: 0.10960994289127979, 472: 0.10960994289127979, 477: 0.054804971445639894, 503: 0.07307329526085318, 541: 0.03300084302103048, 552: 0.07137391630129847, 564: 0.051151306682597236, 579: 0.06820174224346298, 586: 0.06671909567295291, 587: 0.050312760671407115, 588: 0.05902073847991989, 755: 0.09026701179281865, 757: 0.0538434807185234, 759: 0.06975178183990532, 817: 0.0538434807185234, 845: 0.061381568019116685, 847: 0.04580714031277364, 921: 0.05902073847991989, 945: 0.044479397115301945, 961: 0.0652995404458688, 972: 0.0626342530807313, 996: 0.042042169876107316, 1087: 0.0601780078618791, 1107: 0.25575653341298615, 1117: 0.046501187893270214, 1118

## 2.2.2) Execute the query

We define our second search engine below

In [26]:
# This function computes from scratch cosine similarity
def cosine_similarity(vec1, vec2):

    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    # Handle division by zero
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0

    similarity = dot_product / (norm_vec1 * norm_vec2)
    return similarity

def  top_k_search_engine(query, k):

    # Apply same preprocessing done for descriptions and split wrt ","
    query = preprocess_description(query).split(",")

    # Get query terms ids
    query_terms_ids = [terms_id_dict[term] for term in set(query)]

    # Calculate for each term in query its tfidf score
    query_terms_idf = np.array([np.log(len(data["clean_description"]) / len(inverted_index[term_id])) for term_id in query_terms_ids])
    query_terms_tf =  np.array([query.count(term) / len(query) for term in set(query)])
    query_terms_tfidf = query_terms_idf * query_terms_tf

    # Create the tfidf representation of the query
    query_tfidf = np.zeros(len(terms_set))
    query_tfidf[query_terms_ids] = query_terms_tfidf

    # Get the indexes of docs that contain all terms in query using the first inverted index dict
    docs_ids = [set(inverted_index[term_id]) for term_id in query_terms_ids]
    appropriate_docs_ids = list(set.intersection(*docs_ids))

    # docs_tfidf will contain for each doc its tfidf vectorial representation.
    docs_tfidf = {i : [] for i in appropriate_docs_ids}

    # Iterate over docs containing all the words in query
    for doc_id in appropriate_docs_ids:

        # Initialize tdidf vectorial representation of doc
        doc_tfidf = np.zeros(len(terms_set))

        # Get doc as a list of preprocessed words
        doc = data["clean_description"].iloc[doc_id].split(",")

        # For each term
        for term in set(doc):
            # Get term_id
            term_id = terms_id_dict[term]

            # Store tfidf(term_id, doc_id) in doc vectorial representation
            doc_tfidf[term_id] = new_inverted_index_as_dict[term_id][doc_id]

        # Store doc vectorial representation
        docs_tfidf[doc_id] = doc_tfidf


    # Compute cosine similarities between query_tfidf and each doc_tfidf
    cos_sims = [cosine_similarity(query_tfidf, docs_tfidf[key]) for key in appropriate_docs_ids]


    # Select all appropriate_docs_ids from data and specified columns
    result  = data.iloc[appropriate_docs_ids, [0,1,4,12]]

    # Add the cosine similarity score and sort the dataframe
    result["cos_sim"] = cos_sims
    result = result.sort_values(by='cos_sim', ascending=False)

    # Get, if possible, just the top k
    if k < result.shape[0]:
        result = result[:k]

    return result

In [27]:
top_k_search_engine("advanced knowledge", k = 10)

Unnamed: 0,courseName,universityName,description,url,cos_sim
756,Advanced Computing MSc,King’s College London,Our Advanced Computing MSc provides knowledge ...,https://www.findamasters.com/masters-degrees/c...,0.361235
701,Advanced Clinical Practice MSc,University of Greenwich,Learn essential strategies and prepare for lea...,https://www.findamasters.com/masters-degrees/c...,0.333983
931,Advancing Practice - MSc,University of Northampton,Our MSc Advancing Practice awards support the ...,https://www.findamasters.com/masters-degrees/c...,0.32575
654,Advanced Clinical Practice - MSc,Canterbury Christ Church University,Gain the knowledge and skills needed to become...,https://www.findamasters.com/masters-degrees/c...,0.283256
830,Advanced Mechanical Engineering - MSc (Eng),University of Leeds,This course offers a broad range of advanced s...,https://www.findamasters.com/masters-degrees/c...,0.279986
897,Advanced Professional Practice (MSc),University of Gloucestershire,Our lecturers are research active experts who ...,https://www.findamasters.com/masters-degrees/c...,0.270317
653,Advanced Clinical Practice - MSc,University of Northampton,Our MSc Advanced Clinical Practice course aims...,https://www.findamasters.com/masters-degrees/c...,0.270123
786,Advanced Healthcare Practice - MSc,Cardiff University,Why study this courseOur MSc Advanced Healthca...,https://www.findamasters.com/masters-degrees/c...,0.267574
712,Advanced Clinical Practitioner - MSc,University of Sunderland,The MSc Advanced Clinical Practitioner is a hi...,https://www.findamasters.com/masters-degrees/c...,0.251893
892,Advanced Practice in Healthcare MSc,University of Liverpool,Explore specialist areas of practice in-depth ...,https://www.findamasters.com/masters-degrees/c...,0.250338


## 3. Define a new score!
Now it's your turn: build a new metric to rank MSc degrees.

Practically:

The user will enter a text query. As a starting point, get the query-related documents by exploiting the search engine of Step 2.1.
Once you have the documents, you need to sort them according to your new score. In this step, you won't have any more to take into account just the description field of the documents; you can use also the remaining variables in your dataset (or new possible variables that you can create from the existing ones or scrape again from the original web-pages). You must use a heap data structure (you can use Python libraries) for maintaining the top-k documents.
N.B.: You have to define a scoring function, not a filter!

The output, must contain:

courseName
universityName
description
URL
The new similarity score of the documents with respect to the query
Are the results you obtain better than with the previous scoring function? Explain and compare results.

### Answer

To develop our customized search engine, we opted to gather supplementary information for each university.
Recognizing that the selection of the "ideal" university is influenced by the city in which one resides, we chose to extract scores reflecting the quality of life in each city.

The following function used the [Teleport](https://teleport.org/) API to retrieve, for each city in our Data, three scores:
- Education score;
- Safety score;
- Cost of living score.

All of them are normalized in $[0,10]$.
When a city was not integrated into the teleport API, our solution was to obtain the scores associated with the capital of the country in which the city is located. In order to do so we will use the country field of our data in union with the "country_capital_dict", which maps each country to its capital.

In [28]:
# Load country_capital_dict
with open("country_capital_dict.json", 'r') as json_file:
    country_capital_dict = json.load(json_file)

# This function returns life quality scores for each city
def get_scores_out_of_ten(city):
    # Handle different data types
    city = str(city)

    try:
        # Try to get URL of the initial city
        base_url = f"https://api.teleport.org/api/urban_areas/slug:{city.lower()}/scores/"
        response = requests.get(base_url)

        # If city not found, retrieve URL of capital city
        if response.status_code == 404:

            # Retrieve the country of the city
            country = data[data["city"] == city]["country"].iloc[0]

            # Retrieve capital city from  the country
            capital_city = country_capital_dict[country]

            # Try to get URL of the capital city
            base_url = f"https://api.teleport.org/api/urban_areas/slug:{capital_city.lower()}/scores/"
            response = requests.get(base_url)

            # If capital city not found, return error
            if response.status_code == 404:
                return ("City not found", "City not found", "City not found")

        # Retrieve scores from URL
        datas = response.json()

        # Retrieve all categories scores
        categories = datas['categories']
        scores = {category["name"] : float(category["score_out_of_10"]) for  category in categories}

        # Return Cost of Living, Education and Safety
        return (scores["Cost of Living"],
                scores["Education"],
                scores["Safety"])


    except requests.exceptions.RequestException as e:
        return {'error': f'Request to Teleport API failed: {e}'}

In [29]:
# Create three new columns for our new scores (around 10m to run)
data[['Cost_of_Living', 'Education', 'Safety']] = data.city.apply(lambda x: pd.Series(get_scores_out_of_ten(x)))

In [30]:
# Show the new columns
data[['city','Cost_of_Living', 'Education', 'Safety']].head(10)

Unnamed: 0,city,Cost_of_Living,Education,Safety
0,Glasgow,5.623,5.3065,7.496
1,Leeds,5.363,4.9825,7.731
2,London,3.94,9.027,7.2435
3,Reading,3.94,9.027,7.2435
4,London,3.94,9.027,7.2435
5,Leeds,5.363,4.9825,7.731
6,Brussels,4.477,6.653,6.703
7,Brussels,4.477,6.653,6.703
8,Glasgow,5.623,5.3065,7.496
9,Helsinki,4.121,5.4545,8.674


Those scores will affect the importance our custom search engine gives to each university.
As an initial and naive approach we decided to keep thinks linear and just compute the new score as shown below:

\begin{equation}
NewScore = \frac{CosSimilarity \cdot Education \cdot Safety }{LivingCost}
\end{equation}


The following function follows the one already shown previously but orders the result with respect to the new score we just defined.

In [207]:
data = pd.read_table("new_dataset.tsv")

In [208]:
def  custom_search_engine(query, k):

    # Apply same preprocessing done for descriptions and split wrt ","
    query = preprocess_description(query).split(",")

    # Get query terms ids
    query_terms_ids = [terms_id_dict[term] for term in set(query)]

    # Calculate for each term in query its tfidf score
    query_terms_idf = np.array([np.log(len(data["clean_description"]) / len(inverted_index[term_id])) for term_id in query_terms_ids])
    query_terms_tf =  np.array([query.count(term) / len(query) for term in set(query)])
    query_terms_tfidf = query_terms_idf * query_terms_tf

    # Create the tfidf representation of the query
    query_tfidf = np.zeros(len(terms_set))
    query_tfidf[query_terms_ids] = query_terms_tfidf

    # Get the indexes of docs that contain all terms in query using the first inverted index dict
    docs_ids = [set(inverted_index[term_id]) for term_id in query_terms_ids]
    appropriate_docs_ids = list(set.intersection(*docs_ids))

    # docs_tfidf will contain for each doc its tfidf vectorial representation.
    docs_tfidf = {i : [] for i in appropriate_docs_ids}

    # Iterate over docs containing all the words in query
    for doc_id in appropriate_docs_ids:

        # Initialize tdidf vectorial representation of doc
        doc_tfidf = np.zeros(len(terms_set))

        # Get doc as a list of preprocessed words
        doc = data["clean_description"].iloc[doc_id].split(",")

        # For each term
        for term in set(doc):
            # Get term_id
            term_id = terms_id_dict[term]

            # Store tfidf(term_id, doc_id) in doc vectorial representation
            doc_tfidf[term_id] = new_inverted_index_as_dict[term_id][doc_id]

        # Store doc vectorial representation
        docs_tfidf[doc_id] = doc_tfidf


    # Compute cosine similarities between query_tfidf and each doc_tfidf
    cos_sims = [cosine_similarity(query_tfidf, docs_tfidf[key]) for key in appropriate_docs_ids]


    # Select all appropriate_docs_ids from data and specified columns
    result  = data.iloc[appropriate_docs_ids, [0,1,4,12, -3, -2, -1]]

    # Add the cosine similarity score
    result["Cos_sim"] = cos_sims

    # Compute our custom score
    our_score = result["Cos_sim"] * result["Education"] * result["Safety"] / result["Cost_of_Living"]

    # Add the cosine similarity score 
    result["Score"] = our_score

    # Convert the DataFrame to a dictionary
    diz = result.to_dict()

    # Mantaining the top k-university into an heap
    l_heap = []
    # Iterate over the keys (indices) and values (scores) 
    for i,j in zip(diz["courseName"].keys(),diz["Score"].values()):
        l_1=[]
        l_1.append(j) # Append the score
        l_1.append(i) # Append the index (university ID)
        heapq.heappush(l_heap,l_1) # Push the pair (score, index) onto the heap

    doc=[] # List to store the indices of the top K univerisities
    score=[] # List to store the corresponding scores of the top K universities

    # Retrieve the top K elements from the heap
    for i in heapq.nlargest(k,l_heap):
        doc.append(i[1])
        score.append(i[0])

    # Select rows from the result datafram corresponding to the top K universities
    df_heap = result.loc[doc]

    # Return the DataFrame containing information about the top K universities
    return df_heap

In [209]:
custom_search_engine("advanced knowledge", k = 10)

Unnamed: 0,courseName,universityName,description,url,Cost_of_Living,Education,Safety,Cos_sim,Score
5820,Masters in Hospitality Management,Ecole hotelière de Lausanne,"With our Master in Hospitality Management, you...",https://www.findamasters.com/masters-degrees/c...,1.0,6.622,8.0325,0.113095,6.015666
738,Advanced Computing MSc,King’s College London,Our Advanced Computing MSc provides knowledge ...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.361235,5.994941
685,Advanced Clinical Practice MSc,University of Greenwich,Learn essential strategies and prepare for lea...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.333983,5.542688
906,Advancing Practice - MSc,University of Northampton,Our MSc Advancing Practice awards support the ...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.32575,5.406058
639,Advanced Clinical Practice - MSc,Canterbury Christ Church University,Gain the knowledge and skills needed to become...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.283256,4.700829
875,Advanced Professional Practice (MSc),University of Gloucestershire,Our lecturers are research active experts who ...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.270317,4.486097
638,Advanced Clinical Practice - MSc,University of Northampton,Our MSc Advanced Clinical Practice course aims...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.270123,4.482883
696,Advanced Clinical Practitioner - MSc,University of Sunderland,The MSc Advanced Clinical Practitioner is a hi...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.251893,4.180335
607,Advanced Biomedical Engineering - MSc,University of Bradford,Biomedical engineering is a fast evolving inte...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.243003,4.0328
655,Advanced Clinical Practice (AHP) - MSc/PGDip/P...,Bangor University,The programme has been developed to enhance pr...,https://www.findamasters.com/masters-degrees/c...,3.94,9.027,7.2435,0.230842,3.830989


## 4. Visualizing the most relevant MSc degrees

In [210]:
# Select only the data from the search engine above
todisplay = custom_search_engine("advanced knowledge", k = 40)

In [211]:
# First we create a variable where we store the cities and their corresponding country
our_places = set(zip(data["city"],data["country"]))

# Retrieve the coordinates
coordinates = []

for city, country in our_places:
    location = geolocator.geocode(f"{city}, {country}")
    if location is not None:
        coordinates.append((city, country, location.latitude, location.longitude))

# Initialize the columns to None
data["latitude"] = None
data["longitude"] = None

# Let's put them into our original dataframe
for city, country, latitude, longitude in coordinates:
    mask = (data["city"] == city) & (data["country"] == country)
    data.loc[mask, "latitude"] = latitude
    data.loc[mask, "longitude"] = longitude

# Selecting the right indexes
places = data.iloc[list(todisplay.index),:]

In [222]:
# This is a key from mapbox.com necessary to use the style "open-street-map"
px.set_mapbox_access_token("pk.eyJ1Ijoic3VzYW5uYWJyYXZpIiwiYSI6ImNscGQ3bXR5eTEwamoya3FyaTBjbWt2N2wifQ.h95oGTEioKMBjxmsSAF3yw")

# Plotting the results
fig = px.scatter_mapbox(places, lat="latitude", lon="longitude", hover_name="courseName", hover_data=["fees_USD","Safety","Education","Cost_of_Living","courseName","universityName","facultyName"], color = "fees_USD",
                         zoom=5, height=600, mapbox_style="open-street-map")
fig.show()

We also add a screen of our map in case GitHub does not supports interactive maps
![Immagine](map.png)