In [None]:
# Libraries
import numpy as np
import pandas as pd
import csv
from bs4 import BeautifulSoup
import requests
import re
import os
from urllib.request import urlopen
import time
from concurrent.futures import ThreadPoolExecutor

import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from collections import Counter
from functools import reduce
import json

# 1. Data collection

## 1.1. Get the list of master's degree courses
We created a file named 'urls.txt' that contains all the urls associated with the url of each master page.
for reaching such purpose, we iterate over all 400 pages and took the link for every 15 urls of each page.
we stored all urls in 'urls.txt' file.

In [None]:
f = open("urls.txt","w") # First we create a txt file where we can write the URLs  #  w means writing mode
for i in range(1, 401): #from first page to page 400
    url = f"https://www.findamasters.com/masters-degrees/msc-degrees/?PG={i}" #pages can be scrolled by changing the number after PG
    result = requests.get(url) # as we have done in class
    soup = BeautifulSoup(result.text, 'html.parser') # to get the html of each page

    for link in soup.find_all(class_ = re.compile('courseLink')): #as in class to get each tag of the page which belongs to class courseLink
        c = (link.get("href"))  # url of each page in the i-th page
        f.write("https://www.findamasters.com/"+c) #writing the rows
        f.write("\n")
f.close()
print('The "urls.txt" file is generated!')

### 1.2. Crawl master's degree pages

We wrote a function named 'download_url'.
Since the FindMaster website blocks us for 70 seconds for every (20 to 22) requests we send, we use 'time.sleep(70)' to wait and then resend the http get request. 
we also omit to download the http files that their directory are already existed.

for sending http get requests asynchronously, we can use async and await methods and take the advantage of using "aiohttp" library. the other way is to use ThreadPoolExecutor function executer. 
It means that we store the executer command in a variable named 'future_to_url' that we are able to call in the future.
The ThreadPoolExecutor is a built-in Python module that provides managing a pool of worker threads. It allows us to submit tasks to the pool, which are then executed by one of the worker threads in the pool.

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Function to download and save HTML for a given URL
def download_url(url, folder_path, page_number):
    # Create a folder for each page if it doesn't exist
    page_folder = os.path.join(folder_path, f"page_{page_number}")
    if os.path.exists(page_folder):
        # uncomment the below code to see which pages are skiped, cause they have already been downloaded.
        # print(f"Skipping Page: {page_number} - Folder already exists.")
        return

    try:
        response = requests.get(url) # Send a GET request to the URL
        response.raise_for_status()  # Raise an exception for bad responses 

        # Create a folder for each page if it doesn't exist
        os.makedirs(page_folder, exist_ok=True)

        # Save the HTML content to a file
        file_path = os.path.join(page_folder, f"html_{page_number}.html")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print(f"Downloaded page {page_number}: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download page {page_number}: {url}")
        print(f"Error: {e}")
        print("Retrying in 70 seconds...")
        time.sleep(70)  # Wait for 10 seconds before retrying
        download_url(url, folder_path, page_number)  # Retry the download

# Read all URLs one by one
with open('urls.txt', 'r') as urls_file:
    urls = urls_file.read().splitlines()

output_folder = 'HTML_folders' # Store all HTML files into this directory.

# We can use ThreadPoolExecutor for sending http requests asynchronously. 
# However, Since the FindMaster website blocks us for 70 seconds for every (20 to 22) requests we send, 
# the max_workers in below code assigned to number 1. So it sends requests synchronously.
with ThreadPoolExecutor(max_workers=1) as executor:
    # Enumerate through each URL and submit download tasks to the executor
    future_to_url = {executor.submit(download_url, url, output_folder, page_number): url for page_number, url in enumerate(urls, start=1)}

print("All HTML files are stored in the HTML_folders directory.")

### 1.3 Parse downloaded pages
Here we create a '.tsv' file including the following columns for each of the HTML files.

1. Course Name (to save as ```courseName```): string;
2. University (to save as ```universityName```): string;
3. Faculty (to save as ```facultyName```): string
4. Full or Part Time (to save as ```isItFullTime```): string;
5. Short Description (to save as ```description```): string;
6. Start Date (to save as ```startDate```): string;
7. Fees (to save as ```fees```): string;
8. Modality (to save as ```modality```):string;
9. Duration (to save as ```duration```):string;
10. City (to save as ```city```): string;
11. Country (to save as ```country```): string;
12. Presence or online modality (to save as ```administration```): string;
13. Link to the page (to save as ```url```): string.

Then, we merge all those files together to generate our final dataset.

In [None]:
current_path = os.getcwd()
# '/Users/armanfeili/Arman/Sapienza Courses/ADM/Homeworks/HW3/phase-2/ADM-HW3/HTML_folders'

for i in range(1,6001):
    # os.chdir(r'C:\Users\susan\Documents\DS\ADM\HW3\ADM-HW3\HTML_folders\page_'+str(i)) #change directories
    os.chdir(r'/Users/armanfeili/Arman/Sapienza Courses/ADM/Homeworks/HW3/phase-3/ADM-HW3/HTML_folders/page_'+str(i)) #change directories
    
    for filename in os.listdir(os.getcwd()): # get all the files in a folder
        if filename.endswith(".tsv"): continue # tsv file is already generated.
        elif filename.endswith(".html"): # if file extension is .html
            with open(os.path.join(os.getcwd(), filename), 'r',encoding='utf-8') as f: # open each file into a folder
                soup = BeautifulSoup(f,'html.parser') # get the html file by each file 
                out=[] # initialize a list where we append all the informations parsed from each html file

                # 1  Course Name
                courseName = soup.find_all(class_=re.compile("course-header__course-title"))
                out.append(courseName[0].text.strip() if courseName else "") #text.strip to eliminate strange simbols for the space
                # 2  University
                universityName = soup.find_all(class_=re.compile("course-header__institution"))
                out.append(universityName[0].text if universityName else "")
                # 3  Faculty
                facultyName = soup.find_all(class_=re.compile("course-header__department"))
                out.append(facultyName[0].text if facultyName else "")
                # 4  Full or Part Time
                isItFullTime = soup.find_all(class_=re.compile("concealLink"))
                out.append(isItFullTime[0].text if isItFullTime else "")
                # 5  Short Description
                description = soup.find_all(class_=re.compile("course-sections__content"))
                out.append(description[0].text.replace('\n', '') if description else "")
                # 6  Start Date
                startDate = soup.find_all(class_=re.compile("key-info__start-date"))
                out.append(startDate[0].text if startDate else "")
                # 7  Fees 
                fees_elements = soup.find_all(class_=re.compile("course-sections__fees")) # taking the fee
                fees_text = fees_elements[0].text.replace('\n', '') if fees_elements else "" 
                cleaned_fees = re.sub(r'Fees', '', fees_text)  # To not "Fees" at the beginning 
                out.append(cleaned_fees.strip() if cleaned_fees else "")
                # 8  Modality
                modality = soup.find_all(class_=re.compile("key-info__qualification"))
                out.append(modality[0].text if modality else "")
                # 9  Duration
                duration = soup.find_all(class_=re.compile("key-info__duration"))
                out.append(duration[0].text if duration else "")
                # 10  City
                city = soup.find_all(class_=re.compile("course-data__city"))
                out.append(city[0].text if city else "")
                # 11  Country
                country = soup.find_all(class_=re.compile("course-data__country"))
                out.append(country[0].text if country else "")
                # 12  Presence or online modality
                # We have seen that some courses has both online or oncampus modality, one of them is "Master of Business Administration"
                on_campus_elements = soup.find_all(class_=re.compile("course-data__on-campus"))
                online_elements = soup.find_all(class_=re.compile("course-data__online"))
                if on_campus_elements and online_elements:
                    out.append("both")
                else:
                    out.append(on_campus_elements[0].text if on_campus_elements else online_elements[0].text if online_elements else "Nan")
                # 13  Link to the page
                out.append(soup.find('link', {'rel': 'canonical'}).get('href') if soup.find('link', {'rel': 'canonical'}) else "Nan")
                f.close()
                
                # Creating file .tsv
                l = ['courseName','universityName','facultyName','isItFullTime','description','startDate','fees','modality','duration',
                    'city','country','administration','url']
                with open(filename+'.tsv','w',encoding='utf-8') as tsv:
                    tsv_output = csv.writer(tsv, delimiter='\t')
                    tsv_output.writerow(l)
                    tsv_output.writerow(out)
    os.chdir('..')  

print("All HTML files have been read and all .tsv files have been generated.")

In [None]:
data=[]
# to merge all the .tsv files
for i in range(1,6001):
    # os.chdir(r'./HTML_folders/page_'+str(i)) #change directories
    os.chdir(r'/Users/armanfeili/Arman/Sapienza Courses/ADM/Homeworks/HW3/phase-3/ADM-HW3/HTML_folders/page_'+str(i)) #change directories
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".tsv"):
            a = pd.read_csv(filename,sep='\t')
            data.append(a)
    os.chdir('..')
data=pd.concat(data,ignore_index=True)   
data.to_csv('../dataset.tsv',sep='\t',index=False) # Saving the big one
print("dataset.tsv file has been generated as the main dataset.")

In [None]:
# An illustration to the dataset:
data.head(5)

## 2. Search Engine

In [119]:
# Load the dataset we are going to work with
data = pd.read_table(r"dataset.tsv")

# Count and show NAs
np.sum(data.isna())

courseName         24
universityName     24
facultyName        24
isItFullTime       24
description        24
startDate          24
fees              146
modality           24
duration           24
city               24
country            24
administration      0
url                 0
dtype: int64

In [120]:
# Drop NAs
data = data.dropna()
data

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,Businesses and governments rely on sound finan...,September,"UK: £18,000 (Total) International: £34,750 (To...",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
2,"Accounting, Accountability & Financial Managem...",King’s College London,King’s Business School,Full time,"Our Accounting, Accountability & Financial Man...",September,Please see the university website for further ...,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
3,"Accounting, Financial Management and Digital B...",University of Reading,Henley Business School,Full time,Embark on a professional accounting career wit...,September,Please see the university website for further ...,MSc,1 year full time,Reading,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",Full time,Join us for an online session for prospective ...,September,Please see the university website for further ...,MSc,One year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,Materials and Molecular Modelling MSc,University College London,Department of Chemistry,Full time,Register your interest in graduate study at UC...,September,"Full time - £14,100",MSc,1 year full time,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5996,Materials Chemistry - MSc,University of Bradford,Faculty of Life Sciences,Full time,We provide a unique Master’s education in Mate...,September,Please see the university website for further ...,MSc,1 year full time,Bradford,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5997,Materials Chemistry MSc,University of Edinburgh,School of Chemistry,Full time,Programme descriptionMaterials Chemistry has e...,September,Tuition fees vary between degree programmes. F...,MSc,1 year full-time,Edinburgh,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5998,Materials Engineering,University of Padua,School of Engineering,Full time,The Master's degree Materials Engineering is a...,October,Our tuition fees will not exceed 2700 euros pe...,MSc,2 years,Padua,Italy,On Campus,https://www.findamasters.com/masters-degrees/c...


## 2.0 Preprocessing

### 2.0.0) Preprocessing the text


In [139]:
# This function takes a text, removes special cases, punctuations and stop-words then
# it applies stemming and finally returns the preprocessed words separated by commas.

def preprocess_description(description_text):

    # Handle the cases of float type (there are 24)
    if type(description_text) != str:
        return ""

    # Remove all special chars and punctuations
    description_text = re.sub("[^a-z A-Z ]+","", description_text)

    # Convert everything in lowercase
    description_text = description_text.lower()

    # Remove stopwords using nltk package
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(description_text)
    words = [word for word in words if word not in stop_words]

    # Apply stemming using ntlk package
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Separate words with commas
    words = ','.join(words)

    return words

In [140]:
# Create and fill new column where the preprocessed descriptions wil be stored:
data["clean_description"] = data["description"].apply(preprocess_description)

In [141]:
# Show different fields
data[["description", "clean_description"]].head(10)

Unnamed: 0,description,clean_description
0,3D visualisation and animation play a role in ...,"visualis,anim,play,role,mani,area,popular,medi..."
1,Businesses and governments rely on sound finan...,"busi,govern,reli,sound,financi,knowledg,underp..."
2,"Our Accounting, Accountability & Financial Man...","account,account,financi,manag,msc,cours,provid..."
3,Embark on a professional accounting career wit...,"embark,profession,account,career,academ,ground..."
4,Join us for an online session for prospective ...,"join,us,onlin,session,prospect,student,find,ms..."
5,The Advanced Chemical Engineering MSc at Leeds...,"advanc,chemic,engin,msc,leed,build,core,founda..."
6,Programme overviewThe Advanced Master in Finan...,"programm,overviewth,advanc,master,financi,mark..."
7,Programme overviewThe Advanced Master in Innov...,"programm,overviewth,advanc,master,innov,strate..."
8,Progress your career as a physiotherapist with...,"progress,career,physiotherapist,within,nh,priv..."
9,Goal of the pro­grammeWould you like to be inv...,"goal,programmewould,like,involv,find,solut,fut..."


### 2.0.1) Preprocessing the fees column

In [142]:
# This function was provided by ChatGPT.
# It downloads the latest exchange rates (wrt United States Dollars) from openexchangerates.org.
# It returns a dictionary of the form {exchange_code : exchange_rate}
def get_latest_exchange_rates(app_id):

    base_url = "https://openexchangerates.org/api/latest.json"
    params = {"app_id": app_id}
    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        return response.json().get("rates")
    else:
        print(f"Failed to fetch exchange rates. Status code: {response.status_code}")
        return None


# Dictionary provided by ChatGPT, I changed some symbols that were not present.
# It will be used to convert a currency symbol into currency code.

currency_symbol_to_code = {
    '$': 'USD',  # United States Dollar
    '€': 'EUR',  # Euro
    "EURO": "EUR", # Euro
    '¥': 'JPY',  # Japanese Yen
    '£': 'GBP',  # British Pound Sterling
    'A$': 'AUD',  # Australian Dollar
    'C$': 'CAD',  # Canadian Dollar
    'CHF': 'CHF',  # Swiss Franc
    'KR': 'SEK',  # Swedish Krona
    'NZ$': 'NZD',  # New Zealand Dollar
    # Add more symbols and codes as needed
}

# This function handles the preprocessing of the "fees" field
def preprocess_fees(fees_text):

    # Handles the case of "nan"
    if type(fees_text) != str:
        return None

    # Preallocate all the fees found in text
    total_fees = []

    # Symbols we are looking for
    symbols = "GBP|USD|ISK|£|\$|₹|¥|₪|₽|₩|₦|₴|﷼|€|Euro"

    # Match the (symbol, number) case
    left_symbol_matches = re.findall(fr'(?:{symbols})+\s*[0-9]+[,.]?[0-9]*', fees_text, flags=re.IGNORECASE)

    # Match the (number,  symbol) case
    right_symbol_matches = re.findall(fr'[0-9]+[,.]?[0-9]*\s*(?:{symbols})+', fees_text, flags=re.IGNORECASE)

    # Merge them
    matches = left_symbol_matches + right_symbol_matches

    # If we got no matches returns None
    if len(matches) == 0 :
        return None

    for match in matches:
        # Remove "," or "." and change to the right type
        number = re.findall('([0-9]+)[.,]*([0-9]*)', match)[0]
        number = float(number[0] + number[1])

        # Isolate symbol and upper case it (to match exchange_rates_dict codes)
        symbol = re.findall(fr'(?i)({symbols})', match)[0].upper()

        # Transform symbol into code (if not already a code)
        if symbol in currency_symbol_to_code.keys():
            symbol = currency_symbol_to_code[symbol]

        # Change into USD using the exchange_rates dictionary and append to fees
        total_fees.append(number / exchange_rates_dict[symbol])

    # Take the max fee and return it
    max_fee = round(max(total_fees))
    return max_fee


In [143]:
# Load latest exchange rates
my_app_id = "1457fcd3d536441baad3ce7918b5025b"
exchange_rates_dict = get_latest_exchange_rates(my_app_id)

# Show exchange_rates_dict
count = 0
print("{")
for key, value in exchange_rates_dict.items():
    if count < 5:
        print(f'{key}: {value}')
        count += 1
    else:
        break
print("}")

{
AED: 3.67281
AFN: 70.205771
ALL: 94.659415
AMD: 402.028287
ANG: 1.801361
}


In [144]:
exchange_rates_dict

{'AED': 3.67281,
 'AFN': 70.205771,
 'ALL': 94.659415,
 'AMD': 402.028287,
 'ANG': 1.801361,
 'AOA': 829.905333,
 'ARS': 355.966893,
 'AUD': 1.52512,
 'AWG': 1.768,
 'AZN': 1.7,
 'BAM': 1.786984,
 'BBD': 2,
 'BDT': 110.692829,
 'BGN': 1.791948,
 'BHD': 0.376747,
 'BIF': 2845.177181,
 'BMD': 1,
 'BND': 1.336464,
 'BOB': 6.906665,
 'BRL': 4.899224,
 'BSD': 1,
 'BTC': 2.7156302e-05,
 'BTN': 83.285679,
 'BWP': 13.41654,
 'BYN': 3.292918,
 'BZD': 2.014674,
 'CAD': 1.370147,
 'CDF': 2575.361714,
 'CHF': 0.883671,
 'CLF': 0.031526,
 'CLP': 869.936087,
 'CNH': 7.142968,
 'CNY': 7.099033,
 'COP': 4057.225474,
 'CRC': 530.494682,
 'CUC': 1,
 'CUP': 25.75,
 'CVE': 100.988416,
 'CZK': 22.474885,
 'DJF': 178.062377,
 'DKK': 6.832836,
 'DOP': 56.882573,
 'DZD': 134.093675,
 'EGP': 30.902213,
 'ERN': 15,
 'ETB': 55.801723,
 'EUR': 0.916482,
 'FJD': 2.24175,
 'FKP': 0.797519,
 'GBP': 0.797519,
 'GEL': 2.705,
 'GGP': 0.797519,
 'GHS': 11.95637,
 'GIP': 0.797519,
 'GMD': 67.25,
 'GNF': 8613.139488,
 'GT

In [145]:
# Save preprocessed fees into a new column
data["fees_USD"] = data["fees"].apply(preprocess_fees)

## 2.1. Conjunctive query

### 2.1.1) Create your index!


In [146]:
# Store all terms contained in the "clean_description" as a set
terms_set = set(','.join(data["clean_description"]).split(","))

# Create a dict that associate each term to a unique id
terms_id_dict = {key: value for value, key in enumerate(terms_set)}

In [148]:
# Here we create the inverted index dictionary.

# Preallocate a dictionary with the form: {term_id : []}
inverted_index_dict = {i : [] for i in range(len(terms_id_dict))}

# Iterating over all terms and texts in the "clean_description" field
for i,text in enumerate(data["clean_description"]):
    text_list = text.split(",")
    for term in text_list:

        # Get term id
        term_id = terms_id_dict[term]

        # Add document id "i" to the term_id list
        inverted_index_dict[term_id].append(i)

In [149]:
# Show inverted_index_dict structure
print("{")
count = 0
for key, value in inverted_index_dict.items():
    if count < 5:
        print(f'{key}: {value}')
        count += 1
    else:
        break
print("}")

{
0: [1576, 2307, 4459]
1: [3177]
2: [55, 178, 194, 194, 223, 320, 325, 396, 737, 1272, 1299, 1304, 1570, 1911, 1912, 2068, 2085, 2164, 2437, 2460, 2465, 2623, 2708, 2859, 2860, 2900, 2947, 3130, 3380, 3617, 3710, 3829, 3906, 4144, 4150, 4210, 4397, 4514, 4661, 4691, 4695, 4797, 4839, 4986, 5054, 5082, 5385, 5388, 5630, 5633, 5638, 5760]
3: [4361]
4: [5153]
}


### 2.1.2) Execute the query

In [150]:
# This function takes a query as a input and returns the most affine docs:

def naive_search_engine(query):
    # Apply same preprocessing done for descriptions and split wrt ","
    query = preprocess_description(query).split(",")

    # For each term in query get all the docs ids that contain it as a set
    query_docs = [set(inverted_index_dict[terms_id_dict[term]]) for term in query]

    # Select the docs ids that contain all the query term, and sort them
    query_docs = set.intersection(*query_docs)
    query_docs = list(sorted(query_docs))

    # Return selected columns of those docs
    result  = data.iloc[query_docs, [0,1,4,12]]

    return result

In [151]:
naive_search_engine("advanced knowledge")

Unnamed: 0,courseName,universityName,description,url
1,Accounting and Finance - MSc,University of Leeds,Businesses and governments rely on sound finan...,https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,Join us for an online session for prospective ...,https://www.findamasters.com/masters-degrees/c...
12,Analytical Toxicology MSc,King’s College London,The Analytical Toxicology MSc is a unique stud...,https://www.findamasters.com/masters-degrees/c...
48,Civil Engineering MSc,University of Greenwich,Meet the future demands of the construction in...,https://www.findamasters.com/masters-degrees/c...
86,Economics - MSc,University of Leeds,Our MSc Economics allows you to apply economic...,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...
5909,Master of Science/Postgraduate Diploma in Envi...,The Hong Kong University of Science and Techno...,The program is meant to meet the needs of prac...,https://www.findamasters.com/masters-degrees/c...
5937,Master Sociology – Social and Economic Psychology,University of Cologne,This programme provides you with:a solid found...,https://www.findamasters.com/masters-degrees/c...
5957,Masters in Economics,University of Lisbon,OBJECTIVESThe MSc in Economics aims to provide...,https://www.findamasters.com/masters-degrees/c...
5963,Master's in Global and European Politics,European School of Political and Social Scienc...,Europe and the EU in a changing worldOur inter...,https://www.findamasters.com/masters-degrees/c...


## 2.2) Conjunctive query & Ranking score

### 2.2.1) Inverted index


In [152]:
# Here we create the second inverted index dictionary.

# Preallocate a dictionary with the form: {term_id : []}
inverted_index_dict_with_scores = {i : [] for i in range(len(terms_id_dict))}

# Iterating over all terms and texts in the "clean_description" field
for i,text in enumerate(data["clean_description"]):
    text_list = text.split(",")

    # Set here has the purpose of selecting unique terms only.
    # We don't want to insert in inverted_2 dict multiple times the same score
    for term in set(text_list):

        # Get term id
        term_id = terms_id_dict[term]

        # Get idf score: total number of documents / number of documents term is in
        term_idf = np.log(len(data["clean_description"]) / len(inverted_index_dict[term_id]))

        # Get tf score: number of times term appears in text / total number of terms in text
        term_tf =  sum([word == term for word in text_list]) / len(text_list)

        # Compute tfidf score
        term_tfidf = term_tf * term_idf

        # Update new_inverted list
        inverted_index_dict_with_scores[term_id].append((i , term_tfidf))

In [153]:
# Show inverted_index_dict_with_scores structure
print("{")
count = 0
for key, value in inverted_index_dict_with_scores.items():
    if count < 5:
        print(f'{key}: {value}')
        count += 1
    else:
        break
print("}")

{
0: [(1576, 0.10980098809541626), (2307, 0.11479194209975337), (4459, 0.12025822505688447)]
1: [(3177, 0.1807266764010798)]
2: [(55, 0.05429467527207361), (178, 0.08747475460500748), (194, 0.13496104996201153), (223, 0.08287082015211235), (320, 0.1968181978612668), (325, 0.07872727914450672), (396, 0.07743666801099024), (737, 0.08144201290811041), (1272, 0.10496970552600898), (1299, 0.10496970552600898), (1304, 0.06845850360391889), (1570, 0.08588430452128007), (1911, 0.08287082015211235), (1912, 0.08287082015211235), (2068, 0.09447273497340808), (2085, 0.08006163980797294), (2164, 0.08747475460500748), (2437, 0.08912522167302649), (2460, 0.1005029095461788), (2465, 0.08006163980797294), (2623, 0.05248485276300449), (2708, 0.1073553806516001), (2859, 0.07267133459492929), (2860, 0.07267133459492929), (2900, 0.04820037498643269), (2947, 0.1005029095461788), (3130, 0.07157025376773339), (3380, 0.0705020410249314), (3617, 0.07872727914450672), (3710, 0.008019756788914098), (3829, 0.15237

## 2.2.2) Execute the query


In [154]:
# Here we define our second search engine

# This function computes from scratch cosine similarity
def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)

    # Calculate the norms
    norm_vec_1 = np.linalg.norm(vec1)
    norm_vec_2 = np.linalg.norm(vec2)

    # Compute the cosine similarity
    cos_sim = np.dot(vec1, vec2) / (norm_vec_1 * norm_vec_2)

    return cos_sim


def  top_k_search_engine(query, k):

    # Apply same preprocessing done for descriptions and split wrt ","
    query = preprocess_description(query).split(",")

    # Get query terms ids
    query_ids = [terms_id_dict[term] for term in query]

    # Calculate for each term in query its tfidf score
    query_idf = np.array([np.log(len(data["clean_description"]) / len(inverted_index_dict[term_id])) for term_id in query_ids])
    query_tf =  np.array([sum([word == term for word in query]) / len(query) for term in query]) # change this
    query_tfidf = query_idf * query_tf

    # Get the indexes of docs that contain all terms in query using inverted_1
    docs_ids = [set(inverted_index_dict[term_id]) for term_id in query_ids]
    appropriate_docs_ids = list(set.intersection(*docs_ids))

    # docs_tfidf will contain, for each id in appropriate_docs_ids, its tfidf vectorial representation.
    docs_tfidf = {i : [] for i in appropriate_docs_ids}

    # For each term in the query we retrieve its inverted_2 list of tuples
    for term_id in query_ids:
        list_of_tuples = inverted_index_dict_with_scores[term_id]
        for tuple_doc in list_of_tuples:

            # When we encounter a tuple with an appropriate doc id  we add its tfidf score in docs_tfidf
            if tuple_doc[0] in appropriate_docs_ids:
                docs_tfidf[tuple_doc[0]].append(tuple_doc[1])

    # Transform into a list
    docs_tfidf = list(docs_tfidf.values())

    # Compute cosine similarities between query_tfidf and each doc_tfidf
    cos_sims = [cosine_similarity(query_tfidf, doc_tfidf) for doc_tfidf in docs_tfidf]

    # Select all appropriate_docs_ids from data and specified columns
    result  = data.iloc[appropriate_docs_ids, [0,1,4,12]]

    # Add the cosine similarity score and sort the dataframe
    result["cos_sim"] = cos_sims
    result = result.sort_values(by='cos_sim', ascending=False)

    # Get, if possible, just the top k
    if k < result.shape[0]:
        result = result[:k]

    return result

In [155]:
top_k_search_engine("advanced knowledge", k = 10)

Unnamed: 0,courseName,universityName,description,url,cos_sim
2703,Data Science with Artificial Intelligence MSc,University of Exeter,OverviewDesigned for those interested in learn...,https://www.findamasters.com/masters-degrees/c...,1.0
3259,Energy Systems and Data Analytics (ESDA) MSc,University College London,Register your interest in graduate study at UC...,https://www.findamasters.com/masters-degrees/c...,1.0
4632,International Business - MSc,University of Glasgow,International Business will provide you with a...,https://www.findamasters.com/masters-degrees/c...,1.0
721,Advanced Computational Methods for Aeronautics...,Imperial College London,This programme is suitable for applicants who ...,https://www.findamasters.com/masters-degrees/c...,1.0
1412,Banking Innovation and Risk Analytics MSc,University of Edinburgh,Programme descriptionOur MSc in Banking Innova...,https://www.findamasters.com/masters-degrees/c...,1.0
2577,Dance Science MSc,University of Chichester,This suite of MSc programmes is designed for s...,https://www.findamasters.com/masters-degrees/c...,1.0
4321,History of International Relations MSc,London School of Economics and Political Science,Ask LSEThe MSc History of International Relati...,https://www.findamasters.com/masters-degrees/c...,1.0
4886,International Master of Science in Fire Safety...,University of Edinburgh,Programme descriptionThe International Master ...,https://www.findamasters.com/masters-degrees/c...,1.0
1028,Analytical Sciences MSc,University of Bradford,Our MSc in Analytical Sciences MSc is a resear...,https://www.findamasters.com/masters-degrees/c...,1.0
3313,Engineering Management MSc,University of Greenwich,Extend and develop your skills and build a car...,https://www.findamasters.com/masters-degrees/c...,1.0


## 3. Define a new score!
Now it's your turn: build a new metric to rank MSc degrees.

Practically:

The user will enter a text query. As a starting point, get the query-related documents by exploiting the search engine of Step 2.1.
Once you have the documents, you need to sort them according to your new score. In this step, you won't have any more to take into account just the description field of the documents; you can use also the remaining variables in your dataset (or new possible variables that you can create from the existing ones or scrape again from the original web-pages). You must use a heap data structure (you can use Python libraries) for maintaining the top-k documents.
N.B.: You have to define a scoring function, not a filter!

The output, must contain:

courseName
universityName
description
URL
The new similarity score of the documents with respect to the query
Are the results you obtain better than with the previous scoring function? Explain and compare results.

### Answer

To develop our customized search engine, we opted to gather supplementary information for each university.
Recognizing that the selection of the "ideal" university is influenced by the city in which one resides, we chose to extract scores reflecting the quality of life in each city.



In [192]:
country_capital_dict = {
    'Gibraltar': 'Gibraltar',
    'Sweden': 'Stockholm',
    'Austria': 'Vienna',
    'Spain': 'Madrid',
    'Singapore': 'Singapore',
    'Finland': 'Helsinki',
    'Israel': 'Jerusalem',
    'Turkey': 'Ankara',
    'Germany': 'Berlin',
    'Canada': 'Ottawa',
    'New Zealand': 'Wellington',
    'Netherlands': 'Amsterdam',
    'Japan': 'Tokyo',
    'United Kingdom': 'London',
    'Switzerland': 'Bern',
    'Cyprus': 'Nicosia',
    'Iceland': 'Reykjavik',
    'Jamaica': 'Kingston',
    'Czechia': 'Prague',
    'Malaysia': 'Kuala Lumpur',
    'USA': 'new-orleans',
    'Hong Kong': 'Hong Kong',
    'Portugal': 'Lisbon',
    'Estonia': 'Tallinn',
    'Greece': 'Athens',
    'Denmark': 'Copenhagen',
    'France': 'Paris',
    'Lithuania': 'Vilnius',
    'Italy': 'Rome',
    'India': 'New Delhi',
    'Luxembourg': 'Luxembourg City',
    'Australia': 'Canberra',
    'Croatia': 'Zagreb',
    'China': 'Beijing',
    'Belgium': 'Brussels',
    'Chile': 'Santiago',
    'Kazakhstan': 'Nur-Sultan',
    'Kazakstan': 'Nur-Sultan',
    'Qatar': 'Doha',
    'United Arab Emirates': 'Abu Dhabi',
    'Saudi Arabia': 'Riyadh',
    'Hungary': 'Budapest',
    'Norway': 'Oslo',
    'Ireland': 'Dublin',
    'Romania': 'Bucharest'
}

The following function used the [Teleport](https://teleport.org/) API to retrieve, for each city in our Data, three scores:
- Education score;
- Safety score;
- Cost of living score.

All of them are normalized in $[0,10]$.
When a city was not integrated into the teleport API, our solution was to obtain the scores associated with the capital of the country in which the city is located.

In [193]:
get_scores_out_of_ten(data.city.loc[115])

(4.991999999999999, 4.151999999999999, 2.088)

In [189]:

def get_scores_out_of_ten(city):
    # Handle different data types
    city = str(city)

    try:
        # Try to get URL of the initial city
        base_url = f"https://api.teleport.org/api/urban_areas/slug:{city.lower()}/scores/"
        response = requests.get(base_url)

        # If city not found, retrieve URL of capital city
        if response.status_code == 404:

            # Retrieve the country of the city
            country = data[data["city"] == city]["country"].iloc[0]

            # Retrieve capital city of the country
            capital_city = country_capital_dict[country]

            base_url = f"https://api.teleport.org/api/urban_areas/slug:{capital_city.lower()}/scores/"
            response = requests.get(base_url)

            # If capital city not found, return error
            if response.status_code == 404:
                return ("City not found", "City not found", "City not found")

        # Retrieve scores from URL
        datas = response.json()

        # Retrieve all categories scores
        categories = datas['categories']
        scores = {category["name"] : float(category["score_out_of_10"]) for  category in categories}

        # Return Cost of Living, Education and Safety
        return (scores["Cost of Living"],
                scores["Education"],
                scores["Safety"])


    except requests.exceptions.RequestException as e:
        return {'error': f'Request to Teleport API failed: {e}'}

In [135]:
# Create three new columns for our new scores (around 10m to run)
data[['Cost_of_Living', 'Education', 'Safety']]= data.city.apply(lambda x: pd.Series(get_scores_out_of_ten(x)))

In [194]:
# TODO fix the following countries
a = [type(i) == str for i in data["Cost_of_Living"]]
set(data[a].country)

{'Hong Kong', 'Israel', 'Kazakstan', 'Malaysia', 'USA', 'United Arab Emirates'}

Those scores will affect the importance our custom search engine gives to each university.
As an initial and naive approach we decided to keep thinks linear and just compute the new score as shown below:

\begin{equation}
$NewScore = \frac{CosSimilarity \cdot Education \cdot Safety }{LivingCost}$
\end{equation}


The following function follows the one already shown previously but orders the result with respect to the new score we just defined.

In [166]:
def  custom_search_engine(query, k):

    # Apply same preprocessing done for descriptions and split wrt ","
    query = preprocess_description(query).split(",")

    # Get query terms ids
    query_ids = [terms_id_dict[term] for term in query]

    # Calculate for each term in query its tfidf score
    query_idf = np.array([np.log(len(data["clean_description"]) / len(inverted_index_dict[term_id])) for term_id in query_ids])
    query_tf =  np.array([sum([word == term for word in query]) / len(query) for term in query]) # change this
    query_tfidf = query_idf * query_tf

    # Get the indexes of docs that contain all terms in query using inverted_1
    docs_ids = [set(inverted_index_dict[term_id]) for term_id in query_ids]
    appropriate_docs_ids = list(set.intersection(*docs_ids))

    # docs_tfidf will contain, for each id in appropriate_docs_ids, its tfidf vectorial representation.
    docs_tfidf = {i : [] for i in appropriate_docs_ids}

    # For each term in the query we retrieve its inverted_2 list of tuples
    for term_id in query_ids:
        list_of_tuples = inverted_index_dict_with_scores[term_id]
        for tuple_doc in list_of_tuples:

            # When we encounter a tuple with an appropriate doc id  we add its tfidf score in docs_tfidf
            if tuple_doc[0] in appropriate_docs_ids:
                docs_tfidf[tuple_doc[0]].append(tuple_doc[1])

    # Transform into a list
    docs_tfidf = list(docs_tfidf.values())

    # Compute cosine similarities between query_tfidf and each doc_tfidf
    cos_sims = np.array([cosine_similarity(query_tfidf, doc_tfidf) for doc_tfidf in docs_tfidf])

    # Select all appropriate_docs_ids from data and specified columns
    result  = data.iloc[appropriate_docs_ids]

    # Add the cosine similarity score
    result["cos_sim"] = cos_sims

    # Compute our custom score
    our_score = result["cos_sim"] * result["Education"] * result["Safety"] / result["Cost_of_Living"]

    # Add the cosine similarity score and sort the dataframe
    result["score"] = our_score
    result = result.sort_values(by='score', ascending=False)

    # Get, if possible, just the top k
    if k < result.shape[0]:
        result = result[:k]

    return result

In [167]:
custom_search_engine("advanced knowledge", k = 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["cos_sim"] = cos_sims


TypeError: can't multiply sequence by non-int of type 'float'