In [2]:
from bs4 import BeautifulSoup
import requests
import re
import os
from dotenv import load_dotenv
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
import os
import ast

### SCRAPING LIST OF COURSES ###

def get_courses(html: bytes):
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_=re.compile('^w3-row view view-courses-view view-id-courses_view'))
    view_content_div = div.find('div', class_=re.compile('view-content'))
    child_divs = view_content_div.find_all('div', recursive=False)
    courses = []

    for child_div in child_divs:
        d = {}

        try:
            # Extract course name
            course_name = child_div.find('div', {'aria-label': True}).get_text(strip=True)

            # Extract description
            description = child_div.find('div', class_='views-field-field-desc').get_text(strip=True)

            # Extract prerequisites
            prerequisites = (child_div.find('span', class_='views-field-field-prerequisite').get_text(strip=False))[15:]
            prerequisites: str = re.split(r'(\s+|\(|\)|\[|\])', prerequisites)
            prerequisites = [i for i in prerequisites if (i != " " and i != '')]

            d["name"] = course_name
            d["description"] = description
            d["prerequisites"] = prerequisites
            courses.append(d)
            print(d["prerequisites"])
        except:
            pass

    return courses


def getHTMLCourses(url: str): 
    print(url)
    response = requests.get(url) 

    if response.status_code != 200:
        print("Error getting HTML")
    else:
        html = response.content
        return get_courses(html)


CHUNKS = 1 # Depending on the size of the input data, this number may need to be increased due to batch size limits

def create_model(courses):
    
    chroma_client = chromadb.PersistentClient(settings=Settings(allow_reset=True))
    try:
        collection =  chroma_client.get_collection("vector_earch")
    except: 
        chroma_client.reset()
        collection = chroma_client.create_collection(name="vector_search")

    documents = []
    metadatas = []
    ids = []
    id = 1

    for course in courses:
        documents.append(course["description"])
        metadatas.append({'item_id': course["name"], "prerequisites": course["prerequisites"]})
        ids.append(f'id{id}')
        id += 1

    t = len(documents) // CHUNKS

    for i in range(0, len(documents), t):
        collection.add(
            documents=documents[i:i+t],
            metadatas=metadatas[i:i+t],
            ids=ids[i:i+t]
    )
    

def query_courses(query: str):
    chroma_client = chromadb.PersistentClient(settings=Settings(allow_reset=True))
    collection =  chroma_client.get_collection("vector_search")
    n = 5
    results = collection.query(
        query_texts=[query],
        n_results=n
    )
    print(results)

    return [{"name": results["metadatas"][0][i]["item_id"], "description": results["documents"][0][i], "prerequisites": results["metadatas"][0][i]["prerequisites"]} for i in range(0, n)]


def search_db(name: str, courses_db):
    for course in courses_db:
        if course["name"][:8] == name:
            return ast.literal_eval(course["prerequisites"])
    return []


def get_full_trajectory(query, courses_db):
    lst = []
    interest_courses = query_courses(query)
    for course in interest_courses:
        lst.append(course["name"][:8])
    return categorize_courses(get_prereqs(lst, courses_db))


def get_prereqs(courses, courses_db):
    d = {}
    for course in courses:
        prereqs = search_db(course, courses_db)
        d[course] = get_prereqs(prereqs, courses_db)
            
    return d


def categorize_courses(course_dict, categorized_courses={}):
    for course, prereqs in course_dict.items():
        year = course[3] 
        year_label = int(year)
        if year_label not in categorized_courses:
            categorized_courses[year_label] = []
        if course not in categorized_courses[year_label]:
            categorized_courses[year_label].append(course)
        categorize_courses(prereqs, categorized_courses) 

    return categorized_courses

In [3]:
getHTMLCourses("https://utm.calendar.utoronto.ca/section/Computer-Science")

https://utm.calendar.utoronto.ca/section/Computer-Science
['permission', 'of', 'instructor']
['Grade', '12', 'Advanced', 'Functions', '(', 'MHF4U', ')', '.']
['CSC108H5']
['permission', 'of', 'instructor']
['60%', 'in', 'CSC148H5', '(', 'Only', 'CSC148H5', 'taken', 'at', 'the', 'UTM', 'campus', 'will', 'be', 'accepted.', ')']
['CSC207H5']
['CSC148H5', 'and', 'MAT102H5']
['CSC148H5']
['CSC207H5', 'and', 'CSC236H5', 'and', '(', 'STA107H5', 'or', 'STA246H5', 'or', 'STA256H5', 'or', 'STA237H1', 'or', 'STA238H1', 'or', 'ECO227Y5', 'or', 'ECE286H1', ')']
['CSC148H5']
['Any', 'CSC', 'half-course', 'and', 'CGPA', '2.0']
['CSC209H5']
['CSC209H5', 'and', 'CSC263H5']
['CSC148H5', 'and', 'MAT223H5', 'and', '(', 'STA246H5', 'or', 'STA256H5', 'or', 'ECO227Y5', ')']
['CSC207H5', 'and', '(', 'MAT223H5', 'or', 'MAT240H5', ')', 'and', 'MAT232H5', 'and', '(', 'STA246H5', 'or', 'STA256H5', ')']
['CSC207H5']
['(', 'MAT224H5', 'or', 'MAT240H5', ')', 'and', 'MAT301H5']
['CSC207H5', 'and', 'CSC236H5']
['CSC20

[{'name': 'CBJ481Y5 • Independent Project in Bioinformatics',
  'description': 'This course is intended for students in the Bioinformatics Specialist degree program. Possible areas in which the research may take place include: functional genomics (e.g., microarray and proteomic data analysis); systems biology; and the development of novel analytical methods for large datasets. Students will be required to produce a written document of their project and present it orally. In order to enrol in this course, students must obtain, several months in advance, approval from a faculty member(s) who will serve as supervisor(s).',
  'prerequisites': ['permission', 'of', 'instructor']},
 {'name': 'CSC108H5 • Introduction to Computer Programming',
  'description': 'Structure of computers; the computing environment. Programming in a language such as Python. Program structure: elementary data types, statements, control flow, functions, classes, objects, methods, fields. List: searching, sorting and c

In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS

app = Flask(__name__)
CORS(app, supports_credentials=True)

@app.route("/set_courses", methods=["POST"])
def set_courses():
    program_type = request.form.get('program_type')
    courses = getHTMLCourses(f'https://utm.calendar.utoronto.ca/section/{program_type}')
    create_model(courses)
    return courses


@app.route("/set_collection", methods=["POST"])
def set_collection():
    create_model(request.form.get("courses"))
    return jsonify(success=True)


import json
@app.route("/interest_timeline", methods=["POST"])
def interest_timeline():
    courses_db = json.loads(request.form.get("courses"))
    create_model(courses_db)
    interest_query = request.form.get('query')
    return get_full_trajectory(interest_query, courses_db)


if __name__ == '__main__':
    app.run(port=5328)