In [5]:
import requests
from bs4 import BeautifulSoup

### Extract subject URLs from the base URL, then scrape course details from each subject page

In [3]:

def get_subject_urls(base_url, timeout=30):
    """Collect the absolute URL of every subject page linked from the index page.

    Args:
        base_url: URL of the page whose navigation list links to the subjects.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute URLs, one per matching navigation link, in the
        order the links appear in the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    from urllib.parse import urljoin

    response = requests.get(base_url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # `a[href]` skips anchors that carry no target, which would otherwise
    # raise KeyError when indexed below.
    subject_links = soup.select('ul.nav.levelone li a[href]')

    # urljoin resolves root-relative, page-relative and already-absolute hrefs
    # against the page they came from, rather than assuming a fixed host prefix.
    return [urljoin(base_url, link['href']) for link in subject_links]


def scrape_courses(subject_url, timeout=30):
    """Scrape every course listed on a single subject page.

    Args:
        subject_url: URL of a subject page containing `.courseblock` elements.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts, one per course block, with the keys:
            'title'       - text of the block's title, or "N/A" if absent.
            'description' - text of the block's description, or "N/A" if absent.
            'extras'      - list with the text of each `p.courseblockextra`
                            paragraph (empty list if there are none).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """

    def _clean_text(tag):
        """Return the tag's text with whitespace runs collapsed, or "N/A" if missing."""
        if tag is None:
            return "N/A"
        # get_text(strip=True) strips every text node and joins them with no
        # separator, gluing words together around inline tags such as links.
        # Taking the raw text and normalising whitespace keeps the spacing the
        # page author wrote (and folds non-breaking spaces into plain spaces).
        return " ".join(tag.get_text().split())

    response = requests.get(subject_url, timeout=timeout)
    # Fail loudly instead of silently returning no courses for an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    courses = []
    for course_block in soup.select('.courseblock'):
        courses.append({
            'title': _clean_text(course_block.select_one('.courseblocktitle strong')),
            'description': _clean_text(course_block.select_one('p.cb_desc')),
            'extras': [
                _clean_text(extra)
                for extra in course_block.select('p.courseblockextra')
            ],
        })

    return courses


# Starting point of the crawl: the page that links to every subject listing.
base_url = 'https://catalog.northeastern.edu/course-descriptions/'

# Step 1: discover the individual subject pages.
subject_urls = get_subject_urls(base_url)

# Step 2: scrape each subject page and pool all course records in one flat list.
all_courses = []
for url in subject_urls:
    all_courses += scrape_courses(url)


### Creating LlamaIndex Documents from the course list

In [5]:
from llama_index.core import Document

def create_documents(course_data):
    """Turn scraped course records into Document objects.

    Each record must provide "title" and "description"; an optional, non-empty
    "extras" sequence is appended line by line. The resulting text is the
    title, the description and every extra joined with newlines, and the
    title is also stored under the "title" metadata key.

    Args:
        course_data: Iterable of course dicts.

    Returns:
        A list with one Document per course, in input order.
    """

    def _to_document(entry):
        # Title and description are always present; extras only when non-empty.
        lines = [entry["title"], entry["description"]]
        if entry.get("extras"):
            lines.extend(entry["extras"])
        return Document(text="\n".join(lines), metadata={"title": entry["title"]})

    return [_to_document(entry) for entry in course_data]

In [7]:
# Wrap every scraped course record in a Document so it can be fed to the ingestion pipeline.
documents = create_documents(all_courses)

### Chunking, indexing, and storing the documents in a vector database (ChromaDB)

In [9]:
from llama_index.core import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

import os

# Location of the persistent Chroma database. Set NEU_CHROMA_DB_PATH to run the
# notebook on another machine; the fallback is the original author's local folder.
CHROMA_DB_PATH = os.environ.get(
    "NEU_CHROMA_DB_PATH",
    r"C:\Users\vigne\Desktop\Higher studies\Northeastern University Boston\Courses\Semester 2\DS5983(LLMs)\NEU-COURSE-FINDER\NeuCourses_Chroma_db",
)
CHROMA_COLLECTION_NAME = "NeuCourses_Chroma_db"

# Ingestion settings, named so they can be tuned in one place.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
CHUNK_SIZE = 100     # passed to SentenceSplitter(chunk_size=...)
CHUNK_OVERLAP = 10   # passed to SentenceSplitter(chunk_overlap=...)

# Open (or create) the on-disk collection and expose it as a LlamaIndex vector store.
db = chromadb.PersistentClient(path=CHROMA_DB_PATH)
chroma_collection = db.get_or_create_collection(CHROMA_COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Pipeline: split each document into chunks, embed the chunks, and write the
# results into the Chroma-backed vector store.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP),
        HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME),
    ],
    vector_store=vector_store,
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def chunk_list(lst, chunk_size):
    """Yield consecutive slices of `lst`, each holding at most `chunk_size` items.

    Args:
        lst: A sliceable sequence with a length (e.g. a list).
        chunk_size: Maximum number of items per slice; must be positive.

    Yields:
        Slices of `lst` in order. The final slice may be shorter than
        `chunk_size`. Nothing is yielded for an empty sequence.

    Raises:
        ValueError: If `chunk_size` is not positive. Raised when iteration
            starts, since this is a generator.
    """
    # A negative step would make range() empty and silently drop every item;
    # zero would surface as an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]

# Feed the documents to the ingestion pipeline in batches of 100, printing one
# progress line per batch.
# NOTE: the top-level `await` relies on the notebook kernel's async support; it
# is a syntax error in a plain Python script outside an async function.
for chunk in chunk_list(documents, 100):  
    print(f"Processing chunk with {len(chunk)} documents...")
    await pipeline.arun(documents=chunk)

Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100 documents...
Processing chunk with 100

In [15]:
# Report how many entries the Chroma collection holds after ingestion.
# NOTE(review): presumably one entry per embedded chunk rather than per course,
# despite the "documents" wording in the message — confirm.
print("Number of documents in DB:", chroma_collection.count())

Number of documents in DB: 14726
