In [11]:
from FlagEmbedding import FlagModel

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np
import re
import time
import gzip
import pandas as pd
import heapq
import json

In [12]:
# if you haven't downloaded stopwords or wordnet before, uncomment the lines below and run them
# nltk.download('stopwords')
# nltk.download('wordnet')

## Step 0: Read in the data

In [13]:
# Load the course catalog CSV into a DataFrame.
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which looks
# like a saved index column -- confirm whether index_col=0 is wanted here.
course_catalog = pd.read_csv(filepath_or_buffer='data/course_catalog.csv')

# Bare final expression so the notebook renders the frame.
course_catalog

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
...,...,...,...,...,...,...,...,...
7164,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7165,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7166,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7167,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...


In [14]:
# Lazily-populated cache of the NLTK resources used by preprocess(), so they
# are built once per kernel session instead of once per call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing either, so a failed lookup (e.g. the
        # corpus has not been downloaded yet) leaves the cache empty and the
        # next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Normalize a piece of text for embedding.

    The text is lowercased, split on every run of characters that are not
    ASCII letters or digits, stripped of English stop words, and lemmatized
    token by token.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, as pandas uses for
        empty cells) are treated as empty text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces, with no
        leading or trailing whitespace. Empty when no tokens survive the split.

    Notes
    -----
    Because the split pattern only keeps ASCII letters and digits, any
    non-ASCII letter acts as a separator.
    """
    # Missing cell values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Tokenize. re.split emits an empty string when the text starts or ends
    # with a separator (e.g. a trailing "."); drop those so the joined result
    # carries no stray spaces.
    tokens = [token for token in re.split(r'[^a-zA-Z0-9]+', text) if token]
    if not tokens:
        return ''

    stop_words, lemmatizer = _get_preprocess_resources()

    # Remove stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(tokens)

In [15]:
# Add preprocessed versions of the Title and Description columns.
# perf_counter() is used for the timing because it is the clock intended for
# measuring elapsed intervals; time.time() follows the system clock.
start = time.perf_counter()
course_catalog_preprocessed = course_catalog.assign(
    Preprocessed_Title=course_catalog['Title'].apply(preprocess),
    Preprocessed_Description=course_catalog['Description'].apply(preprocess),
)
end = time.perf_counter()
print("Preprocess Time:", round(end - start, 2), "seconds")
course_catalog_preprocessed

Preprocess Time: 17.714894771575928 seconds


Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Preprocessed_Title,Preprocessed_Description
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,academic internship,individual placement field learning must integ...
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,academic internship program,individual internship placement integrated aca...
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,ucdc washington dc internship,internship attached university california wash...
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,public service internship,individual placement field learning performed ...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,academic internship program special program,individual placement field learning associated...
...,...,...,...,...,...,...,...,...,...,...
7164,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,academic writing,upper division workshop course argumentation r...
7165,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,technical writing scientist engineer,upper division workshop style writing course f...
7166,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,academic mentoring writing process,student gain fundamental understanding stage w...
7167,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,group study,directed group study involving research analys...


## Step 2: Retrieve Off-the-Shelf Embeddings

In [16]:
# Load the off-the-shelf BGE small English embedding model.
# use_fp16=True speeds up computation at the cost of slightly less accurate
# embeddings.
model = FlagModel(
    'BAAI/bge-small-en-v1.5',
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True,
)

## Step 3: Generate Embeddings for Documents

### Step 3a: Title Embeddings

In [21]:
# Collect the preprocessed titles into a plain Python list for the encoder.
title_documents = course_catalog_preprocessed['Preprocessed_Title'].tolist()

In [24]:
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the system clock.
start = time.perf_counter()

# generate embeddings for the preprocessed titles
title_document_embeddings = model.encode(title_documents)

end = time.perf_counter()
print("Time taken to generate title embeddings:", round(end - start, 2), "seconds")

Inference Embeddings: 100%|████████████████████████████████████████████████████████████| 29/29 [00:52<00:00,  1.82s/it]

Time taken to generate title embeddings: 52.86 seconds





In [25]:
# Map each course's positional index to its title embedding, with the
# embedding converted by .tolist() into plain Python values.
title_embeddings_dict = dict(enumerate(title_document_embeddings.tolist()))

In [26]:
# Serialize the title embeddings and write them to a JSON file.
# JSON object keys are always strings, so the integer course indices are
# stored as "0", "1", ... and come back as str when the file is loaded.
title_embeddings_json = json.dumps(title_embeddings_dict)
with open('data/title_bge_embeddings.json', mode='w') as out_file:
    out_file.write(title_embeddings_json)

### Step 3b: Description Embeddings

In [27]:
# Collect the preprocessed descriptions into a plain Python list for the encoder.
desc_documents = course_catalog_preprocessed['Preprocessed_Description'].tolist()

In [28]:
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the system clock.
start = time.perf_counter()

# generate embeddings for the preprocessed descriptions
desc_document_embeddings = model.encode(desc_documents)

end = time.perf_counter()
print("Time taken to generate description embeddings:", round(end - start, 2), "seconds")

Inference Embeddings: 100%|████████████████████████████████████████████████████████████| 29/29 [08:33<00:00, 17.72s/it]

Time taken to generate description embeddings: 513.93 seconds





In [29]:
# Map each course's positional index to its description embedding, with the
# embedding converted by .tolist() into plain Python values.
desc_embeddings_dict = dict(enumerate(desc_document_embeddings.tolist()))

In [30]:
# Serialize the description embeddings and write them to a JSON file.
# JSON object keys are always strings, so the integer course indices are
# stored as "0", "1", ... and come back as str when the file is loaded.
desc_embeddings_json = json.dumps(desc_embeddings_dict)
with open('data/desc_bge_embeddings.json', mode='w') as out_file:
    out_file.write(desc_embeddings_json)