In [19]:
import time
import json
from FlagEmbedding import FlagModel
import numpy as np
import pandas as pd
import heapq
import re
import spacy

# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

## Step 0: Read in the Data + Model

In [20]:
# Load the course catalog; search() reads its 'Code', 'Department' and 'Title' columns.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which usually means the CSV was
# written together with its index; index_col=0 would drop it — confirm nothing relies on it.
course_catalog = pd.read_csv('../data/course_info.csv')
course_catalog.head()

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Spring24
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False


In [3]:
# Load the BGE embedding model that preprocess_and_embed() uses to embed search queries.
# NOTE(review): query_instruction_for_retrieval is configured here, but preprocess_and_embed()
# calls model.encode(); in FlagEmbedding the instruction is presumably only prepended by
# encode_queries() — confirm which call matches how the stored embeddings were produced.
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

## Step 1: Load in the Document Embeddings

In [4]:
start = time.time()

# parse bge_title_embeddings.json into a dict whose values are the stored embedding lists
with open('../data/embeddings/bge_title_embeddings.json', 'r') as json_file:
    title_embeddings_dict = json.load(json_file)

# turn every stored list into a numpy vector, keeping the dict's iteration order
# (search() indexes the catalog by position in this list — assumes the JSON order matches the catalog rows; TODO confirm)
title_embeddings = [np.array(vector) for vector in title_embeddings_dict.values()]
end = time.time()

print("Time taken to load in the title embeddings:", end - start, "seconds")

Time taken to load in the title embeddings: 1.347188949584961 seconds


In [5]:
start = time.time()

# parse bge_desc_embeddings.json into a dict whose values are the stored embedding lists
with open('../data/embeddings/bge_desc_embeddings.json', 'r') as json_file:
    desc_embeddings_dict = json.load(json_file)

# turn every stored list into a numpy vector, keeping the dict's iteration order
# (search() indexes the catalog by position in this list — assumes the JSON order matches the catalog rows; TODO confirm)
desc_embeddings = [np.array(vector) for vector in desc_embeddings_dict.values()]
end = time.time()

print("Time taken to load in the description embeddings:", end - start, "seconds")

Time taken to load in the description embeddings: 1.3619611263275146 seconds


## Step 2: Implement the Search Function

In [6]:
# Uncomment to download en_core_web_sm model if not downloaded already
# !python -m spacy download en_core_web_sm

In [7]:
# Load spaCy's small English pipeline; preprocess_and_embed() passes every query through it.
nlp = spacy.load('en_core_web_sm')

In [8]:
def preprocess_and_embed(text):
    """
    Runs the text through the spaCy pipeline and returns the model embedding of the result

    An earlier NLTK-based variant (lowercase, regex tokenise, stopword removal, lemmatise)
    was disabled in favour of the spaCy call below.

    NOTE(review): converting a spaCy Doc back with str() appears to yield the original text,
    so this step likely applies no normalisation at all — confirm whether that is intended
    and whether it matches how the stored document embeddings were produced.
    """
    # parse with spaCy, then convert the parsed document back to a plain string
    spacy_doc = nlp(text)
    preprocessed_text = str(spacy_doc)

    # embed the resulting string with the BGE model
    return model.encode(preprocessed_text)

In [9]:
def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two vectors

    Parameters:
        vec1, vec2: 1-D array-likes of equal length

    Returns:
        dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either vector has zero norm
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # a zero-length vector has no direction; return 0.0 instead of dividing 0 by 0,
    # which would yield NaN (plus a RuntimeWarning) and poison any ranking built on it
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [10]:
def _course_lookup_key(query):
    """
    Normalises a raw query into the catalog's 'DEPT 123X' / 'DEPT' form for exact-match lookups
    """
    # drop punctuation, collapse whitespace and upper-case: " dsc.  10 " -> "DSC 10"
    key = ' '.join(re.sub(r'[^0-9a-zA-Z ]+', '', query).split()).upper()
    # insert the missing space in compact codes, keeping any letter suffix: "MATH10A" -> "MATH 10A"
    compact = re.fullmatch(r'([A-Z]+)(\d+[A-Z]*)', key)
    if compact:
        key = compact.group(1) + ' ' + compact.group(2)
    return key


def search(query, title_embeddings, desc_embeddings, data, k=10,
           title_weight=0.7, desc_weight=0.3, min_similarity=0.7, min_results=3):
    """
    Retrieves the courses that best match the query

    Exact matches are tried first: a query that normalises to a value in data['Code'] returns
    that single course, and one that normalises to a value in data['Department'] returns every
    course of that department (both with similarity 1.0). Otherwise the query is embedded and
    courses are ranked by a weighted sum of title and description cosine similarities.

    Parameters:
        query: the search text
        title_embeddings: sequence of title vectors, positionally aligned with the rows of data
        desc_embeddings: sequence of description vectors, positionally aligned with the rows of data
        data: DataFrame with 'Code', 'Department' and 'Title' columns
        k: maximum number of ranked results
        title_weight, desc_weight: weights of the two similarities (should add up to 1)
        min_similarity: results scoring below this are dropped once min_results have been kept
        min_results: number of top results that are always kept regardless of score

    Returns:
        list of (course code, course title, similarity) tuples, best match first;
        an empty list for a blank query
    """
    # a blank query has nothing to look up or embed
    if not query or not query.strip():
        return []

    lookup_key = _course_lookup_key(query)
    if lookup_key:
        # if the query is a course code, return just the row containing the course code
        code_rows = data[data['Code'] == lookup_key]
        if not code_rows.empty:
            exact_code = code_rows.iloc[0]
            return [(exact_code['Code'], exact_code['Title'], 1.0)]

        # if the query is a course department, return all courses from that department
        dept_rows = data[data['Department'] == lookup_key]
        if not dept_rows.empty:
            return [(code, title, 1.0) for code, title in zip(dept_rows['Code'], dept_rows['Title'])]

    # kept from the original: a bare "dept123" token is spaced out before being embedded
    if re.match(r'^[A-Za-z]+\d+$', query):
        query = re.sub(r'[A-Za-z]+', lambda ele: " " + ele[0] + " ", query).strip()

    # gets the embedding of the query
    query_embedding = preprocess_and_embed(query)

    # similarities between the query embedding and the title / description embeddings
    title_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in title_embeddings])
    desc_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in desc_embeddings])

    # weighted total similarity per course
    similarities = (title_similarities * title_weight) + (desc_similarities * desc_weight)

    # the k (position, similarity) pairs with the highest similarity, best first
    top_matches = heapq.nlargest(k, enumerate(similarities), key=lambda pair: pair[1])

    ranked_docs = []
    for ind, sim in top_matches:
        # once min_results courses are kept, stop at the first score below min_similarity
        if sim < min_similarity and len(ranked_docs) >= min_results:
            break
        # ind is a position in the embedding lists, so index the catalog by position (.iloc)
        # rather than by label, which is only equivalent for a default RangeIndex
        ranked_docs.append((data['Code'].iloc[ind], data['Title'].iloc[ind], sim))

    return ranked_docs

## Step 3: Testing the Search Engine

In [11]:
# Department lookup: search() strips the trailing '.', so "dsc." matches the DSC department
# and every DSC course is returned with similarity 1.0 (no embedding is computed).
start = time.time()
query = "dsc."
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.0029888153076171875 seconds


[('DSC 10', 'Principles of Data Science', 1.0),
 ('DSC 20', 'Programming and Basic Data Structures for Data Science', 1.0),
 ('DSC 30', 'Data Structures and Algorithms for Data Science', 1.0),
 ('DSC 40A', 'Theoretical Foundations of Data Science I', 1.0),
 ('DSC 40B', 'Theoretical Foundations of Data Science II', 1.0),
 ('DSC 80', 'The Practice and Application of Data Science', 1.0),
 ('DSC 90', 'Seminar in Data Science', 1.0),
 ('DSC 95', 'Tutor Apprenticeship in Data Science', 1.0),
 ('DSC 96', 'Workshop in Data Science', 1.0),
 ('DSC 97', 'Internship in Data Science', 1.0),
 ('DSC 98', 'Directed Group Study in Data Science', 1.0),
 ('DSC 99', 'Independent Study in Data Science', 1.0),
 ('DSC 100', 'Introduction to Data Management', 1.0),
 ('DSC 102', 'Systems for Scalable Analytics', 1.0),
 ('DSC 104', 'Beyond Relational Data Management', 1.0),
 ('DSC 106', 'Introduction to Data Visualization', 1.0),
 ('DSC 120', 'Signal Processing for Data Analysis', 1.0),
 ('DSC 140A', 'Probabili

In [12]:
# Exact course-code lookup: only the matching catalog row is returned, with similarity 1.0.
start = time.time()
query = "dsc 10"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.001008749008178711 seconds


[('DSC 10', 'Principles of Data Science', 1.0)]

In [13]:
# Free-text query: not a code or department, so courses are ranked by the weighted
# title/description cosine similarity of their embeddings.
start = time.time()
query = "introduction to calculus"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.2039804458618164 seconds


[('MATH 10A', 'Calculus I', 0.8489875773069042),
 ('MATH 20A', 'Calculus for Science and Engineering', 0.8330968660103681),
 ('MATH 10B', 'Calculus II', 0.8287532588866264),
 ('MATH 20B', 'Calculus for Science and Engineering', 0.819815530418673),
 ('MATH 20D', 'Introduction to Differential Equations', 0.8159379424115494),
 ('MAE 105', 'Introduction to Mathematical Physics', 0.8008051210718767),
 ('MATH 110',
  'Introduction to Partial Differential Equations',
  0.7931030652319019),
 ('MATH 170C',
  'Introduction to Numerical Analysis: Ordinary Differential Equations',
  0.7920915515928617),
 ('MATH 157', 'Introduction to Mathematical Software', 0.7909416657714962),
 ('MATH 15A', 'Introduction to Discrete Mathematics', 0.7899653791136791)]

In [14]:
# Free-text query whose results span more than one department (MATH, EDS, MSED).
start = time.time()
query = "teaching math"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.19084811210632324 seconds


[('MATH 95', 'Introduction to Teaching Math', 0.8690410107775794),
 ('MATH 195', 'Introduction to Teaching in Mathematics', 0.8531119389423881),
 ('MATH 121A',
  'Foundations of Teaching and Learning Mathematics I',
  0.8399435237588702),
 ('EDS 374', 'Secondary Mathematics Teaching Practices', 0.8285359108905239),
 ('MATH 121B',
  'Foundations of Teaching and Learning Math II',
  0.8193767269986378),
 ('MSED 297',
  'Special Topics in Math and Science Education',
  0.8159380099344792),
 ('EDS 385',
  'Elementary School Mathematics Content and Pedagogy',
  0.8144931841357561),
 ('MATH 153', 'Geometry for Secondary Teachers', 0.8140247492439814),
 ('EDS 355C',
  'Advanced Mathematics Teaching Practices for Grades K–6',
  0.8071020828748441),
 ('EDS 355A',
  'Advanced Mathematics Teaching Practices for Grades K–6',
  0.8064081401487981)]

In [15]:
# Free-text query: ranked by embedding similarity across the whole catalog.
start = time.time()
query = "sign language"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.22683119773864746 seconds


[('LISL 1D', 'American Sign Language Conversation', 0.8566984500842192),
 ('LISL 1C', 'American Sign Language Conversation', 0.8513451949604657),
 ('LISL 1A', 'American Sign Language Conversation', 0.8503227548526439),
 ('LISL 1B', 'American Sign Language Conversation', 0.8494378726229577),
 ('LIGN 7', 'Sign Languages and Their Cultures', 0.8482194864398789),
 ('LIGN 280', 'Sign Language Research', 0.8439243261619339),
 ('LISL 1DX', 'Analysis of American Sign Language', 0.842297113218707),
 ('LIGN 148',
  'The Psycholinguistics of Gesture and Sign Languages',
  0.8406767774228334),
 ('LISL 1E',
  'Intermediate American Sign Language Conversation',
  0.8324623088454213),
 ('LISL 1CX', 'Analysis of American Sign Language', 0.8260226145336731)]

In [16]:
# Free-text query that is also an exact course title (ECON 109), which ranks first.
start = time.time()
query = "game theory"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.19827532768249512 seconds


[('ECON 109', 'Game Theory', 0.9324998366334738),
 ('POLI 204C', 'Game Theory 1', 0.896040592567642),
 ('POLI 205', 'Game Theory II', 0.8704689822243641),
 ('ECON 262', 'Behavioral Game Theory', 0.8544199109431581),
 ('POLI 118', 'Game Theory in Political Science', 0.8527695704894559),
 ('ECON 109T', 'Advanced Topics in Game Theory', 0.8370550320547614),
 ('POLI 100U', 'Games, Strategy, and Politics', 0.776035180767025),
 ('MATH 280A', 'Probability Theory I', 0.7737567717698303),
 ('ECON 204', 'Contract Theory', 0.7703438700206499),
 ('PHIL 125', 'Probability and Decision Theory', 0.7664281303160928)]

In [17]:
# Exact course-code lookup for a code with a letter suffix, typed in lower case with a space.
start = time.time()
query = "math 10a"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.0020012855529785156 seconds


[('MATH 10A', 'Calculus I', 1.0)]

In [18]:
# Weak-match query: only three results come back because search() stops adding courses
# once three are kept and the next similarity is below 0.7.
start = time.time()
query = "soccer"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.17919349670410156 seconds


[('HITO 123', 'The Global History of Soccer', 0.7334518200290014),
 ('PSYC 139', 'The Social Psychology of Sport', 0.6842909329266696),
 ('COMM 111T', 'CCP: Cultural Politics of Sport', 0.6568140348476779)]