In [19]:
import time
import json

from FlagEmbedding import FlagModel

import numpy as np
import pandas as pd
import heapq

## Step 0: Read in the Data + Model

In [20]:
# Load the course catalog CSV into a DataFrame; `search` later reads its 'Code' and 'Title' columns.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a saved index — confirm, and consider index_col=0.
course_catalog = pd.read_csv('data/course_catalog.csv')
# Bare last expression so the notebook renders the frame.
course_catalog

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
...,...,...,...,...,...,...,...,...
7164,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7165,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7166,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7167,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...


In [21]:
# Embedding model shared by the rest of the notebook (`search` reads it as a global).
model = FlagModel(
    'BAAI/bge-small-en-v1.5',
    # Instruction string handed to the model for retrieval queries
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    # fp16 trades a slight performance degradation for faster computation
    use_fp16=True,
)

## Step 1: Load in the Document Embeddings

In [22]:
start = time.time()

# Parse the saved embeddings file (a JSON object, one entry per document)
with open('data/embeddings.json', 'r') as embeddings_file:
    embeddings_dict = json.load(embeddings_file)

# Only the values are needed: turn each one into a numpy array, keeping file order
document_embeddings = [np.array(vector) for vector in embeddings_dict.values()]
end = time.time()

print("Time taken to load in the document embeddings:", end - start, "seconds")

Time taken to load in the document embeddings: 4.518217086791992 seconds


## Step 2: Implement the Search Function

In [23]:
def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two vectors

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either vector
        has zero norm (the similarity is undefined there).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against division by zero: a zero-norm vector would otherwise yield
    # NaN (plus a RuntimeWarning), and NaN scores cannot be ranked reliably.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [24]:
def search(query, document_embeddings, data, k=10, min_similarity=0.7):
    """
    Computes the embedding of the query and retrieves the k most similar documents

    Parameters
    ----------
    query : str
        Search text (the notebook passes a single string); it is embedded
        with the notebook-level `model`.
    document_embeddings : iterable of array-like
        One embedding per document. The i-th embedding must correspond to
        the i-th row of `data`, because rows are looked up by position.
    data : pandas.DataFrame
        Catalog with 'Code' and 'Title' columns.
    k : int, default 10
        Maximum number of results to return.
    min_similarity : float, default 0.7
        Results scoring below this value are dropped, except that the single
        best match is always returned.

    Returns
    -------
    list of tuple
        (course code, course title, similarity) tuples, most similar first.
    """
    # gets the embedding of the query
    query_embedding = model.encode_queries(query)

    # get the similarities between the query embedding and the document embeddings
    similarities = [cosine_similarity(query_embedding, doc_emb) for doc_emb in document_embeddings]

    # (position, similarity) pairs for the k highest-scoring documents, best first
    top_matches = heapq.nlargest(k, enumerate(similarities), key=lambda pair: pair[1])

    codes = data['Code']
    titles = data['Title']

    ranked_docs = []
    for position, sim in top_matches:
        # matches are sorted, so once one falls below the cutoff the rest do too;
        # the top result is kept regardless so the list is never empty for a non-empty corpus
        if sim < min_similarity and ranked_docs:
            break
        # positions come from enumerate(), so index rows positionally (.iloc)
        # rather than by label, which only agrees for a default 0..n-1 index
        ranked_docs.append((codes.iloc[position], titles.iloc[position], sim))

    return ranked_docs

## Step 3: Testing the Search Engine

In [25]:
# Time one end-to-end search (query encoding + ranking) using the default k
query = "introduction to calculus"

start = time.time()
search_results = search(query, document_embeddings, course_catalog)
end = time.time()

print("Search time:", end - start, "seconds")
search_results

Search time: 0.2977466583251953 seconds


[('MATH 20A', 'Calculus for Science and Engineering', 0.7832842117962404),
 ('ECON 205', 'Mathematics for Economists', 0.7626482329288281),
 ('MATH 10B', 'Calculus II', 0.7528032897528413),
 ('MATH 110',
  'Introduction to Partial Differential Equations',
  0.7497785873210528),
 ('MATH 142B', 'Introduction to Analysis II', 0.7473588667392784),
 ('MATH 142A', 'Introduction to Analysis I', 0.7442389243686187),
 ('MATH 2', 'Introduction to College Mathematics', 0.7438710212145163),
 ('MATH 157', 'Introduction to Mathematical Software', 0.7428268311673124),
 ('MATH 20D', 'Introduction to Differential Equations', 0.733033815941933),
 ('MATH 3C', 'Precalculus', 0.7189926278571934)]

In [26]:
# Time one end-to-end search (query encoding + ranking) using the default k
query = "teaching math"

start = time.time()
search_results = search(query, document_embeddings, course_catalog)
end = time.time()

print("Search time:", end - start, "seconds")
search_results

Search time: 0.28283238410949707 seconds


[('MATH 121B',
  'Foundations of Teaching and Learning Math II',
  0.8085485859532091),
 ('MATH 95', 'Introduction to Teaching Math', 0.8016646074818957),
 ('MATH 121A',
  'Foundations of Teaching and Learning Mathematics I',
  0.7859315320956897),
 ('EDS 385',
  'Elementary School Mathematics Content and Pedagogy',
  0.7844682042940657),
 ('EDS 355A',
  'Advanced Mathematics Teaching Practices for Grades K–6',
  0.7837348291015898),
 ('MATH 153', 'Geometry for Secondary Teachers', 0.780271389980039),
 ('EDS 355C',
  'Advanced Mathematics Teaching Practices for Grades K–6',
  0.7797543303701058),
 ('EDS 355B',
  'Advanced Mathematics Teaching Practices for Grades K–6',
  0.7767898435785789),
 ('MATH 195', 'Introduction to Teaching in Mathematics', 0.7741549747766477),
 ('ERC 89', 'Practicum in Math Tutoring', 0.7677124335475961)]

In [27]:
# Time one end-to-end search (query encoding + ranking) using the default k
query = "sign language"

start = time.time()
search_results = search(query, document_embeddings, course_catalog)
end = time.time()

print("Search time:", end - start, "seconds")
search_results

Search time: 0.2775764465332031 seconds


[('LIGN 280', 'Sign Language Research', 0.8278859710201408),
 ('LISL 1DX', 'Analysis of American Sign Language', 0.8014108293821758),
 ('LISL 1D', 'American Sign Language Conversation', 0.7970519462669414),
 ('LIGN 7', 'Sign Languages and Their Cultures', 0.7918134424969009),
 ('LIGN 148',
  'The Psycholinguistics of Gesture and Sign Languages',
  0.7851199425436874),
 ('LIGN 9GS',
  'Sign Languages and Deaf Culture in the U.S. and France',
  0.7784987300522903),
 ('LIGN 149GS',
  'The Historical Roots of American Sign Language',
  0.7701106309739947),
 ('LISL 1C', 'American Sign Language Conversation', 0.7691383649360072),
 ('LISL 1A', 'American Sign Language Conversation', 0.766016865189545),
 ('LISL 1B', 'American Sign Language Conversation', 0.7656975301384012)]

In [55]:
# Same timing harness, but ask for up to 25 results instead of the default
query = "game theory"

start = time.time()
search_results = search(query, document_embeddings, course_catalog, k=25)
end = time.time()

print("Search time:", end - start, "seconds")
search_results

Search time: 0.3049123287200928 seconds


[('POLI 205', 'Game Theory II', 0.8247931416708111),
 ('ECON 109', 'Game Theory', 0.8216064827458682),
 ('POLI 204C', 'Game Theory 1', 0.8152783308984616),
 ('ECON 208', 'Games and Information', 0.8053477098285355),
 ('ECON 262', 'Behavioral Game Theory', 0.799163933023045),
 ('POLI 118', 'Game Theory in Political Science', 0.7956325252473994),
 ('POLI 100U', 'Games, Strategy, and Politics', 0.7929239361498983),
 ('POLI 203B', 'Analytic Theory II', 0.7928832951901142),
 ('POLI 203A', 'Analytic Theory I', 0.7921769822333592),
 ('ECON 109T', 'Advanced Topics in Game Theory', 0.7758552177322635),
 ('POLI 211', 'Formal Models in Political Theory', 0.7438693373688868),
 ('POLI 247B', 'Formal Models in International Relations', 0.735367437196154),
 ('ECON 172A', 'Operations Research A', 0.7316542465841428),
 ('ECON 263', 'Modeling Behavioral Economics', 0.7311616775825396),
 ('ECON 204', 'Contract Theory', 0.7300957865136254),
 ('ECON 200C', 'Microeconomics C', 0.7216346724246007),
 ('COGR 2