In [1]:
import time
import json

from FlagEmbedding import FlagModel

import numpy as np
import pandas as pd

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

## Step 0: Load Data, Embeddings, and Model

In [2]:
course_info = pd.read_csv('data/course_catalog.csv')
course_info

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
...,...,...,...,...,...,...,...,...
7164,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7165,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7166,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...
7167,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...


In [3]:
start = time.time()

# read the json file from embeddings.json
with open('data/embeddings.json', 'r') as json_file:
    embeddings_dict = json.load(json_file)

# convert each list value to a numpy array
document_embeddings = []
for key, value in embeddings_dict.items():
    document_embeddings.append(np.array(value))
end = time.time()

print("Time taken to load in the document embeddings:", end - start, "seconds")

Time taken to load in the document embeddings: 1.6410658359527588 seconds


In [4]:
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

In [5]:
course_info['Embeddings'] = document_embeddings
course_info

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Embeddings
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.011910725384950638, -0.0069093117490410805..."
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.010860656388103962, -0.003987603820860386, ..."
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.005698702298104763, -0.01941147819161415, 0..."
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.014157678931951523, 0.011336466297507286, -..."
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.022439394146203995, 0.011419197544455528, ..."
...,...,...,...,...,...,...,...,...,...
7164,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.07818092405796051, -0.060588475316762924, ..."
7165,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.01776105910539627, 0.02582802064716816, 0.0..."
7166,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.04885496571660042, -0.002173090586438775, ..."
7167,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.013629290275275707, 0.051931194961071014, -..."


## Step 1: Set up Elasticsearch

In [6]:
es = Elasticsearch("http://localhost:9200")
es.info()

{'name': '22a859511903',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': '_47Rjm1wS6CYOZNtGmSuow',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [7]:
# creating mappings 
mappings = {
    "properties": {
        'Code': {"type": "text"}, 
        'Department': {"type": "keyword"}, 
        'Title': {"type": "text"}, 
        'Units': {"type": "text"}, 
        'Description': {"type": "text"}, 
        'Prerequisites': {"type": "text"}, 
        'Level': {"type": "keyword"}, 
        'URL': {"type": "text"}, 
        'Embeddings':{'type': 'dense_vector', 'dims':384, "index":True, "similarity":"cosine"}
    }
}

In [8]:
try:
    es.indices.create(index="courses", mappings=mappings)
except:
    es.indices.delete(index='courses')
    print("Run cell again")
    pass

## Step 2: Adding Data into ElasticSearch

In [9]:
# Creating a list of dictionaries with all the data to be added in to the ElasticSearch index
bulk_data = []
for i, row in course_info.iterrows():
    bulk_data.append(
        {
            "_index": "courses",
            "_id": i,
            "_source": {
                "Code": row['Code'],
                'Department': row['Department'], 
                'Title': row['Title'], 
                'Units': row['Units'], 
                'Description': row['Description'], 
                'Prerequisites': row['Prerequisites'],
                'Level':row['Level'],
                'URL':row['URL'],
                'Embeddings': row['Embeddings']
            }
        }
    )

bulk(es, bulk_data)

(7169, [])

In [10]:
# verifying that all data has been read into the python index properly
es.indices.refresh(index="courses")
es.cat.count(index="courses", format="json")

[{'epoch': '1706724515', 'timestamp': '18:08:35', 'count': '7169'}]

## Step 3: Performing Search

In [11]:
def embedding_search(query):
    """
    Performs a search on embeddings (only can do one field at a time)
    """
    query_array = model.encode(query).tolist()
    
    embedding_search_query = {
        "size": 10,  # Number of results to return
        "knn": {
            "field": "Embeddings",  
            "query_vector": query_array,
            "k": 10,  # Number of nearest neighbors to retrieve
            "num_candidates": 8000  # Number of candidate hits the search will examine
        }
    }
    
    response = es.search(index="courses", body=embedding_search_query)

    results = []
    for hit in response['hits']['hits']:
        row = hit['_source']
        results.append((row['Code'], row['Title'], hit['_score']))
   
    return results

In [12]:
embedding_search('game theory')

  response = es.search(index="courses", body=embedding_search_query)


[('ECON 208', 'Games and Information', 0.92312264),
 ('ECON 109', 'Game Theory', 0.92272115),
 ('POLI 204C', 'Game Theory 1', 0.92184967),
 ('POLI 205', 'Game Theory II', 0.9092659),
 ('POLI 203B', 'Analytic Theory II', 0.90422004),
 ('ECON 262', 'Behavioral Game Theory', 0.9017492),
 ('POLI 203A', 'Analytic Theory I', 0.900231),
 ('POLI 118', 'Game Theory in Political Science', 0.896047),
 ('POLI 100U', 'Games, Strategy, and Politics', 0.8950685),
 ('ECON 109T', 'Advanced Topics in Game Theory', 0.89384615)]

In [13]:
embedding_search('introduction to calculus')

  response = es.search(index="courses", body=embedding_search_query)


[('MATH 20A', 'Calculus for Science and Engineering', 0.9090444),
 ('ECON 205', 'Mathematics for Economists', 0.9075136),
 ('MATH 20D', 'Introduction to Differential Equations', 0.9041934),
 ('MATH 10B', 'Calculus II', 0.9040549),
 ('MATH 20C',
  'Calculus and Analytic Geometry for Science and Engineering',
  0.89720225),
 ('MATH 10A', 'Calculus I', 0.8948755),
 ('MATH 170C',
  'Introduction to Numerical Analysis: Ordinary Differential Equations',
  0.8934132),
 ('MATH 2', 'Introduction to College Mathematics', 0.89307666),
 ('MATH 279', 'Projects in Computational and Applied Mathematics', 0.8904394),
 ('MATH 142A', 'Introduction to Analysis I', 0.89039785)]