**Note**: Before running this code, please ensure an Elasticsearch instance is running first. If not, run 
`docker run --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.7.0 ` 
in your terminal. 

In [1]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

## Step 0: Loading in Data

In [2]:
course_info = pd.read_pickle('../data/course_catalog_final.pkl')
course_info.head()

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Description Embeddings,Title Embeddings,Spring
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.015916548669338226, -0.02055269479751587, ...","[0.020150907337665558, 0.027494141831994057, -...",F
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.014821934513747692, -0.016383204609155655, ...","[0.007122549694031477, 0.03160935267806053, -0...",F
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.011095795780420303, -0.020766522735357285, ...","[-0.015360986813902855, -0.006952735595405102,...",F
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.01937849633395672, 0.002653240691870451, -0...","[-0.006266473326832056, -0.006047536619007587,...",F
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.018711868673563004, -0.014204149134457111,...","[-0.022189151495695114, 0.08559203147888184, 0...",F


In [3]:
len(course_info['Title Embeddings'][0])

384

In [4]:
# Elasticsearch will not take tensors
course_info['Title Embeddings'] = course_info['Title Embeddings'].apply(lambda x: x.tolist())
course_info['Description Embeddings'] = course_info['Description Embeddings'].apply(lambda x: x.tolist())

## Step 1: Set up Elasticsearch

In [5]:
es = Elasticsearch("http://localhost:9200")
es.info().body

{'name': '41a8f2ecdb77',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'bDokc4nkQqWX7deXy6aWUw',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [6]:
# creating mappings 
mappings = {
    "properties": {
        'Code': {"type": "text"}, 
        'Department': {"type": "keyword"}, 
        'Title': {"type": "text"}, 
        'Units': {"type": "text"}, 
        'Description': {"type": "text"}, 
        'Prerequisites': {"type": "text"}, 
        'Level': {"type": "keyword"}, 
        'URL': {"type": "text"}, 
        'Description Embeddings':{'type': 'dense_vector', 'dims':384, "index":True, "similarity":"cosine"},
        'Title Embeddings':{'type': 'dense_vector', 'dims':384, "index":True, "similarity":"cosine"},
    }
}

In [7]:
try:
    es.indices.create(index="courses", mappings=mappings)
except:
    print('passed')
    pass

In [8]:
# Creating a list of dictionaries with all the data to be added in to the ElasticSearch index
bulk_data = []
for i, row in course_info.iterrows():
    bulk_data.append(
        {
            "_index": "courses",
            "_id": i,
            "_source": {
                "Code": row['Code'],
                'Department': row['Department'], 
                'Title': row['Title'], 
                'Units': row['Units'], 
                'Description': row['Description'], 
                'Prerequisites': row['Prerequisites'],
                'Level':row['Level'],
                'URL':row['URL'],
                'Description Embeddings': row['Description Embeddings'],
                'Title Embeddings':row['Title Embeddings'],
            }
        }
    )

bulk(es, bulk_data)

(7169, [])

In [9]:
# verifying that all data has been read into the python index properly
es.indices.refresh(index="courses")
es.cat.count(index="courses", format="json")

ListApiResponse([{'epoch': '1710361726', 'timestamp': '20:28:46', 'count': '7169'}])

## Step 3: Performing Search

### Basic Elasticsearch

In [10]:
def es_search(query, upperdiv=True, lowerdiv=True, graduate=True, include='', exclude='', k=10):
    # Build the query
    must_clauses = [{
        "query_string": {
            "query": query,
            "fields": [
                'Code',
                'Department',
                'Title',
                'Description',
                'Prerequisites',
                'Level'
            ],
            "phrase_slop": 2
        }
    }]

    # Process include and exclude lists
    include_list = include.upper().replace(" ", "").split(',')
    exclude_list = exclude.upper().replace(" ", "").split(',')

    # Add department filters
    if include_list != ['']:
        must_clauses.append({"terms": {"Department": include_list}})
    
    must_not_clause = {"terms": {"Department": exclude_list}} if exclude_list != [''] else []


    # Initialize the 'should' clause for class level filters
    should_clauses = []
    if upperdiv:
        should_clauses.append({"match": {"Level": "Upper Division"}})
    if lowerdiv:
        should_clauses.append({"match": {"Level": "Lower Division"}})
    if graduate:
        should_clauses.append({"match": {"Level": "Graduate"}})

    # Build the final query
    es_query = {
        "query": {
            "bool": {
                "must": must_clauses,
                "should": should_clauses,
                "must_not": must_not_clause,
                # "minimum_should_match": 1 if should_clauses else 0
            }
        },
        "size": k
    }

    response = es.search(index="courses", body=es_query)
    
    results = []
    for hit in response['hits']['hits']:
        row = hit['_source']
        results.append((row['Code'], row['Title'], hit['_score']))
        
    return results

In [11]:
# example
es_search("game theory", k=3)

[('ECON 109', 'Game Theory', 14.601797),
 ('ECON 262', 'Behavioral Game Theory', 13.264897),
 ('POLI 204C', 'Game Theory 1', 13.264897)]

### Semantic Elasticsearch

In [54]:
from FlagEmbedding import FlagModel
import heapq

In [61]:
def sem_es_search(query, k=10):
    """
    Performs a search on embeddings (only can do one field at a time)
    """
    model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
    query_array = model.encode(query).tolist()
    
    description_search_query = {
        "size": k + 5
        ,  # Number of results to return
        "knn": {
            "field": "Description Embeddings",  
            "query_vector": query_array,
            "k": k + 5,  # Number of nearest neighbors to retrieve
            "num_candidates": 8000  # Number of candidate hits the search will examine
        }
    }

    title_search_query = {
        "size": k + 5,  # Number of results to return
        "knn": {
            "field": "Title Embeddings",  
            "query_vector": query_array,
            "k": k + 5,  # Number of nearest neighbors to retrieve
            "num_candidates": 8000  # Number of candidate hits the search will examine
        }
    }
    
    desc_response = es.search(index="courses", body=description_search_query)
    title_response = es.search(index="courses", body=title_search_query)

    # combine weighted results
    desc_weight = 0.3
    title_weight = 0.7
    combined_results = {}
    for item in title_response['hits']['hits']:
        id_ = item['_id']
        combined_results[id_] = {'score': title_weight * item['_score'], 'doc': item}
    for item in desc_response['hits']['hits']:
        id_ = item['_id']
        if id_ in combined_results:
            combined_results[id_]['score'] += desc_weight * item['_score']
        else:
            combined_results[id_] = {'score': desc_weight * item['_score'], 'doc': item}
    
    # get top k results from combined scores
    top_k_items = heapq.nlargest(k, combined_results.values(), key=lambda x: x['score'])
    
    # return top k results (with info)
    results = []
    for item in top_k_items:
        row = item['doc']['_source']
        results.append((row['Code'], row['Title'], item['doc']['_score']))
    return results

    # if you want to look at individual performance instead
    # individual_results = ['-------------------------------------']
    # for response in [desc_response, title_response]:
    #     for hit in response['hits']['hits']:
    #         row = hit['_source']
    #         individual_results.append((row['Code'], row['Title'], hit['_score']))
    #     individual_results.append('-------------------------------------')
    # return individual_results


In [62]:
# example semantic search
sem_es_search("game theory", k=3)

[('ECON 109', 'Game Theory', 1.0),
 ('POLI 204C', 'Game Theory 1', 0.9684946),
 ('POLI 205', 'Game Theory II', 0.9508383)]