In [1]:
import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

## Step 0: Load Data

In [3]:
course_info = pd.read_pickle('../data/course_catalog_with_embeddings.pkl')
# first column unneeded
# course_info = course_info.drop(course_info.columns[0], axis=1)
course_info.head()

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Description Embeddings,Title Embeddings
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.1893), tensor(0.4135), tensor(-0.14...","[tensor(0.1341), tensor(0.0716), tensor(-0.119..."
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.2007), tensor(0.4261), tensor(0.143...","[tensor(0.0525), tensor(0.3550), tensor(-0.113..."
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.0163), tensor(0.5243), tensor(-0.14...","[tensor(-0.2652), tensor(0.4275), tensor(-0.41..."
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(0.5637), tensor(0.1932), tensor(-0.447...","[tensor(0.2329), tensor(0.0559), tensor(-0.580..."
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[tensor(-0.1039), tensor(0.4214), tensor(-0.43...","[tensor(0.2310), tensor(-0.0024), tensor(-0.41..."


In [None]:
# Elasticsearch will not take tensors
course_info['Title Embeddings'] = course_info['Title Embeddings'].apply(lambda x: x.tolist())
course_info['Description Embeddings'] = course_info['Description Embeddings'].apply(lambda x: x.tolist())


In [None]:
course_info.head()

In [None]:
# course_info = course_info.astype(str)
# course_info = course_info.drop_duplicates()

## Step 1: Set up Elasticsearch

In [6]:
es = Elasticsearch("http://localhost:9200")
es.info().body

{'name': '5b7bad3897c1',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'okw4nkaNQDCUaCqDEKyQPQ',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [7]:
# creating mappings 
mappings = {
    "properties": {
        'Code': {"type": "text"}, 
        'Department': {"type": "keyword"}, 
        'Title': {"type": "text"}, 
        'Units': {"type": "text"}, 
        'Description': {"type": "text"}, 
        'Prerequisites': {"type": "text"}, 
        'Level': {"type": "keyword"}, 
        'URL': {"type": "text"}, 
        # 'Description Embeddings':{'type': 'dense_vector', 'dims':768, "index":True, "similarity":"cosine"},
        # 'Title Embeddings':{'type': 'dense_vector', 'dims':768, "index":True, "similarity":"cosine"},
    }
}

In [8]:
try:
    es.indices.create(index="courses", mappings=mappings)
except:
    print('passed')
    pass

## Step 2: Adding Data into ElasticSearch

In [9]:
# Creating a list of dictionaries with all the data to be added in to the ElasticSearch index
bulk_data = []
for i, row in course_info.iterrows():
    bulk_data.append(
        {
            "_index": "courses",
            "_id": i,
            "_source": {
                "Code": row['Code'],
                'Department': row['Department'], 
                'Title': row['Title'], 
                'Units': row['Units'], 
                'Description': row['Description'], 
                'Prerequisites': row['Prerequisites'],
                'Level':row['Level'],
                'URL':row['URL'],
                # 'Description Embeddings': row['Description Embeddings'],
                # 'Title Embeddings':row['Title Embeddings'],
            }
        }
    )

bulk(es, bulk_data)

(7169, [])

In [10]:
# verifying that all data has been read into the python index properly
es.indices.refresh(index="courses")
es.cat.count(index="courses", format="json")

ListApiResponse([{'epoch': '1709155990', 'timestamp': '21:33:10', 'count': '7169'}])

## Step 3: Performing Search

In [36]:
def es_search(query, upperdiv=True, lowerdiv=True, graduate=True, include='', exclude='', k=10):
    # Build the query
    must_clauses = [{
        "query_string": {
            "query": query,
            "fields": [
                'Code',
                'Department',
                'Title',
                'Description',
                'Prerequisites',
                'Level'
            ],
            "phrase_slop": 2
        }
    }]

    # Process include and exclude lists
    include_list = include.upper().replace(" ", "").split(',')
    exclude_list = exclude.upper().replace(" ", "").split(',')

    # Add department filters
    if include_list != ['']:
        must_clauses.append({"terms": {"Department": include_list}})
    
    must_not_clause = {"terms": {"Department": exclude_list}} if exclude_list != [''] else []


    # Initialize the 'should' clause for class level filters
    should_clauses = []
    if upperdiv:
        should_clauses.append({"match": {"Level": "Upper Division"}})
    if lowerdiv:
        should_clauses.append({"match": {"Level": "Lower Division"}})
    if graduate:
        should_clauses.append({"match": {"Level": "Graduate"}})

    # Build the final query
    es_query = {
        "query": {
            "bool": {
                "must": must_clauses,
                "should": should_clauses,
                "must_not": must_not_clause,
                # "minimum_should_match": 1 if should_clauses else 0
            }
        },
        "size": k
    }

    response = es.search(index="courses", body=es_query)
    
    results = []
    for hit in response['hits']['hits']:
        row = hit['_source']
        results.append((row['Code'], row['Title'], row['Description']))
        
    return results


In [22]:
# es_search('game theory', exclude="POLI")

In [None]:
# test_results = {'Code':['DSC 100', 'DSC 80', 'DSC 10', "DSC 200","POLI 117"]}

# def filter(results, upperdiv, lowerdiv, graduate, include, exclude):
#     df = pd.DataFrame(results).merge(course_info, on="Code")

#     if not upperdiv:
#         df.drop(df.loc[df['Level']=='Upper Division'].index, inplace=True)

#     if not lowerdiv:
#         df.drop(df.loc[df['Level']=='Lower Division'].index, inplace=True)

#     if not graduate:
#         df.drop(df.loc[df['Level']=='Graduate'].index, inplace=True)

#     # pre-processing of include and exlude
#     include_list = include.upper().upper().replace(" ", "").split(',')
#     exclude_list = exclude.upper().upper().replace(" ", "").split(',')

#     include_mask = df['Department'].isin(include_list)
#     exclude_mask = ~df['Department'].isin(exclude_list)

#     # Combine masks and filter the DataFrame
#     filtered_df = df[include_mask & exclude_mask]

#     return filtered_df

# filter(test_results, True, False, True, "dsc", "poli")

In [17]:
# def embedding_search(query):
#     """
#     Performs a search on embeddings (only can do one field at a time)
#     """
#     model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4')
#     query_array = model.encode(query).tolist()
    
#     description_search_query = {
#         "size": 10,  # Number of results to return
#         "knn": {
#             "field": "Description Embeddings",  
#             "query_vector": query_array,
#             "k": 10,  # Number of nearest neighbors to retrieve
#             "num_candidates": 8000  # Number of candidate hits the search will examine
#         }
#     }

#     title_search_query = {
#         "size": 10,  # Number of results to return
#         "knn": {
#             "field": "Title Embeddings",  
#             "query_vector": query_array,
#             "k": 10,  # Number of nearest neighbors to retrieve
#             "num_candidates": 8000  # Number of candidate hits the search will examine
#         }
#     }
    
#     desc_response = es.search(index="courses", body=description_search_query)
#     title_response = es.search(index="courses", body=title_search_query)

#     # combined_results = desc_response['hits']['hits'] + title_response['hits']['hits']
    
#     # results = []
#     # for result in combined_results:
#     #     row = result['_source']
#     #     results.append((row['Code'], row['Title'], result['_score']))

#     results = []
#     for response in [desc_response, title_response]:
#         for hit in response['hits']['hits']:
#             row = hit['_source']
#             results.append((row['Code'], row['Title'], hit['_score']))
#         results.append('-------------------------------------')
   
#     return results


In [13]:
queries = ["2d art", "accounting", "acting", "advance biological topics", "advanced machine learning methods", "ai", 
          "alcohol", "algae", "algebra", "algos", "analytics", "anbi", "animal", "aquarium", "archeology", "art", 
          "artificial intelligence", "asia", "astronomy", "audio", "autonomous vehicles", "bananas", "baseball", 
          "basket weaving", "bayes theorem", "beer", "beng", "biochemistry", "bioinformatics", "biology", "bird", 
          "black markets", "board games", "botany", "brain", "brawl stars", "bubble machine", "building vehicles", 
          "business", "chamber music", "city planning", "clout", "code", "coding", "coffee", "communication", 
          "composite", "computer", "computer science electives", "computer vision", "computing", "conservation", 
          "convolutional neural networks", "cook", "cooking", "course search engines", "covid 19", "creative writing", 
          "creativity", "credit", "credit cards", "cryptography", "culture", "cum", "cystic", "cystic fibrosis", 
          "dance", "data science capstone", "data science ethics", "data science project", "data science theory", 
          "data science topics", "data structures", "data visualization", "database", "deep learning", "definite integrals", 
          "demon", "design", "digital design", "digital photography", "disability", "dog", "downsides of social media", 
          "drawing", "drugs", "economics", "electrical engineering", "electrodynamics", "electromagnetism", "engineering", 
          "enviornmental engineering", "ethics of social media", "exercise", "exploring the ocean", "fashion", "film", 
          "finance", "fish", "fisheries", "food", "football", "forensics", "forestry", "fortnite", "fourier analysis", 
          "game theory", "games", "gaming", "garden", "genetic", "genetic algorithm", "genetics", "glizzy", "herb", 
          "hiphop", "history", "history and engineering", "how the internet works", "how to build a website", 
          "how to buy a house", "how to teach kids computer science", "human biology", "human interaction", "ice cream", 
          "image processing", "information systems", "intensive writing", "interaction", "intro data science", 
          "intro to data science", "introduction to calculus", "islam", "japan", "japanese", "java", "javascript", "joke", 
          "kanye", "korean", "kubernetes", "l'hospital's rule", "language models", "large code bases", "latin", 
          "latin american history", "law", "learning sciences", "learning sign language", "lgbt", "linguistic", "llm", 
          "love", "machine", "machine learning", "makeup", "marine biology lab", "marketing", "matrix multiplication", 
          "meaders", "medical robotics", "medical school", "medicinal", "medicine", "meme", "mental health", "michelangelo", 
          "middle east", "modern architecture", "movie", "music", "nan", "national security", "native american literature", 
          "network", "neural networks", "neuroscience", "nlp", "nosql", "occult", "ocean", "ocean sensors", "operator algebra", 
          "optimization", "organic chemistry", "pain", "painting", "panda", "personal finances", "penis", "phage", 
          "pharmacology", "photoshop", "physics", "physiology", "pizza", "plant", "poisson", "pokemon", "poop", "pottery", 
          "projects", "python coding", "python for beginners", "pythons", "quantitative finance", "racism", "real estate",
            "recommender systems", "remote", "research communication", "robotics", "saxophone", "science writing",
            "semantic search", "semantics", "seminar", "sex", "sixth general education", "skincare", "snakes", "social media",
            "something with 2d", "sport", "sql", "stock trading", "storm water", "super bowl", "super computer", "supply chain",
            "survival skills", "synthesis", "taxes", "teaching calculus", "transportation", 
            "ucsd course search", "ui and ux", "ups", "urban studies", "valentines day", "video game",
            "video game creation", "video game design", "video games", "vietnam", "volleyball",
            "wastewater", "website", "wine", "yoga", "youtube", "zoo", "zoology"]

In [38]:
all_results
for q in queries:
    all_results += es_search(q, k=5)

BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'Failed to parse query [ui/ux]')

In [None]:
pd.DataFrame(all_results)

In [37]:
all_results = []

to_add = es_search('data science', k=5)
all_results += to_add
all_results += (es_search('planes', k=5))
all_results

[('DSC 80',
  'The Practice and Application of Data Science',
  'The marriage of data, computation, and inferential thinking, or “data science,” is redefining how people and organizations solve challenging problems and understand the world. This course bridges lower- and upper-division data science courses as well as methods courses in other fields. Students master the data science life-cycle and learn many of the fundamental principles and techniques of data science spanning algorithms, statistics, machine learning, visualization, and data systems.'),
 ('DSC 95',
  'Tutor Apprenticeship in Data Science',
  'Students will receive training in skills and techniques necessary to be effective tutors for data science courses. Students will also gain practical experience in tutoring students on data science topics.'),
 ('COGS 108',
  'Data Science in Practice',
  'Data science is multidisciplinary, covering computer science, statistics, cognitive science and psychology, data visualization, a

In [19]:
embedding_search('introduction to calculus')

  return self.fget.__get__(instance, owner)()


[('POLI 270', 'Mathematical and Statistical Foundations', 0.73850524),
 ('PHYS 2A', 'Physics—Mechanics', 0.7245373),
 ('PHYS 2AR', 'Physics—Mechanics (distance education)', 0.72253084),
 ('ECON 280', 'Computation', 0.72087157),
 ('ECON 205', 'Mathematics for Economists', 0.71879166),
 ('MATH 103A', 'Modern Algebra I', 0.717171),
 ('MATH 142B', 'Introduction to Analysis II', 0.7144685),
 ('MATH 3B', 'Foundations of Precalculus', 0.7086526),
 ('MATH 110', 'Introduction to Partial Differential Equations', 0.70828575),
 ('MATH 10B', 'Calculus II', 0.70151496),
 '-------------------------------------',
 ('SIOC 203A', 'Introduction to Applied Mathematics I', 0.84996086),
 ('PHIL 10', 'Introduction to Logic', 0.8465594),
 ('MAE 294A', 'Introduction to Applied Mathematics', 0.8461771),
 ('MATH 10A', 'Calculus I', 0.84524536),
 ('MATH 20D', 'Introduction to Differential Equations', 0.8412711),
 ('MATH 2', 'Introduction to College Mathematics', 0.8373743),
 ('MATH 11',
  'Calculus-Based Introduc

In [20]:
embedding_search('game theory')

[('ECON 109', 'Game Theory', 0.823559),
 ('POLI 100U', 'Games, Strategy, and Politics', 0.82306385),
 ('ECON 208', 'Games and Information', 0.8096264),
 ('POLI 203A', 'Analytic Theory I', 0.7998295),
 ('POLI 203B', 'Analytic Theory II', 0.7922381),
 ('POLI 118', 'Game Theory in Political Science', 0.7833301),
 ('POLI 205', 'Game Theory II', 0.771317),
 ('ECON 262', 'Behavioral Game Theory', 0.7551508),
 ('POLI 204C', 'Game Theory 1', 0.73355067),
 ('POLI 211', 'Formal Models in Political Theory', 0.7028576),
 '-------------------------------------',
 ('ECON 109', 'Game Theory', 0.99999994),
 ('POLI 204C', 'Game Theory 1', 0.97212493),
 ('POLI 118', 'Game Theory in Political Science', 0.9246417),
 ('ECON 262', 'Behavioral Game Theory', 0.8475761),
 ('ECON 109T', 'Advanced Topics in Game Theory', 0.82947564),
 ('POLI 205', 'Game Theory II', 0.81930065),
 ('COGR 239', 'Computer Game Studies', 0.78623635),
 ('POLI 100U', 'Games, Strategy, and Politics', 0.7699954),
 ('ECON 208', 'Games and

In [21]:
embedding_search('computer art')

[('VIS 142', 'Practices in Computing Arts', 0.76777357),
 ('VIS 159', 'History of Art and Technology', 0.7421306),
 ('VIS 149', 'Seminar in Contemporary Computer Topics', 0.72628015),
 ('VIS 10', 'Computing in the Arts Lecture Series', 0.7215397),
 ('CSE 272', 'Advanced Image Synthesis', 0.71468794),
 ('CSE 274', 'Selected Topics in Graphics', 0.703153),
 ('VIS 147A', 'Electronic Technologies for Art I', 0.6988392),
 ('CSE 152B', 'Introduction to Computer Vision II', 0.6959293),
 ('CSE 163', 'Advanced Computer Graphics', 0.69464904),
 ('MUS 171', 'Computer Music I', 0.6830322),
 '-------------------------------------',
 ('CSE 169', 'Computer Animation', 0.85925657),
 ('VIS 141A', 'Computer Programming for the Arts I', 0.8318141),
 ('CSE 167', 'Computer Graphics', 0.8307047),
 ('MAE 291', 'Design and Mechanics in Computer Technology', 0.7837784),
 ('VIS 141B', 'Computer Programming for the Arts II', 0.7710541),
 ('ECE 30', 'Introduction to Computer Engineering', 0.76057684),
 ('CSE 193'