In [1]:
from FlagEmbedding import FlagModel
import numpy as np
import pandas as pd
import heapq
import spacy

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [7]:
# Input files, relative to the notebook's working directory.
TRITON_RESULTS_CSV = '../data/tritonsearch_results.csv'
COURSE_CATALOG_PKL = '../data/course_catalog_final.pkl'

# Baseline search results; its 'Query' column supplies the queries to replay.
triton_search = pd.read_csv(TRITON_RESULTS_CSV)
queries = triton_search['Query'].unique()

# Course catalog used as the search corpus.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
data = pd.read_pickle(COURSE_CATALOG_PKL)

In [8]:
# spaCy pipeline applied to query text before it is embedded.
nlp = spacy.load('en_core_web_sm')

# Embedding model. use_fp16=True trades a slight loss in quality for faster computation.
model = FlagModel(
    'BAAI/bge-small-en-v1.5',
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True,
)

In [10]:
def filter(df, springOnly, upper_div, lower_div, graduate, include, exclude):
    """
    Filter a course DataFrame by level of study, Spring availability and department.

    Note: this function shadows Python's builtin `filter` in this namespace.

    Parameters:
    - df: DataFrame to filter. The 'Level', 'Spring' and 'Department' columns are
      read when the corresponding option is active.
    - springOnly: Boolean, True to keep only rows whose 'Spring' value is 'T'.
    - upper_div: Boolean, True to include Upper Division levels.
    - lower_div: Boolean, True to include Lower Division levels.
    - graduate: Boolean, True to include Graduate levels.
    - include: String of departments to include (separated by commas and/or
      whitespace, case-insensitive). Falsy to skip this filter.
    - exclude: String of departments to exclude (same format as `include`).
      Falsy to skip this filter.

    Returns:
    - A new DataFrame containing the matching rows, re-indexed from 0.
      If none of the level flags is True the result is empty.
    """
    # Start from "nothing selected" and OR in each requested level.
    conditions = pd.Series(False, index=df.index)
    if upper_div:
        conditions |= (df['Level'] == 'Upper Division')
    if lower_div:
        conditions |= (df['Level'] == 'Lower Division')
    if graduate:
        conditions |= (df['Level'] == 'Graduate')
    # The Spring restriction narrows whatever levels were selected above.
    if springOnly:
        conditions &= (df['Spring'] == 'T')

    # Apply level / Spring filtering
    df = df[conditions]

    # Apply department inclusion and exclusion. Commas are treated as
    # whitespace so "cse, math" and "CSE MATH" are equivalent.
    if include:
        include_list = include.upper().replace(",", " ").split()
        df = df[df['Department'].isin(include_list)]
    if exclude:
        exclude_list = exclude.upper().replace(",", " ").split()
        df = df[~df['Department'].isin(exclude_list)]

    # Positional consumers expect a contiguous 0..n-1 index.
    return df.reset_index(drop=True)
 
def preprocess_and_embed(text):
    """
    Run `text` through the spaCy pipeline `nlp`, convert the result back to a
    string, and return the embedding `model.encode` produces for that string.
    """
    # NOTE(review): `model` is configured with a query instruction, but this calls
    # `encode`; presumably only `encode_queries` applies that instruction -- confirm
    # against the FlagEmbedding documentation.
    doc = nlp(text)
    return model.encode(str(doc))

def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two vectors.

    Parameters:
    - vec1, vec2: array-likes accepted by np.dot / np.linalg.norm.

    Returns:
    - dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either vector has
      zero norm (the similarity is undefined there; 0.0 avoids a NaN result).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-norm vector would make this 0/0 -> NaN plus a RuntimeWarning, and
    # NaN scores cannot be ranked reliably by callers.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

def search(query, data, k):
    """
    Retrieves the courses in `data` that best match `query`.

    Three cases are handled, in this order:
    1. `query` (upper-cased) equals a value in data['Code']: returns a one-element
       list holding a (Code, Title, Description, Prerequisites, URL, Spring) tuple
       for the first matching row.
    2. `query` (upper-cased) equals a value in data['Department']: returns every
       row of that department as a list of
       [Code, Title, Description, Prerequisites, URL, Spring] lists (not limited by k).
    3. Otherwise: embeds the query, scores each row by a weighted sum of the
       cosine similarities to its title and description embeddings, and returns
       up to k (Code, Title, Description) tuples, best first.

    Parameters:
    - query: search string.
    - data: DataFrame with 'Code', 'Department', 'Title', 'Description',
      'Prerequisites', 'URL', 'Spring', 'Title Embeddings' and
      'Description Embeddings' columns. Any index is accepted.
    - k: maximum number of ranked results (converted with int()).

    Returns:
    - A list shaped as described above; callers that only need code, title and
      description can rely on positions 0-2 in all three cases.
    """
    k = int(k)
    title_embeddings = data['Title Embeddings']
    desc_embeddings = data['Description Embeddings']
    query_upper = query.upper()

    # if the query is a course code, return just the row containing the course code
    if query_upper in data['Code'].values:
        exact_code = data[data['Code'] == query_upper].iloc[0]
        return [(exact_code['Code'], exact_code['Title'], exact_code['Description'], exact_code['Prerequisites'], exact_code['URL'], exact_code['Spring'])]
    # if the query is a course department, return all courses from that department
    if query_upper in data['Department'].values:
        exact_dept = data[data['Department'] == query_upper]
        return exact_dept[['Code', 'Title', 'Description', 'Prerequisites', 'URL', 'Spring']].values.tolist()

    # gets the embedding of the query
    query_embedding = preprocess_and_embed(query)

    # similarities between the query embedding and the title / description embeddings
    title_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in title_embeddings])
    desc_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in desc_embeddings])

    # weight the title and description similarities (weights should add up to 1) and calculate total similarity
    title_weight = 0.7
    desc_weight = 0.3
    similarities = (title_similarities * title_weight) + (desc_similarities * desc_weight)

    # (row position, similarity) pairs for the k highest scores, best first
    top_matches = heapq.nlargest(k, enumerate(similarities), key=lambda pair: pair[1])

    codes = data['Code']
    titles = data['Title']
    descriptions = data['Description']

    ranked_docs = []
    for pos, sim in top_matches:
        # if cosine similarity < 0.7 and at least 3 courses have been added to search results, stop adding to search results
        if sim < 0.7 and len(ranked_docs) >= 3:
            break
        # `pos` comes from enumerate, so it is a row position, not an index label:
        # use .iloc so frames without a default 0..n-1 index work too.
        ranked_docs.append((codes.iloc[pos], titles.iloc[pos], descriptions.iloc[pos]))

    return ranked_docs

In [12]:
# One [query, code, title, description] row per search hit.
all_results = []

for q in queries:
    results = search(q, data, 5)
    # Keep only the first three fields of each hit: code, title, description.
    all_results.extend([q, hit[0], hit[1], hit[2]] for hit in results)
    # Pad with placeholder rows until the running total is a multiple of 5.
    # A department match can return more than 5 hits, in which case the padding
    # fills up to the next multiple of 5 rather than to exactly 5 rows.
    while len(all_results) % 5:
        all_results.append([q, 'NO RESULTS', 'NO RESULTS', 'NO RESULTS'])

In [13]:
# Preview only the first few rows instead of dumping the entire list into the cell output.
all_results[:5]

[['2d art',
  'VIS 1',
  'Introduction to Art Making: Two-Dimensional Practices',
  'An introduction to the concepts and techniques of two-dimensional art making with an emphasis on drawing. Lectures and studio classes will introduce skills and concepts of contemporary drawing practice in relation to a variety of genres such as illustration, comics, advertising, animation, and other forms of visualization. This course is offered only one time each year.'],
 ['2d art',
  'VIS 106A',
  'Painting: Image Making',
  'A studio course focusing on problems inherent in painting—transferring information and ideas onto a two-dimensional surface, color, composition, as well as manual and technical procedures. These concepts will be explored through the use of models, still life, and landscapes.'],
 ['2d art',
  'VIS 2',
  'Introduction to Art Making: Motion and Time-Based Art',
  'An introduction to art making utilizing the transaction between people, objects, situations, and media. Includes both 

In [15]:
# Name the columns in the constructor: assigning `.columns` afterwards raises a
# length-mismatch ValueError when `all_results` is empty.
ucs_results = pd.DataFrame(
    all_results,
    columns=['Query', 'Class Code', 'Class Title', 'Class Description'],
)
ucs_results

Unnamed: 0,Query,Class Code,Class Title,Class Description
0,2d art,VIS 1,Introduction to Art Making: Two-Dimensional Pr...,An introduction to the concepts and techniques...
1,2d art,VIS 106A,Painting: Image Making,A studio course focusing on problems inherent ...
2,2d art,VIS 2,Introduction to Art Making: Motion and Time-Ba...,An introduction to art making utilizing the tr...
3,2d art,VIS 105A,Drawing: Representing the Subject,A studio course in beginning drawing covering ...
4,2d art,VIS 3,Introduction to Art Making: Three-Dimensional ...,An introduction to art making that uses as its...
...,...,...,...,...
720,javascript,LIGN 121,Syntax I,What universal principles determine how words ...
721,javascript,CSE 134B,Web Client Languages,Design and implementation of interactive World...
722,javascript,LIGN 221,Syntax,"An introduction to syntactic phenomena, argume..."
723,javascript,NO RESULTS,NO RESULTS,NO RESULTS


In [16]:
# Write the results table to CSV; index=False leaves the DataFrame index out of the file.
ucs_results.to_csv('../data/ucs_results.csv', index=False)