In [14]:
import time
import json

from FlagEmbedding import FlagModel

import numpy as np
import pandas as pd
import heapq
import re
import spacy
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

## Step 0: Read in the Data + Model

In [5]:
course_catalog = pd.read_csv('../data/course_catalog.csv')
course_catalog

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Spring24
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
...,...,...,...,...,...,...,...,...,...
7164,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,True
7165,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
7166,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False
7167,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,False


In [6]:
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

## Step 1: Load in the Document Embeddings

In [7]:
start = time.time()

# read the json file from title_bge_embeddings.json
with open('../data/title_bge_embeddings.json', 'r') as json_file:
    title_embeddings_dict = json.load(json_file)

# convert each list value to a numpy array
title_embeddings = []
for key, value in title_embeddings_dict.items():
    title_embeddings.append(np.array(value))
end = time.time()

print("Time taken to load in the title embeddings:", end - start, "seconds")

Time taken to load in the title embeddings: 4.5997254848480225 seconds


In [8]:
start = time.time()

# read the json file from desc_bge_embeddings.json
with open('../data/desc_bge_embeddings.json', 'r') as json_file:
    desc_embeddings_dict = json.load(json_file)

# convert each list value to a numpy array
desc_embeddings = []
for key, value in desc_embeddings_dict.items():
    desc_embeddings.append(np.array(value))
end = time.time()

print("Time taken to load in the description embeddings:", end - start, "seconds")

Time taken to load in the description embeddings: 4.497195243835449 seconds


## Step 2: Implement the Search Function

In [12]:
# Uncomment to download en_core_web_sm model if not downloaded already
# !python -m spacy download en_core_web_sm

In [15]:
nlp = spacy.load('en_core_web_sm')

In [51]:
def preprocess_and_embed(text):
#   Preprocessing using NLTK
#     ===============================================================
#      # Convert to lowercase
#     text = text.lower()
    
#     # Tokenizes text
#     tokens = re.split(r'[^a-zA-Z0-9]+', text)
    
#     # Remove stopwords
#     stop_words = set(stopwords.words('english'))
#     tokens = [token for token in tokens if token not in stop_words]
    
#     # Lemmatize
#     lemmatizer = WordNetLemmatizer()
#     tokens = [lemmatizer.lemmatize(token) for token in tokens]

#     preprocessed_text = ' '.join(tokens)
#   =================================================================

    # uses spacy to preprocess text
    preprocessed_text = str(nlp(text))

    return model.encode(preprocessed_text)

In [53]:
def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two vectors
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [55]:
def search(query, title_embeddings, desc_embeddings, data, k=10):
    """
    Computes the embedding of the query and retrieves the k most similar documents
    """
    # if the query is a course code, return just the row containing the course code
    if query.upper() in data['Code'].values:
        exact_code = data[data['Code'] == query.upper()].iloc[0]
        return [(exact_code['Code'], exact_code['Title'], 1.0)] # return the row of the exact course match (sim = 1.0)
                 
    # gets the embedding of the query
    query_embedding = preprocess_and_embed(query)
    
    # get the similarities between the query embedding and the title embeddings
    title_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in title_embeddings])
    
    # get the similarities between the query embedding and the description embeddings
    desc_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in desc_embeddings])
    
    # weight the title and description similarities (weights should add up to 1) and calculate total similarity
    title_weight = 0.7
    desc_weight = 0.3
    similarities = (title_similarities * title_weight) + (desc_similarities * desc_weight)
    
    # ranks similarities by most similar to query embedding
    index_similarity_pair_ranked =  heapq.nlargest(k, enumerate(similarities), key=lambda x: x[1])
    
    ranked_docs = []
    for ind, sim in index_similarity_pair_ranked:
        # if cosine similarity < 0.7 and at least 3 courses have been added to search results, stop adding to search results
        if sim < 0.7 and len(ranked_docs) >= 3:
            break
        # grab the course code, the course title, and the similarity score
        ranked_docs.append((data['Code'][ind], data['Title'][ind], sim))
    
    return ranked_docs

## Step 3: Testing the Search Engine

In [56]:
start = time.time()
query = "introduction to calculus"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.4777712821960449 seconds


[('MATH 10A', 'Calculus I', 0.8489875799224692),
 ('MATH 20A', 'Calculus for Science and Engineering', 0.8330968716528182),
 ('MATH 10B', 'Calculus II', 0.8287532945654694),
 ('MATH 20B', 'Calculus for Science and Engineering', 0.8198155476027544),
 ('MATH 20D', 'Introduction to Differential Equations', 0.8159379240283797),
 ('MAE 105', 'Introduction to Mathematical Physics', 0.8008051317512308),
 ('MATH 110',
  'Introduction to Partial Differential Equations',
  0.7931030513350209),
 ('MATH 170C',
  'Introduction to Numerical Analysis: Ordinary Differential Equations',
  0.792091578761086),
 ('MATH 157', 'Introduction to Mathematical Software', 0.7909416271555552),
 ('MATH 15A', 'Introduction to Discrete Mathematics', 0.7899653219147139)]

In [57]:
start = time.time()
query = "teaching math"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.48476672172546387 seconds


[('MATH 95', 'Introduction to Teaching Math', 0.8690410046747863),
 ('MATH 195', 'Introduction to Teaching in Mathematics', 0.853111930171725),
 ('MATH 121A',
  'Foundations of Teaching and Learning Mathematics I',
  0.839943495568798),
 ('EDS 374', 'Secondary Mathematics Teaching Practices', 0.8285358902703903),
 ('MATH 121B',
  'Foundations of Teaching and Learning Math II',
  0.8193766865004611),
 ('MSED 297',
  'Special Topics in Math and Science Education',
  0.8159380082896769),
 ('EDS 385',
  'Elementary School Mathematics Content and Pedagogy',
  0.8144931824053239),
 ('MATH 153', 'Geometry for Secondary Teachers', 0.814024777280814),
 ('EDS 355C',
  'Advanced Mathematics Teaching Practices for Grades K–6',
  0.8071020744088768),
 ('EDS 355A',
  'Advanced Mathematics Teaching Practices for Grades K–6',
  0.8064081267716726)]

In [58]:
start = time.time()
query = "sign language"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.4772927761077881 seconds


[('LISL 1D', 'American Sign Language Conversation', 0.8566984175276343),
 ('LISL 1C', 'American Sign Language Conversation', 0.8513451789116507),
 ('LISL 1A', 'American Sign Language Conversation', 0.850322739568772),
 ('LISL 1B', 'American Sign Language Conversation', 0.8494378577502517),
 ('LIGN 7', 'Sign Languages and Their Cultures', 0.8482194606686209),
 ('LIGN 280', 'Sign Language Research', 0.8439243210799603),
 ('LISL 1DX', 'Analysis of American Sign Language', 0.8422970939071233),
 ('LIGN 148',
  'The Psycholinguistics of Gesture and Sign Languages',
  0.8406767363211997),
 ('LISL 1E',
  'Intermediate American Sign Language Conversation',
  0.8324622849928092),
 ('LISL 1CX', 'Analysis of American Sign Language', 0.8260225990669664)]

In [59]:
start = time.time()
query = "game theory"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.49006056785583496 seconds


[('ECON 109', 'Game Theory', 0.9324998181038461),
 ('POLI 204C', 'Game Theory 1', 0.8960405688258588),
 ('POLI 205', 'Game Theory II', 0.8704689598469136),
 ('ECON 262', 'Behavioral Game Theory', 0.8544199015792345),
 ('POLI 118', 'Game Theory in Political Science', 0.8527695242528832),
 ('ECON 109T', 'Advanced Topics in Game Theory', 0.8370550081013244),
 ('POLI 100U', 'Games, Strategy, and Politics', 0.7760351728166678),
 ('MATH 280A', 'Probability Theory I', 0.7737567040057383),
 ('ECON 204', 'Contract Theory', 0.7703438224845501),
 ('PHIL 125', 'Probability and Decision Theory', 0.7664280790520638)]

In [60]:
start = time.time()
query = "math 10a"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.017027616500854492 seconds


[('MATH 10A', 'Calculus I', 1.0)]

In [61]:
start = time.time()
query = "soccer"
search_results = search(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.4677999019622803 seconds


[('HITO 123', 'The Global History of Soccer', 0.7334517640455127),
 ('PSYC 139', 'The Social Psychology of Sport', 0.6842908495901094),
 ('COMM 111T', 'CCP: Cultural Politics of Sport', 0.6568139681455102)]

## Step 4: Implementing Learning to Rank

In [18]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [118]:
# Load hand labeled test dataset
hand_labeled_test = pd.read_csv('../data/hand_labeled_test.csv')
hand_labeled_test

Unnamed: 0,Query,Code,Relevance
0,introduction to calculus,MATH 10A,3
1,introduction to calculus,MATH 20A,3
2,introduction to calculus,MATH 10B,2
3,introduction to calculus,MATH 20B,2
4,introduction to calculus,MATH 10C,1
...,...,...,...
218,religion,ANSC 125,2
219,religion,ANSC 154,1
220,religion,LTWL 138,1
221,religion,RELI 1,1


In [119]:
# Create embeddings columns in course_catalog and save as course_catalog_embed
course_catalog_embed = course_catalog.copy()
course_catalog_embed['Title Embeddings'] = title_embeddings
course_catalog_embed['Desc Embeddings'] = desc_embeddings
course_catalog_embed

Unnamed: 0,Code,Department,Title,Units,Description,Prerequisites,Level,URL,Title Embeddings,Desc Embeddings
0,AIP 97,AIP,Academic Internship,"2, 4",Individual placements for field learning. Must...,"lower-division standing, completion of thirty ...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.020150907337665558, 0.027494141831994057, -...","[-0.015916548669338226, -0.02055269479751587, ..."
1,AIP 197,AIP,Academic Internship Program,"2, 4, 6, 8, 10, 12",Individual internship placements integrated wi...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.007122549694031477, 0.03160935267806053, -0...","[0.014821934513747692, -0.016383204609155655, ..."
2,AIP 197DC,AIP,"UCDC: Washington, DC Internship","6, 8, 10",This internship is attached to the University ...,upper-division standing; department approval.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.015360986813902855, -0.006952735595405102,...","[0.011095795780420303, -0.020766522735357285, ..."
3,AIP 197P,AIP,Public Service Internship,"4, 8, 12",Individual placements for field learning perfo...,ninety units completed; 2.5 minimum cumulative...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.006266473326832056, -0.006047536619007587,...","[0.01937849633395672, 0.002653240691870451, -0..."
4,AIP 197T,AIP,Academic Internship Program—Special Programs,2,Individual placements for field learning assoc...,ninety units minimum completed; 2.5 minimum cu...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.022189151495695114, 0.08559203147888184, 0...","[-0.018711868673563004, -0.014204149134457111,..."
...,...,...,...,...,...,...,...,...,...,...
7164,WCWP 100,WCWP,Academic Writing,4,An upper-division workshop course in argumenta...,junior/senior standing and must be a Warren Co...,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.004997400101274252, 0.051742229610681534, ...","[-0.0769183561205864, -0.08011184632778168, -0..."
7165,WCWP 160,WCWP,Technical Writing for Scientists and Engineers,4,An upper-division workshop-style writing cours...,junior/senior standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.01645762473344803, 0.11270517855882645, 0....","[0.018340999260544777, -0.014060850255191326, ..."
7166,WARR 189,WARR,Academic Mentoring and the Writing Process,2,Students will gain a fundamental understanding...,permission of instructor is required to enroll.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.01667705550789833, 0.037703923881053925, -...","[-0.06008476763963699, -0.030637728050351143, ..."
7167,WCWP 198,WCWP,Group Study,2,A directed group study involving research and ...,none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.025717351585626602, 0.032808322459459305, -...","[0.015909047797322273, 0.04573538899421692, -0..."


In [120]:
# Left join hand labeled test set with course_catalog_embed
ltr_data = pd.merge(hand_labeled_test, course_catalog_embed, how="left", left_on="Code", right_on="Code")
ltr_data

Unnamed: 0,Query,Code,Relevance,Department,Title,Units,Description,Prerequisites,Level,URL,Title Embeddings,Desc Embeddings
0,introduction to calculus,MATH 10A,3,MATH,Calculus I,4,Differential calculus of functions of one vari...,"Math Placement Exam qualifying score, or AP Ca...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.028717810288071632, -0.009232687763869762,...","[-0.03349634259939194, -0.028811199590563774, ..."
1,introduction to calculus,MATH 20A,3,MATH,Calculus for Science and Engineering,4,Foundations of differential and integral calcu...,"Math Placement Exam qualifying score, or AP Ca...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.009116207249462605, 0.029863763600587845, ...","[-0.03580307215452194, 0.03098887763917446, 0...."
2,introduction to calculus,MATH 10B,2,MATH,Calculus II,4,Integral calculus of functions of one variable...,"AP Calculus AB score of 3, 4, or 5 (or equival...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.05535414442420006, -0.020128319039940834, ...","[-0.044424429535865784, -0.05418340489268303, ..."
3,introduction to calculus,MATH 20B,2,MATH,Calculus for Science and Engineering,4,Integral calculus of one variable and its appl...,"AP Calculus AB score of 4 or 5, or AP Calculus...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.009116207249462605, 0.029863763600587845, ...","[-0.04570493474602699, 0.05882224813103676, 0...."
4,introduction to calculus,MATH 10C,1,MATH,Calculus III,4,Introduction to functions of more than one var...,"AP Calculus BC score of 3, 4, or 5, or MATH 10...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.045021943747997284, -0.014576049521565437,...","[-0.06486564129590988, -0.021479932591319084, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
218,religion,ANSC 125,2,ANSC,"Gender, Sexuality, and Society",4,How are gender and sexuality shaped by cultura...,upper-division standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.06735498458147049, 0.012076237238943577, 0....","[0.04372513294219971, 0.042502693831920624, 0...."
219,religion,ANSC 154,1,ANSC,Gender and Religion,4,This course explores the intersections of reli...,upper-division standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.07439419627189636, 0.05450890213251114, -0....","[0.0884685218334198, 0.05951087549328804, -0.0..."
220,religion,LTWL 138,1,LTWL,Critical Religion Studies,4,"Selected topics, texts, and problems in the st...",none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.007093604654073715, 0.05521330609917641, -0...","[0.024651162326335907, 0.07051291316747665, -0..."
221,religion,RELI 1,1,RELI,Introduction to Religion,4,An introduction to key topics in the study of ...,none,Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.012707609683275223, 0.0533902645111084, 0.0...","[0.0022306889295578003, 0.08763747662305832, -..."


In [123]:
# Calculate the cosine similarities between the query and title, as well as between the query and description
title_query_sims = []
desc_query_sims = []
for index, row in ltr_data.iterrows():
    # get the embedding for the query
    query_embedding = preprocess_and_embed(row['Query'])
    
    # get cosine similarity between query and course title
    query_title_sim = cosine_similarity(query_embedding, row['Title Embeddings'])
    # get cosine similarity between query and course description
    query_desc_sim = cosine_similarity(query_embedding, row['Desc Embeddings'])
    
    title_query_sims.append(float(query_title_sim))
    desc_query_sims.append(float(query_desc_sim))
    
ltr_data['Title Similarity'] = title_query_sims
ltr_data['Desc Similarity'] = desc_query_sims

ltr_data

Unnamed: 0,Query,Code,Relevance,Department,Title,Units,Description,Prerequisites,Level,URL,Title Embeddings,Desc Embeddings,Title Similarity,Desc Similarity
0,introduction to calculus,MATH 10A,3,MATH,Calculus I,4,Differential calculus of functions of one vari...,"Math Placement Exam qualifying score, or AP Ca...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.028717810288071632, -0.009232687763869762,...","[-0.03349634259939194, -0.028811199590563774, ...",0.826263,0.678525
1,introduction to calculus,MATH 20A,3,MATH,Calculus for Science and Engineering,4,Foundations of differential and integral calcu...,"Math Placement Exam qualifying score, or AP Ca...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.009116207249462605, 0.029863763600587845, ...","[-0.03580307215452194, 0.03098887763917446, 0....",0.767332,0.763446
2,introduction to calculus,MATH 10B,2,MATH,Calculus II,4,Integral calculus of functions of one variable...,"AP Calculus AB score of 3, 4, or 5 (or equival...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.05535414442420006, -0.020128319039940834, ...","[-0.044424429535865784, -0.05418340489268303, ...",0.770875,0.723407
3,introduction to calculus,MATH 20B,2,MATH,Calculus for Science and Engineering,4,Integral calculus of one variable and its appl...,"AP Calculus AB score of 4 or 5, or AP Calculus...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.009116207249462605, 0.029863763600587845, ...","[-0.04570493474602699, 0.05882224813103676, 0....",0.767332,0.690720
4,introduction to calculus,MATH 10C,1,MATH,Calculus III,4,Introduction to functions of more than one var...,"AP Calculus BC score of 3, 4, or 5, or MATH 10...",Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[-0.045021943747997284, -0.014576049521565437,...","[-0.06486564129590988, -0.021479932591319084, ...",0.752670,0.583165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,religion,ANSC 125,2,ANSC,"Gender, Sexuality, and Society",4,How are gender and sexuality shaped by cultura...,upper-division standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.06735498458147049, 0.012076237238943577, 0....","[0.04372513294219971, 0.042502693831920624, 0....",0.554074,0.638262
219,religion,ANSC 154,1,ANSC,Gender and Religion,4,This course explores the intersections of reli...,upper-division standing.,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.07439419627189636, 0.05450890213251114, -0....","[0.0884685218334198, 0.05951087549328804, -0.0...",0.774815,0.735266
220,religion,LTWL 138,1,LTWL,Critical Religion Studies,4,"Selected topics, texts, and problems in the st...",none,Upper Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.007093604654073715, 0.05521330609917641, -0...","[0.024651162326335907, 0.07051291316747665, -0...",0.712002,0.688348
221,religion,RELI 1,1,RELI,Introduction to Religion,4,An introduction to key topics in the study of ...,none,Lower Division,https://cape.ucsd.edu/responses/Results.aspx?N...,"[0.012707609683275223, 0.0533902645111084, 0.0...","[0.0022306889295578003, 0.08763747662305832, -...",0.791152,0.699303


In [124]:
# split into X features and y label
X = ltr_data[['Title Similarity', 'Desc Similarity']]
y = ltr_data['Relevance']

In [125]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [147]:
# Create the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train, group=[X_train.shape[0]])
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, group=[X_test.shape[0]])

In [166]:
# Define parameters
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg'
}

# Train the model
ltr_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=100)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 117
[LightGBM] [Info] Number of data points in the train set: 178, number of used features: 2


In [167]:
# Predict the relevance scores on the test set
y_pred = ltr_model.predict(X_test)
y_pred

array([ 2.5068525 , -3.24289162,  3.76968233, -5.61693661, -5.03910183,
       -3.5228815 , -3.51844093, -3.11427661, -7.58849106, -1.23926199,
        4.12665431, -3.79584961,  1.72673144,  4.57617469,  2.26112562,
       -1.1686449 , -1.55588833, -0.79316219, -3.10871986, -5.44012656,
       -5.14367449, -2.38147888, -1.74505915, -3.51277162, -3.38070036,
       -3.46820637, -0.95165667, -6.36105806, -4.17297209, -1.43892991,
       -7.61680314, -0.45186258, -3.21729046, -2.39439213, -6.21928828,
       -5.8092754 , -3.69689493, -7.47797409, -4.06911293, -2.65438478,
        2.32858605, -6.427709  , -3.69718297, -7.80727602, -7.32102409])

In [168]:
# Evaluate the model
ndcg = ndcg_score(y_test.values.reshape(1, -1), y_pred.reshape(1, -1))
print(f'NDCG Score: {ndcg}')

NDCG Score: 0.9424415090170758


In [169]:
# Save the model
ltr_model.save_model('../data/ltr_model.txt')

<lightgbm.basic.Booster at 0x1b58d3b7340>

## Step 5: Search with LTR

In [196]:
def search_ltr(query, title_embeddings, desc_embeddings, data, k=10):
    """
    Computes the embedding of the query and retrieves the k most similar documents
    """
    # gets the embedding of the query
    query_embedding = preprocess_and_embed(query)
    
    # get the similarities between the query embedding and the title embeddings
    title_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in title_embeddings])
    
    # get the similarities between the query embedding and the description embeddings
    desc_similarities = np.array([cosine_similarity(query_embedding, doc_emb) for doc_emb in desc_embeddings])
    
    
    
    # weight the title and description similarities (weights should add up to 1) and calculate total similarity
    title_weight = 0.7
    desc_weight = 0.3
    initial_similarities = (title_similarities * title_weight) + (desc_similarities * desc_weight)
    
    # Select top N documents to re-rank where N > k
    N = 50
    top_n_index = np.argsort(-initial_similarities)[:N]
    subset = data.iloc[top_n_index].reset_index()

    # Extract features for the LTR model and re-rank
    ltr_features = np.array([title_similarities[top_n_index], desc_similarities[top_n_index]]).T
    ltr_scores = ltr_model.predict(ltr_features)
    top_ltr_index = np.argsort(-ltr_scores)[:k]

    # Retrieve final ranked documents
    ranked_docs = []
    for ind in top_ltr_index:
        # You can filter out documents below a certain threshold score if needed
        ranked_docs.append((subset['Code'].iloc[ind], subset['Title'].iloc[ind], ltr_scores[ind]))
    
    return ranked_docs

In [197]:
start = time.time()
query = "introduction to calculus"
search_results = search_ltr(query, title_embeddings, desc_embeddings, course_catalog)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 0.48241567611694336 seconds


[('ANTH 268', 'Introduction to Science Studies: Part I', 2.3082249677744193),
 ('CHEM 96', 'Introduction to Teaching Science', 2.2981050574137893),
 ('MATH 20A', 'Calculus for Science and Engineering', 2.0833361347893407),
 ('MATH 20D', 'Introduction to Differential Equations', 1.8172154006890517),
 ('MATH 4C', 'Precalculus for Science and Engineering', 1.4740255605038992),
 ('VIS 70N', 'Introduction to Media', -0.29931560868988977),
 ('COGS 3', 'Introduction to Computing', -0.3202103868836362),
 ('MATH 144', 'Introduction to Fourier Analysis', -0.3202103868836362),
 ('MATH 10A', 'Calculus I', -0.35058186556347937),
 ('SIOC 203C', 'Introduction to Applied Mathematics III', -0.36842482409883)]