In [1]:
!pip install datasets
!pip install -U sentence-transformers



In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity  # note: shadowed by the cosine_similarity defined further down
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [3]:
# load embeddings, this will be a pandas df with one embedding (a list of floats) per cell
# The downloaded dataset gets its own name: `reference_embeddings` is bound further down
# to the 'Course content' column, and sharing that name would make the similarity search
# fail whenever this cell is re-run after the later one.
embeddings_dataset = load_dataset("anordkvist/gu-course-syllabus-embeddings")
df_embeddings = embeddings_dataset['train'].to_pandas()

print(df_embeddings.shape)
df_embeddings.head()

(2971, 11)


Unnamed: 0,course_code,Confirmation,Position in the educational system,Entry requirements,Learning outcomes,Course content,Form of teaching,Assessment,Grades,Course evaluation,Additional information
0,GS2532,"[0.01716105453670025, -0.026554130017757416, 0...","[0.016933701932430267, -0.039054516702890396, ...","[-0.01404751930385828, -0.03766262158751488, -...","[0.005510212387889624, -0.03996856138110161, -...","[0.001180240884423256, -0.034723859280347824, ...","[0.03018626570701599, -0.051153115928173065, 0...","[0.0028430288657546043, -0.036798927932977676,...","[0.01037866435945034, -0.046425629407167435, -...","[0.011339511722326279, -0.03879973292350769, 0...","[0.03251421079039574, -0.05399147793650627, 0...."
1,FEA471,"[0.01081564649939537, -0.03007451631128788, 0....","[-0.011405865661799908, -0.058512911200523376,...","[-0.016064323484897614, -0.05973084643483162, ...","[0.03075617179274559, -0.05152737721800804, 0....","[-0.013770826160907745, -0.059195626527071, 0....","[0.03075617179274559, -0.05152737721800804, 0....","[-0.014199590310454369, -0.05461760610342026, ...","[0.0034296647645533085, -0.029118631035089493,...","[0.011512437835335732, -0.030733948573470116, ...","[0.014594447799026966, -0.042849600315093994, ..."
2,PT2214,"[0.009092817083001137, -0.032789260149002075, ...","[0.007765460293740034, -0.026354366913437843, ...","[0.010043337941169739, -0.05205079913139343, 0...","[-0.0016323194140568376, -0.05323197692632675,...","[0.01740981824696064, -0.04114428162574768, 0....","[0.017448483034968376, -0.047597385942935944, ...","[0.006252621300518513, -0.04249892011284828, 0...","[-0.0029416403267532587, -0.05673420429229736,...","[0.028365440666675568, -0.04533102363348007, 0...","[0.008878586813807487, -0.052575305104255676, ..."
3,DIT822,"[0.027984850108623505, -0.04347119852900505, 0...","[0.01576567254960537, -0.03299769386649132, 0....","[0.0008193698013201356, -0.03754168003797531, ...","[0.017853254452347755, -0.05903192609548569, 0...","[0.008794586174190044, -0.06108751893043518, -...","[0.04326609522104263, -0.03631548956036568, -0...","[-0.012982221320271492, -0.0448671355843544, 0...","[0.0014109815238043666, -0.05849870666861534, ...","[0.03179146721959114, -0.03972075507044792, -0...","[0.024700647220015526, -0.04204489663243294, 0..."
4,IMP205,"[0.02445601485669613, -0.04656608775258064, 0....","[0.005030200816690922, -0.04483957588672638, 0...","[0.003042041091248393, -0.04473581910133362, 0...","[0.013423990458250046, -0.05831574648618698, -...","[-0.002570592798292637, -0.04824570193886757, ...","[0.014627245254814625, -0.04130963608622551, 0...","[-0.011793104000389576, -0.04982934892177582, ...","[0.014974703080952168, -0.0418265238404274, 0....","[0.00881048385053873, -0.026931874454021454, -...","[0.010092210955917835, -0.05954776331782341, 0..."


In [4]:
# load cleaned text
cleaned_text = load_dataset("anordkvist/gu-course-syllabus")
df_text = cleaned_text['train'].to_pandas()

# The similarity search yields row positions of df_embeddings, and get_top_k reads the
# text at those same positions, so both frames must list the courses in the same order.
# The recorded outputs show different row counts for the two datasets (2971 vs 2975),
# so the order is verified here instead of being assumed.
n_shared = min(len(df_embeddings), len(df_text))
rows_aligned = len(df_embeddings) <= len(df_text) and bool(
    (df_embeddings['course_code'].to_numpy()[:n_shared]
     == df_text['course_code'].to_numpy()[:n_shared]).all()
)
if not rows_aligned:
  # Rebuild df_text in embedding order: one text row per embedding row, matched on
  # course_code (duplicate codes in the text keep their first occurrence; codes with no
  # text end up as NaN rows).
  print('course_code order differs between embeddings and text, re-aligning text on course_code')
  df_text = df_embeddings[['course_code']].merge(
      df_text.drop_duplicates(subset='course_code'), on='course_code', how='left')

print(df_text.shape)
df_text.head()

(2975, 11)


Unnamed: 0,course_code,Confirmation,Position in the educational system,Entry requirements,Learning outcomes,Course content,Form of teaching,Assessment,Grades,Course evaluation,Additional information
0,GS2532,"passage: course code: gs2532, confirmation thi...","passage: course code: gs2532, position in the ...","passage: course code: gs2532, entry requiremen...","passage: course code: gs2532, learning outcome...","passage: course code: gs2532, course content e...","passage: course code: gs2532, form of teaching...","passage: course code: gs2532, assessment progr...","passage: course code: gs2532, grades the gradi...","passage: course code: gs2532, course evaluatio...","passage: course code: gs2532,"
1,FEA471,"passage: course code: fea471, confirmation the...","passage: course code: fea471, position in the ...","passage: course code: fea471, entry requiremen...","passage: course code: fea471,","passage: course code: fea471, course content t...","passage: course code: fea471,","passage: course code: fea471, assessment learn...","passage: course code: fea471, grades are trans...","passage: course code: fea471, course evaluatio...","passage: course code: fea471, additional infor..."
2,PT2214,"passage: course code: pt2214, confirmation thi...","passage: course code: pt2214, position in the ...","passage: course code: pt2214, entry requiremen...","passage: course code: pt2214, learning outcome...","passage: course code: pt2214, course content s...","passage: course code: pt2214, form of teaching...","passage: course code: pt2214, assessment of th...","passage: course code: pt2214, grades the gradi...","passage: course code: pt2214, course evaluatio...","passage: course code: pt2214, additional infor..."
3,DIT822,"passage: course code: dit822, confirmation thi...","passage: course code: dit822, position in the ...","passage: course code: dit822, entry requiremen...","passage: course code: dit822, learning outcome...","passage: course code: dit822, course content t...","passage: course code: dit822, form of teaching...","passage: course code: dit822, assessment the c...","passage: course code: dit822, grades the gradi...","passage: course code: dit822, course evaluatio...","passage: course code: dit822, additional infor..."
4,IMP205,"passage: course code: imp205, confirmation thi...","passage: course code: imp205, position in the ...","passage: course code: imp205, entry requiremen...","passage: course code: imp205, learning outcome...","passage: course code: imp205, course content t...","passage: course code: imp205, form of teaching...","passage: course code: imp205, assessment the s...","passage: course code: imp205, grades the gradi...","passage: course code: imp205, course evaluatio...","passage: course code: imp205, additional infor..."


In [5]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """
    Mean-pool token vectors along dim 1, leaving out masked positions.

    Entries of last_hidden_states whose attention_mask value is zero are set
    to 0.0 before summing; each sum is then divided by the number of
    non-masked positions of its row.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(keep.logical_not(), 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts

def embed_query(query, tokenizer, model):
  """
  Given a query, tokenizer and model, creates an embedding returned as a 1d np array.

  query: the text to embed (a single string)
  tokenizer: callable turning a list of texts into model inputs, including 'attention_mask'
  model: callable whose output exposes `last_hidden_state`

  Returns the mean-pooled, L2-normalized embedding flattened to one dimension.
  """
  # Tokenize the input text (wrapped in a list, i.e. a batch of one)
  batch_dict = tokenizer([query], max_length=512, padding=True, truncation=True, return_tensors='pt')

  # Pure inference: no_grad avoids building an autograd graph for the forward pass
  with torch.no_grad():
    # create embeddings
    outputs = model(**batch_dict)
    embedding = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    # normalize embeddings
    embedding = F.normalize(embedding, p=2, dim=1)

  # .cpu() is a no-op for CPU tensors and is needed before .numpy() for any other device
  return embedding.detach().cpu().numpy().reshape(-1) # reshape to 1d np array

def cosine_similarity(references, query):
  """
  Computes cosine similarity between references and query, vectorized.

  references: sequence of equal-length vectors (stacked row-wise into a 2d array)
  query: a single vector with the same length as each reference

  Returns a 1d array with one similarity per reference. A zero-length reference
  or query has no direction, so its similarity is reported as 0.0 instead of NaN;
  NaN would be placed first by a descending argsort and pollute the ranking.
  """
  # Convert references to a 2d NumPy array and the query to a flat vector
  reference_matrix = np.vstack(references)
  query_vector = np.asarray(query).reshape(-1)

  # Euclidean norms of every reference row and of the query
  reference_norms = np.linalg.norm(reference_matrix, axis=1)
  query_norm = np.linalg.norm(query_vector)

  # Compute dot product of every reference with the query
  dot_products = np.dot(reference_matrix, query_vector)

  # Compute cosine similarity; entries with a zero denominator keep the 0.0 they start with
  denominators = reference_norms * query_norm
  similarities = np.zeros(dot_products.shape, dtype=np.result_type(dot_products, denominators))
  np.divide(dot_products, denominators, out=similarities, where=denominators > 0)
  return similarities

def get_top_k(docs, similarities, k, verbose=False):
  """
  Get the rows of `docs` belonging to the k highest similarities.

  docs: DataFrame whose row positions correspond to the positions in `similarities`
  similarities: 1d sequence of similarity scores, one per row of `docs`
  k: number of rows to return
  verbose: if True, print the selected row positions and their scores

  Returns a DataFrame with the selected rows, best match first. The index is
  reset, so the original index is kept as an 'index' column.
  """
  similarities = np.asarray(similarities)
  # sort in descending order
  sorted_indices = np.argsort(similarities)[::-1]
  # Select the top k indices and their similarity score
  top_k_indices = sorted_indices[:k]
  top_k_values = similarities[top_k_indices]

  if verbose:
    print(f'idx: {top_k_indices}, similarity: {top_k_values}')

  # get the text for top docs from the frame that was passed in (not from a global)
  top_docs_text = docs.iloc[top_k_indices].reset_index()

  return top_docs_text

def reranker(query, docs, model_name, column='Course content'):
  """
  Re-rank retrieved documents with a cross encoder.

  query: the query text
  docs: DataFrame with the retrieved documents from the similarity search
  model_name: name of the cross encoder model to load
  column: column of `docs` holding the text to score (defaults to the course content)

  Returns the selected column as a Series, ordered from highest to lowest
  cross-encoder score; the original index labels are kept.
  """
  candidate_texts = docs[column]
  # nothing to rank: skip loading the model
  if len(candidate_texts) == 0:
    return candidate_texts

  # init model
  model = CrossEncoder(model_name, max_length=512)
  # create sentence pairs
  sentence_pairs = [(query, doc) for doc in candidate_texts]
  # predict
  scores = model.predict(sentence_pairs)
  # sort scores in descending order
  sorted_indices = np.argsort(scores)[::-1]

  # sort docs in new ranking order
  return candidate_texts.iloc[sorted_indices]

In [6]:
# the query encoder has to be the model that produced the stored embeddings
embedding_model_name = 'intfloat/e5-large-v2'
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
model = AutoModel.from_pretrained(embedding_model_name)

# test query to embed - important: every query must start with "query:"
# other queries that were tried:
#   'query: I want to study economics, what are the first basic course i should take?'
#   'query: Databases'
query1 = 'query: What advanced mathematics courses are available for engineering students?'
query_embedding1 = embed_query(query1, tokenizer, model)

# stored embeddings the search runs against
reference_embeddings = df_embeddings['Course content']

In [13]:
# score every stored embedding against the query embedding
similarities = cosine_similarity(references=reference_embeddings, query=query_embedding1)
# keep the text rows of the 5 best matches (verbose also prints their positions and scores)
top_docs_text = get_top_k(docs=df_text, similarities=similarities, k=5, verbose=True)

idx: [ 743 2621  234  858 2800], similarity: [0.82176449 0.81848319 0.81707513 0.81566413 0.81565551]


In [45]:
# cross encoder used for the second ranking pass
cross_encoder_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
# re-score the retrieved docs against the query
reranked_top_docs = reranker(query=query1, docs=top_docs_text, model_name=cross_encoder_name)

# show the retrieved docs in their new order (the index is the previous cosine rank)
reranked_top_docs.head()

# next step would be to pass the docs we want to the generation part

0    passage: course code: mma630, course content e...
2    passage: course code: mmg410, course content b...
4    passage: course code: kem131, course content t...
3    passage: course code: mmg800, course content c...
1    passage: course code: ki1130, course content p...
Name: Course content, dtype: object

In [49]:
# for comparison: the same docs in cosine similarity order
top_docs_text.head()['Course content']

0    passage: course code: mma630, course content e...
1    passage: course code: ki1130, course content p...
2    passage: course code: mmg410, course content b...
3    passage: course code: mmg800, course content c...
4    passage: course code: kem131, course content t...
Name: Course content, dtype: object