In [None]:
!pip install datasets
!pip install tqdm

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
from torch import Tensor
import torch.nn.functional as F

from tqdm import tqdm

In [None]:
# Load the course-syllabus dataset by its repo id and convert the 'train' split
# to a pandas DataFrame for row-wise processing.
# NOTE(review): downstream code reads a 'course_codes' column and treats every
# column after the first as a section text — confirm against the dataset schema.

dataset = load_dataset("anordkvist/gu-course-syllabus")
df = dataset['train'].to_pandas()

# quick sanity check: (rows, columns) followed by a preview of the first rows
print(df.shape)
df.head()

In [None]:
# List of titles (section headers) in the order they appear in the documents.
# These strings are used as the keys of the per-course embedding dict, so the
# order here must match the order of the section columns in the input frame.
titles = [
    "Confirmation",
    "Position in the educational system",
    "Entry requirements",
    "Learning outcomes",
    "Course content",
    "Form of teaching",
    "Assessment",
    "Grades",
    "Course evaluation",
    "Additional information"
]

# number of section titles, i.e. the number of sections expected per document
num_titles = len(titles)

# Mean-pool token states while making sure padding tokens are not included
# when creating the embedding.
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Average the token vectors of each sequence, ignoring masked positions.

    Positions where ``attention_mask`` is zero are zeroed out before summing
    over dim 1, and the sum is divided by the per-row mask total, so only
    unmasked tokens contribute to the mean.
    """
    # True wherever the mask is 0; trailing axis added so it broadcasts over
    # the last dimension of the hidden states
    is_padding = attention_mask.unsqueeze(-1).bool().logical_not()
    visible_states = last_hidden_states.masked_fill(is_padding, 0.0)

    # number of unmasked tokens per row, kept as a trailing axis for broadcasting
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return visible_states.sum(dim=1) / token_counts

def create_section_embeddings(model_name, df_docs, save_path=None):
  """
  Creates an L2-normalised embedding for each section of each document.

  Args:
    model_name: model id or path handed to ``AutoTokenizer.from_pretrained``
      and ``AutoModel.from_pretrained``.
    df_docs: DataFrame with one row per document. It must contain a
      'course_codes' column; the first column is skipped and every remaining
      column is treated as one section text, in the same order as the
      module-level ``titles`` list.
    save_path: optional directory. When given, the resulting dict is pickled
      to ``<save_path>/section_embeddings.pickle``.

  Returns:
    dict mapping each course code to a dict ``{section title: embedding}``,
    where each embedding is a 1-D CPU tensor.

  Raises:
    ValueError: if the number of section columns differs from ``len(titles)``.
  """
  # Fail fast (before downloading the model): if the column count does not
  # match the title list, titles would be attached to the wrong sections.
  n_sections = df_docs.shape[1] - 1
  if n_sections != len(titles):
    raise ValueError(
        f'Expected {len(titles)} section columns after the first column, '
        f'got {n_sections}'
    )

  # init model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name)
  # move to gpu if available
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model.to(device)
  # inference only: make sure dropout etc. are disabled
  model.eval()

  def create_embedding(text_batch):
    """Embed a Series of texts; returns a (len(text_batch), hidden) CPU tensor."""
    with torch.no_grad():  # inference only, gradients are not tracked
      # convert series into list of strings
      input_texts = text_batch.tolist()
      # tokenize; anything longer than 512 tokens is truncated
      batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
      # move the tokenized batch to the model's device
      batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
      outputs = model(**batch_dict)
      # mean over non-padding tokens, then L2-normalise each row
      embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
      embeddings = F.normalize(embeddings, p=2, dim=1)
      # copy the result to CPU before dropping the device tensors
      embeddings = embeddings.cpu()

      # free up device memory before the next document
      del batch_dict
      del outputs
      torch.cuda.empty_cache()

    return embeddings

  # course code -> {section title -> embedding}
  d_section_embeddings = {}

  print(f'Creating embeddings...')
  # each document is processed as one batch made of its sections
  for _, document in tqdm(df_docs.iterrows(), total=df_docs.shape[0]):
    # exclude the first (course code) column
    sections = document.iloc[1:]
    section_embeddings = create_embedding(sections)
    # Map titles to embeddings per document, so the pairing never depends on
    # index arithmetic over a flat list of all embeddings.
    d_section_embeddings[document['course_codes']] = dict(zip(titles, section_embeddings))

  if save_path:
    import os
    import pickle

    os.makedirs(save_path, exist_ok=True)
    out_file = os.path.join(save_path, 'section_embeddings.pickle')
    print(f'Saving section embeddings to: {out_file}')
    with open(out_file, 'wb') as f:
      pickle.dump(d_section_embeddings, f, pickle.HIGHEST_PROTOCOL)

  return d_section_embeddings

In [None]:
# model id passed to AutoTokenizer / AutoModel inside create_section_embeddings
# NOTE(review): the E5 model card recommends prefixing each input with
# "query: " or "passage: "; no prefix is added in this pipeline — confirm
# that this is intended.
model_name = 'intfloat/e5-large-v2'

# nested dict: course code -> {section title -> embedding tensor}
dict_section_embeddings = create_section_embeddings(model_name, df)

In [None]:
# Flatten the nested dict into a DataFrame so it can be stored on the HF hub.
import pandas as pd

# Outer keys (course codes) become rows, inner keys (section titles) become
# columns; the former index is then exposed as a regular 'course_code' column.
# NOTE: this rebinds `df`, which previously held the raw syllabus frame, so the
# embedding cell above must not be re-run after this one without reloading the data.
df = (
    pd.DataFrame.from_dict(dict_section_embeddings, orient='index')
    .reset_index()
    .rename(columns={'index': 'course_code'})
)

# Every column except the leading course_code holds torch tensors; turn each
# one into a plain Python list (tensor -> numpy array -> list) for the hub.
for column in df.columns[1:]:
    as_lists = df[column].apply(lambda tensor: tensor.numpy().tolist())
    df[column] = as_lists

In [None]:
# push embeddings to anordkvist/gu-course-syllabus-embeddings
import os

from huggingface_hub import login
from datasets import Dataset

dataset = Dataset.from_pandas(df)

# Never hardcode an access token in the notebook: anyone who can read the file
# can use it. The token is read from the HF_TOKEN environment variable; when it
# is not set, login() receives None and asks for the token interactively.
# The token that used to be written here must be treated as leaked and revoked.
login(token=os.environ.get('HF_TOKEN'))

dataset.push_to_hub(repo_id='anordkvist/gu-course-syllabus-embeddings')

In [None]:
# Load the 'train' split again into a separate frame (df_hf) for inspection.
# NOTE(review): this loads the source syllabus repo, not the
# 'anordkvist/gu-course-syllabus-embeddings' repo pushed above — confirm which
# one was meant to be checked here.
from datasets import load_dataset
dataset = load_dataset("anordkvist/gu-course-syllabus")
df_hf = dataset['train'].to_pandas()

In [None]:
# preview the first rows of the reloaded frame
df_hf.head()