In [None]:
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel

In [None]:
# Read the training CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='data/train_clean.csv')

In [None]:
# Pull the 'comment' and 'class' columns out of the frame as plain Python lists
comments = df.loc[:, 'comment'].tolist()
labels = df.loc[:, 'class'].tolist()

In [None]:
# Load the DarijaBERT tokenizer and model (It's over 900MB to download).
# The checkpoint id is kept in a single constant so the tokenizer and the
# encoder are always loaded from the same checkpoint.
MODEL_NAME = "SI2M-Lab/DarijaBERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
# Pick the compute device: CUDA when torch reports one is available
# (e.g. a Google Colab GPU runtime), otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Batching configuration: comments are processed batch_size at a time
batch_size = 20
num_sentences = len(comments)
# Ceiling division, so a trailing partial batch is counted as a batch too
num_batches = -(-num_sentences // batch_size)

# Wrap the comments in a DataLoader that yields them in their original order
data_loader = DataLoader(dataset=comments, batch_size=batch_size, shuffle=False)

# One tensor per processed batch. Each is copied to the CPU right after the
# forward pass so device memory does not grow with the number of batches.
embeddings_list = []

# Move the model to the target device once, instead of on every iteration.
model.to(device)
# Inference only: make sure layers such as dropout are in evaluation mode.
model.eval()

# Process the batches one after another (the DataLoader above is created
# without worker processes). no_grad() skips autograd bookkeeping.
with torch.no_grad():
    for batch_comments in data_loader:
        # Pad/truncate every comment to max_length=128 tokens and place the
        # encoded tensors on the same device as the model.
        encoded_inputs = tokenizer(
            batch_comments,
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors='pt',
        ).to(device)
        # Hidden states of the model's last layer for this batch.
        batch_embeddings = model(**encoded_inputs).last_hidden_state
        embeddings_list.append(batch_embeddings.cpu())
        print(f'Batch {len(embeddings_list)}/{num_batches} completed')

# Concatenate the embeddings from all batches along the first dimension.
# NOTE: the result is a CPU tensor, since each batch was moved off the device.
embeddings = torch.cat(embeddings_list, dim=0)

In [None]:
# Save the embeddings for later use.
# np.save takes the target path AND the array to write. A torch tensor has to
# be detached and on the CPU before it can be converted with .numpy().
# The output directory is created first so the write does not fail when
# "model/" does not exist yet.
os.makedirs("model", exist_ok=True)
np.save("model/embeddings.npy", embeddings.detach().cpu().numpy())