In [6]:
import os
from collections import defaultdict
from io import StringIO

import pandas as pd
import requests
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel

In [7]:
# URL of the CSV file containing sample comments
url = "https://raw.githubusercontent.com/tobiaswtzl/dlss-project24/main/data/preprocessed/comments.csv"

# Credentials must never live in the notebook: read the GitHub token from the
# GITHUB_TOKEN environment variable instead. When it is unset the request is
# sent without an Authorization header.
# NOTE(review): the token that used to be hardcoded here was stored in plain
# text and should be treated as leaked and revoked.
github_token = os.environ.get("GITHUB_TOKEN")
headers = {"Authorization": f"token {github_token}"} if github_token else {}

# Sending a GET request to the specified URL with the provided headers.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=60)

# Fail loudly on 4xx/5xx instead of handing an error page to the CSV parser
response.raise_for_status()

# Creating a StringIO object from the response text to simulate a file-like object
data = StringIO(response.text)

# Reading the CSV data into a pandas DataFrame
comments = pd.read_csv(data)

In [8]:
# Hold out 30% of the comments, then cut that hold-out in half to obtain the
# validation and test sets (both splits use the same fixed seed).
train_df, temp_df = train_test_split(comments, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# The vocabulary is generated from ALL comments; otherwise an error occurs
# later when a split contains tokens that are missing from the vocabulary.
total_comments_list = comments["lemmatized"].dropna().astype(str).tolist()

# Per-split text lists: drop missing entries and force every entry to str
train_list, val_list, test_list = (
    frame["lemmatized"].dropna().astype(str).tolist()
    for frame in (train_df, val_df, test_df)
)

# Split each text on whitespace into a list of words
total_corpus = [text.split() for text in total_comments_list]
corpus_train, corpus_val, corpus_test = (
    [text.split() for text in texts]
    for texts in (train_list, val_list, test_list)
)

# Count how often each word occurs across the whole corpus
vocab = defaultdict(int)
for tokens in total_corpus:
    for token in tokens:
        vocab[token] += 1

min_count = 6

# Keep only the words that occur at least `min_count` times
vocab = dict(
    (token, occurrences)
    for token, occurrences in vocab.items()
    if occurrences >= min_count
)

# Word <-> index lookups, numbered in vocabulary insertion order
word_to_index = {token: position for position, token in enumerate(vocab)}
index_to_word = dict(enumerate(vocab))

# Tabular view of the vocabulary
vocab_df = pd.DataFrame(list(vocab.items()), columns=["Word", "Count"])

vocab_set = set(vocab)

def filter_corpus(corpus, vocab_set):
    """
    Drops every word that is not part of the vocabulary.

    Args:
    - corpus: Iterable of documents, each an iterable of words.
    - vocab_set: Container of the words to keep.

    Returns:
    - A list with one list per document, holding the kept words in their
      original order (a document may end up empty).
    """
    filtered_docs = []
    for doc in corpus:
        kept_words = [token for token in doc if token in vocab_set]
        filtered_docs.append(kept_words)
    return filtered_docs

# Apply the vocabulary filter to the full corpus and to each split
(
    filtered_total_corpus,
    filtered_corpus_train,
    filtered_corpus_val,
    filtered_corpus_test,
) = [
    filter_corpus(split_corpus, vocab_set)
    for split_corpus in (total_corpus, corpus_train, corpus_val, corpus_test)
]

In [9]:
#FUNCTIONS
# Prepare the text for the model
def tokenize_corpus(corpus, tokenizer, max_length=512):
    """
    Encodes every document of the corpus with the given tokenizer.

    The words of each document are joined with single spaces and the resulting
    string is passed to the tokenizer on its own (one call per document),
    requesting padding to `max_length`, truncation and PyTorch tensors.

    Args:
    - corpus: List of lists, where each sublist is a list of words.
    - tokenizer: Callable tokenizer applied to each joined document.
    - max_length: Maximum length of tokens per input.

    Returns:
    - List with one tokenizer result per document, in corpus order.
    """
    def _encode(words):
        # One tokenizer call per document; settings are the same for all
        return tokenizer(
            " ".join(words),
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

    return [_encode(words) for words in corpus]


def run_model_on_corpus(tokenized_corpus, model, device=None):
    """
    Passes the tokenized inputs to the model and returns the outputs.

    Inputs are processed one at a time with gradient tracking disabled.

    Args:
    - tokenized_corpus: List of tokenized inputs; each item is a mapping from
      argument name to tensor and is unpacked into the model call as keywords.
    - model: The model to run.
    - device: The device (CPU/GPU) to run the model on. Optional: when omitted,
      the model is left where it is and the device of its first parameter is
      used for the inputs (CPU if the model has no parameters).

    Returns:
    - outputs: List of model outputs, one per input, in input order.

    Note:
    - The outputs are appended exactly as the model returns them, so they stay
      on `device`; for a large corpus they accumulate in that device's memory.
    """
    if device is None:
        # Infer the device from the model so callers need not pass it
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    else:
        model.to(device)  # Move model to the specified device

    outputs = []
    # Inference only: a single no_grad context covers the whole loop
    with torch.no_grad():
        for tokenized_input in tokenized_corpus:
            # Move input tensors to the same device as the model
            inputs_on_device = {k: v.to(device) for k, v in tokenized_input.items()}
            outputs.append(model(**inputs_on_device))
    return outputs

Usage of dunzhang/stella_en_1.5B_v5, which is the second-best model on the leaderboard but requires less memory.

https://huggingface.co/dunzhang/stella_en_1.5B_v5

In [10]:
# Load the tokenizer and model
tokenizer_stella = AutoTokenizer.from_pretrained("dunzhang/stella_en_1.5B_v5")
model_stella = AutoModel.from_pretrained("dunzhang/stella_en_1.5B_v5")

# Tokenize the filtered corpus
tokenized_train = tokenize_corpus(filtered_corpus_train, tokenizer_stella)
tokenized_val = tokenize_corpus(filtered_corpus_val, tokenizer_stella)
tokenized_test = tokenize_corpus(filtered_corpus_test, tokenizer_stella)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_stella.to(device)

# Run the model on the tokenized training data
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error. Every model output is kept in `train_outputs` on `device`, so memory
# use grows with the size of the training set.
train_outputs = run_model_on_corpus(tokenized_train, model_stella, device)

# If you want to extract embeddings or further process the outputs, you can do so here.
# For example, to get the last hidden state:
train_hidden_states = [output.last_hidden_state for output in train_outputs]

# Display a compact summary instead of dumping every output tensor into the notebook
len(train_outputs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 

Usage of BAAI/bge-en-icl, which is the number-one NLP model on the leaderboard.

https://huggingface.co/BAAI/bge-en-icl

In [None]:
# Load the tokenizer and model
tokenizer_bge = AutoTokenizer.from_pretrained("BAAI/bge-en-icl")
model_bge = AutoModel.from_pretrained("BAAI/bge-en-icl")

# Tokenize the filtered corpus
tokenized_train = tokenize_corpus(filtered_corpus_train, tokenizer_bge)
tokenized_val = tokenize_corpus(filtered_corpus_val, tokenizer_bge)
tokenized_test = tokenize_corpus(filtered_corpus_test, tokenizer_bge)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_bge.to(device)

# Run the model on the tokenized training data.
# `device` is passed explicitly so the inputs are moved to the same device as
# the model, matching the (tokenized_corpus, model, device) signature.
train_outputs = run_model_on_corpus(tokenized_train, model_bge, device)

# If you want to extract embeddings or further process the outputs, you can do so here.
# For example, to get the last hidden state:
train_hidden_states = [output.last_hidden_state for output in train_outputs]