In [1]:
import os
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import nltk
from nemo.collections.nlp.models import BERTLMModel
from transformers import AutoTokenizer

In [2]:
# Fetch the punkt tokenizer data: both the "punkt" and "punkt_tab" packages,
# in that order. Indexing with [-1] keeps the status returned by the final
# download as the cell's displayed value.
[nltk.download(package) for package in ("punkt", "punkt_tab")][-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load Preprocessed Data
# The CSV is expected at <current working directory>/data/CleanedData_Augmented.csv.
# NOTE: `DirPpath` (sic) keeps its original spelling so that any other cell
# referring to it continues to work.
DirPpath = Path.cwd()
# Join path components with pathlib rather than concatenating strings, so the
# separator is always correct for the host OS. `data_path` remains a plain str.
data_path = str(DirPpath / "data" / "CleanedData_Augmented.csv")

df = pd.read_csv(data_path)

# Fail fast with a readable message: the following cells read df["Pledge"].
if "Pledge" not in df.columns:
    raise KeyError(
        f"Expected a 'Pledge' column in {data_path}; found columns: {list(df.columns)}"
    )

# Debug: Print the first few entries of the DataFrame
print("First few entries of the DataFrame:")
print(df.head())

First few entries of the DataFrame:
   Unnamed: 0  Topic                                             Pledge
0           0      1  Actually we as an association are still pretty...
1           1      1  EFFAT welcomes the Commission Proposal for a R...
2           2      1  HOTREC calls for a level playing field and fai...
3           3      1  Estonia sees the need to synchronize and harmo...
4           4      1  Sphere Travel Club contributes to a flourishin...


In [4]:
# Pull the pledge texts out of the DataFrame as a plain Python list, then
# split each text into word-level tokens with NLTK.
documents = list(df["Pledge"])
tokens = list(map(nltk.word_tokenize, documents))

In [6]:
# Tokenizer loaded from the "bert-large-uncased" pretrained name
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

# Fetch the pre-trained NeMo BERT model by name and place it on the GPU when
# CUDA is available, otherwise on the CPU.
# (Alternative kept for reference: restore a local checkpoint with
#  BERTLMModel.restore_from("/home/jovyan/datafabric/Bertlargeuncased/bertlargeuncased.nemo", strict=False).)
print("Loading model...")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model = BERTLMModel.from_pretrained(model_name="bertlargeuncased", strict=False)
bert_model = bert_model.to(device)
print("Model loaded successfully.")

Loading model...
[NeMo I 2025-03-12 17:21:21 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/bertlargeuncased/versions/1.0.0rc1/files/bertlargeuncased.nemo to /root/.cache/torch/NeMo/NeMo_1.22.0/bertlargeuncased/ca4ebba9f05a8ffb79845249ca046983/bertlargeuncased.nemo
[NeMo I 2025-03-12 17:21:46 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2025-03-12 17:21:58 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    data_file: /home/yzhang/data/nlp/bert/47316/hdf5/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training/
    max_predictions_per_seq: 80
    batch_size: 16
    shuffle: true
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false
    


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

[NeMo W 2025-03-12 17:22:27 modelPT:617] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2025-03-12 17:22:27 modelPT:728] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: (0.9, 0.999)
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 4.375e-05
        maximize: False
        weight_decay: 0.01
    )


[NeMo W 2025-03-12 17:22:27 lr_scheduler:890] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


[NeMo I 2025-03-12 17:22:29 save_restore_connector:249] Model BERTLMModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.22.0/bertlargeuncased/ca4ebba9f05a8ffb79845249ca046983/bertlargeuncased.nemo.
Model loaded successfully.


In [7]:
# Function to Generate Embeddings in Batches using NeMo BERT model
def generate_embeddings_in_batches(
    texts, tokenizer, model, batch_size=32, max_length=128, device=None
):
    """
    Converts a list of texts into embeddings using NeMo BERT model in batches.

    Each batch is tokenized with ``padding=True`` and ``truncation=True``
    (truncating at ``max_length``), passed to ``model.bert_model`` with
    gradient tracking disabled, and reduced to the hidden state at sequence
    position 0 (the CLS token representation).

    Parameters
    ----------
    texts : str or iterable of str
        Texts to embed. A single string is treated as one text; any other
        iterable (list, tuple, pandas Series, ...) is copied into a list.
    tokenizer : callable
        Called as ``tokenizer(batch, padding=True, truncation=True,
        return_tensors="pt", max_length=max_length)``; must return a mapping
        of tensors whose keys match the keyword arguments of
        ``model.bert_model``.
    model : object
        Must provide ``eval()``, ``to(device)`` and a callable ``bert_model``
        attribute. The model is switched to evaluation mode and moved to
        ``device`` as a side effect.
    batch_size : int, default 32
        Number of texts per forward pass.
    max_length : int, default 128
        Truncation length forwarded to the tokenizer.
    device : torch.device or str, optional
        Device to run on. Defaults to CUDA when available, otherwise CPU.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input text, rows in input order. An empty
        input yields an array of shape ``(0, 0)``; the model is not touched
        in that case.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # A bare string would otherwise be sliced character by character, and a
    # non-list iterable would reach the tokenizer as a slice of itself.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # np.vstack([]) raises, so return an explicit empty result instead.
        return np.empty((0, 0), dtype=np.float32)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()  # Set model to evaluation mode
    model.to(device)

    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        encoded_input = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        )
        # Inputs must live on the same device as the model.
        encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

        with torch.no_grad():  # Disable gradient calculation for inference
            output = model.bert_model(**encoded_input)

        # Accept a raw hidden-state tensor as well as outputs that wrap it,
        # either under `.last_hidden_state` or as the first item of a tuple/list.
        hidden_states = getattr(output, "last_hidden_state", None)
        if hidden_states is None:
            hidden_states = output[0] if isinstance(output, (tuple, list)) else output

        # Extract the CLS token representation (sequence position 0)
        all_embeddings.append(hidden_states[:, 0, :].cpu().numpy())

    return np.vstack(all_embeddings)

In [9]:
# Embed every pledge text, then persist the resulting matrix as a CSV file
# (written without the DataFrame index).
output_path = "./data/embedded_data.csv"

vectorsLLM = generate_embeddings_in_batches(
    texts=documents,
    tokenizer=tokenizer,
    model=bert_model,
    batch_size=16,
)

df_embeddings = pd.DataFrame(data=vectorsLLM)
df_embeddings.to_csv(path_or_buf=output_path, index=False)

print("✅ Embedding completed and saved to:", output_path)

✅ Embedding completed and saved to: ./data/embedded_data.csv
