Because the models were run as plain Python scripts (not Jupyter notebooks) through a job queue on the HPC, the model results are stored in a CSV file and the training logs are in `.err` and `.out` files.

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import bitsandbytes as bnb
import transformers
from trl import SFTTrainer, SFTConfig
import logging
import csv

In [None]:
# Switch off the tokenizers library's internal parallelism through its
# environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Configure root logging at INFO and create the logger used by the rest of
# this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
def check_cuda():
    """Log CUDA availability, the device count and the active device.

    The per-device queries are only issued when CUDA is available, because
    ``torch.cuda.current_device()`` raises on a machine without a usable GPU
    and this helper is meant to diagnose exactly that situation.
    """
    cuda_available = torch.cuda.is_available()
    logger.info(f"CUDA available: {cuda_available}")
    logger.info(f"CUDA device count: {torch.cuda.device_count()}")
    if not cuda_available:
        logger.warning("No CUDA device detected; skipping current-device queries.")
        return

    # Query the index once and reuse it for the name lookup.
    current_device = torch.cuda.current_device()
    logger.info(f"Current CUDA device: {current_device}")
    logger.info(f"CUDA device name: {torch.cuda.get_device_name(current_device)}")

# Loading Model and Tokenizer

In [None]:
def load_model_and_tokenizer(model_id, cache_dir):
    """Load a 4-bit (NF4) quantized causal LM together with its tokenizer.

    Args:
        model_id: Hub id or local path passed to ``from_pretrained``.
        cache_dir: Directory passed as ``cache_dir`` for both the model and
            the tokenizer download.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer appends EOS to encoded
        sequences (``add_eos_token=True``) and pads on the right.
    """
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        cache_dir=cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True, cache_dir=cache_dir)

    # Only fall back to EOS-as-padding when the tokenizer ships no pad token.
    # Overwriting an existing pad token with EOS makes the language-modeling
    # collator treat every EOS as padding and mask it out of the labels, so
    # the fine-tuned model would never be trained to emit EOS.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"  # Right padding for training
    return model, tokenizer

# Cleaning the data into a form that the model can consume

In [None]:
def load_and_clean_dataset(dataset_name, cache_dir):
    """Load the "en" configuration of a dataset and flatten its annotations.

    Every example is rewritten so that:
      * ``entities`` becomes the list of each entity's ``surfaceform``;
      * ``relations`` becomes a list of ``{"subject", "predicate", "object"}``
        dicts, where subject/object are the surface forms found at the
        indices stored in the original relation.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        cache_dir: Cache directory passed to ``load_dataset``.

    Returns:
        The result of mapping the rewrite over the loaded dataset.
    """
    def remove_extra_columns(example):
        surface_forms = [entity["surfaceform"] for entity in example["entities"]]
        example["entities"] = surface_forms

        triples = []
        for relation in example["relations"]:
            # relation["subject"] / relation["object"] index into the entity list.
            triples.append(
                {
                    "subject": example["entities"][relation["subject"]],
                    "predicate": relation["predicate"],
                    "object": example["entities"][relation["object"]],
                }
            )
        example["relations"] = triples
        return example

    raw_dataset = load_dataset(dataset_name, "en", cache_dir=cache_dir, trust_remote_code=True)
    return raw_dataset.map(remove_extra_columns)

# Adding the input to our models into the dataset

In [None]:
def preprocess_function(data_point):
    """Build the single training string for one example.

    The string contains the instruction, the example's ``text`` as the query,
    the double-quoted entities joined by ", " inside square brackets, and one
    double-quoted relation (its ``str()`` form) per line.

    Args:
        data_point: Mapping with ``text``, ``entities`` and ``relations`` keys.

    Returns:
        The formatted prompt-plus-answer string.
    """
    query = data_point['text']

    quoted_entities = []
    for entity in data_point['entities']:
        quoted_entities.append(f'"{entity}"')
    entity_block = ", ".join(quoted_entities)

    relation_block = "\n".join(f'"{relation}"' for relation in data_point['relations'])

    return f"Given the following text, identify and extract all entities and their relations. Query: {query}\n Entities: [{entity_block}]\nRelations:\n{relation_block}"

# Finding all the layers that LoRA can be applied to

In [None]:
def find_all_linear_names(model):
    """Collect the leaf names of every ``bnb.nn.Linear4bit`` submodule.

    The last component of each matching module's dotted name is kept, and
    ``lm_head`` is excluded from the result.

    Args:
        model: Object exposing ``named_modules()``.

    Returns:
        A list of unique leaf module names (set order, not sorted).
    """
    linear_4bit = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_4bit)
    }
    # The output head is not a LoRA target.
    leaf_names.discard('lm_head')
    return list(leaf_names)

# Running Gemma 2b-it

In [None]:
# Fine-tuning run for google/gemma-2b-it: load the 4-bit quantized base model,
# attach LoRA adapters, train with SFTTrainer, then merge the adapter into a
# float16 copy of the base model, save it locally and push it to the Hub.
check_cuda()

model_id = "google/gemma-2b-it"
cache_dir = "SLM/gemma2b"  # Specify an alternative cache directory
dataset_name = "Babelscape/SREDFM"

model, tokenizer = load_model_and_tokenizer(model_id, cache_dir)
dataset = load_and_clean_dataset(dataset_name, cache_dir)

# Despite the variable name, nothing is tokenized here: this only adds a
# "model_input" text column produced by preprocess_function.
tokenized_datasets = dataset.map(lambda dp: {"model_input": preprocess_function(dp)})
logger.info(tokenized_datasets["test"][0])

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Apply LoRA to every 4-bit linear layer found in the model (lm_head excluded).
modules = find_all_linear_names(model)
logger.info(modules)

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

trainable, total = model.get_nb_trainable_parameters()
logger.info(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

torch.cuda.empty_cache()

# The model is already wrapped by get_peft_model above; the same lora_config
# is passed again as peft_config.
# NOTE(review): eval_dataset is supplied but no evaluation schedule is set in
# the training arguments — confirm whether evaluation is expected to run.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    dataset_text_field='model_input',
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,  # Increase batch size if memory allows
        gradient_accumulation_steps=8,
        warmup_steps=100,  # Add warmup steps
        max_steps=1000,  # Increase max steps for better training
        learning_rate=2e-4,
        logging_steps=10,  # Log less frequently to reduce overhead
        output_dir="outputs",
        optim="paged_adamw_8bit",
        # NOTE(review): with max_steps=1000, an "epoch" save strategy presumably
        # writes no intermediate checkpoint unless a full epoch fits in those
        # steps — confirm this is intended.
        save_strategy="epoch",
        # NOTE(review): fp16 here, while load_model_and_tokenizer sets the 4-bit
        # compute dtype to bfloat16 — confirm the mix is deliberate.
        fp16=True,  # Use mixed precision training
        dataloader_num_workers=4,  # Optimize data loading
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False
trainer.train()

# Save only the trained adapter under the hyphenated name.
new_model = "gemma2b-trained"
trainer.model.save_pretrained(new_model)

# Reload the base model in float16 on device 0 (un-quantized) so the adapter
# can be merged into it. Note the separate cache directory used here.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
    cache_dir="SLM/gemma2b_trained"  # Specify an alternative cache directory
)
merged_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = merged_model.merge_and_unload()

# The merged model and tokenizer are written to "gemma2b_trained" (underscore),
# which is a different directory from the adapter directory "gemma2b-trained".
merged_model.to("cpu")
tokenizer.save_pretrained("gemma2b_trained")
merged_model.save_pretrained("gemma2b_trained", safe_serialization=True)

# Publish under the hyphenated repository name.
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

# Running Gemma 7b-it

In [None]:
# Fine-tuning run for google/gemma-7b-it: load the 4-bit quantized base model,
# attach LoRA adapters, train with SFTTrainer, then merge the adapter into a
# float16 copy of the base model, save it locally and push it to the Hub.
check_cuda()

model_id = "google/gemma-7b-it"
cache_dir = "SLM/gemma7b"  # Specify an alternative cache directory
dataset_name = "Babelscape/SREDFM"

model, tokenizer = load_model_and_tokenizer(model_id, cache_dir)
dataset = load_and_clean_dataset(dataset_name, cache_dir)

# Despite the variable name, nothing is tokenized here: this only adds a
# "model_input" text column produced by preprocess_function.
tokenized_datasets = dataset.map(lambda dp: {"model_input": preprocess_function(dp)})
logger.info(tokenized_datasets["test"][0])

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Apply LoRA to every 4-bit linear layer found in the model (lm_head excluded).
modules = find_all_linear_names(model)
logger.info(modules)

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

trainable, total = model.get_nb_trainable_parameters()
logger.info(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

torch.cuda.empty_cache()

# The model is already wrapped by get_peft_model above; the same lora_config
# is passed again as peft_config.
# NOTE(review): eval_dataset is supplied but no evaluation schedule is set in
# the training arguments — confirm whether evaluation is expected to run.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    dataset_text_field='model_input',
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,  # Increase batch size if memory allows
        gradient_accumulation_steps=8,
        warmup_steps=100,  # Add warmup steps
        max_steps=1000,  # Increase max steps for better training
        learning_rate=2e-4,
        logging_steps=10,  # Log less frequently to reduce overhead
        output_dir="outputs",
        optim="paged_adamw_8bit",
        # NOTE(review): with max_steps=1000, an "epoch" save strategy presumably
        # writes no intermediate checkpoint unless a full epoch fits in those
        # steps — confirm this is intended.
        save_strategy="epoch",
        # NOTE(review): fp16 here, while load_model_and_tokenizer sets the 4-bit
        # compute dtype to bfloat16 — confirm the mix is deliberate.
        fp16=True,  # Use mixed precision training
        dataloader_num_workers=4,  # Optimize data loading
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False
trainer.train()

# Save only the trained adapter under the hyphenated name.
new_model = "gemma7b-trained"
trainer.model.save_pretrained(new_model)

# Reload the base model in float16 on device 0 (un-quantized) so the adapter
# can be merged into it. Note the separate cache directory used here.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
    cache_dir="SLM/gemma7b_trained"  # Specify an alternative cache directory
)
merged_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = merged_model.merge_and_unload()

# The merged model and tokenizer are written to "gemma7b_trained" (underscore),
# which is a different directory from the adapter directory "gemma7b-trained".
merged_model.to("cpu")
tokenizer.save_pretrained("gemma7b_trained")
merged_model.save_pretrained("gemma7b_trained", safe_serialization=True)

# Publish under the hyphenated repository name.
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

# Compiling model results into a CSV

The models have been pushed to the Hugging Face Hub and can be pulled from there for inference.

# Preparing a dataloader for our model

In [None]:
def get_completion_batch(queries, model, tokenizer, device):
    """Generate one completion per query with a single batched ``generate`` call.

    Each query is inserted into a fixed prompt template, the prompts are
    tokenized together (padded to the longest, truncated to 1024 tokens),
    moved to ``device`` and decoded greedily for up to 500 new tokens.

    Args:
        queries: Iterable of input texts.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``pad_token_id``.
        device: Device the encoded inputs are moved to.

    Returns:
        A list with one string per generated sequence: the stripped text that
        follows the first "Entities and Relations:" marker when the marker is
        present, otherwise the whole stripped decoded sequence.
    """
    prompt_template = """
    <start_of_turn>user
    Given the following text, identify and extract all entities and their relations.
    {query}
    <end_of_turn>\n<start_of_turn>model
    """
    marker = "Entities and Relations:"

    prompts = []
    for query in queries:
        prompts.append(prompt_template.format(query=query))

    batch = tokenizer(
        prompts,
        return_tensors="pt",
        padding="longest",
        truncation=True,
        max_length=1024,  # Adjust based on your model's max input length
    )
    model_inputs = {}
    for name, tensor in batch.items():
        model_inputs[name] = tensor.to(device)

    # Greedy decoding: deterministic output for a given prompt.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=500,  # Adjust as per expected output length
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )

    def _relevant_text(decoded):
        # Keep only what follows the marker; otherwise fall back to everything.
        if marker in decoded:
            return decoded.split(marker)[1].strip()
        return decoded.strip()

    return [
        _relevant_text(tokenizer.decode(ids, skip_special_tokens=True))
        for ids in generated_ids
    ]

# Loading the first 1000 rows of the test split

In [None]:
# Load the English configuration of SREDFM and keep only the first 1000 rows
# of its test split for inference.
dataset = load_dataset(
    "Babelscape/SREDFM",
    "en",
    cache_dir="SLM/datasets",
    trust_remote_code=True,
)
test_dataset = dataset['test'].select(range(1000))

In [None]:
class QueryDataset(torch.utils.data.Dataset):
    """Index-aligned view over texts and their reference annotations.

    Item ``i`` is a dict with the keys ``text``, ``true_entities`` and
    ``true_relations`` taken from position ``i`` of the three sequences given
    to the constructor. The length is the number of texts.
    """

    def __init__(self, texts, true_entities, true_relations):
        self.texts, self.true_entities, self.true_relations = (
            texts,
            true_entities,
            true_relations,
        )

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return dict(
            text=self.texts[idx],
            true_entities=self.true_entities[idx],
            true_relations=self.true_relations[idx],
        )

def custom_collate_fn(batch):
    """Turn a list of item dicts into three parallel lists.

    Args:
        batch: Sequence of dicts with ``text``, ``true_entities`` and
            ``true_relations`` keys.

    Returns:
        A ``(texts, true_entities, true_relations)`` tuple of lists, each in
        batch order.
    """
    def column(key):
        return [item[key] for item in batch]

    return column("text"), column("true_entities"), column("true_relations")

# Wrap the selected test rows and serve them in fixed-size batches; the custom
# collate function keeps texts and annotations as plain Python lists.
batch_size = 16  # Adjust based on available GPU memory
query_dataset = QueryDataset(
    texts=test_dataset['text'],
    true_entities=test_dataset['entities'],
    true_relations=test_dataset['relations'],
)
dataloader = DataLoader(
    query_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
    pin_memory=True,
)

# Gemma 2b inference

In [None]:
# Batched inference with the fine-tuned Gemma 2b model; one CSV row is written
# per query.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_path = "Chinmay0701/gemma2b-trained"
# Left padding is required for batched generation with decoder-only models.
# (The keyword is `padding_side`; the earlier `padding_size` spelling was not
# a tokenizer option.)
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="gemma2b-trained", padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    cache_dir="gemma2b-trained",
)
model.to(device)

output_file = "gemma2b_entities_relations.csv"

# Explicit UTF-8 so non-ASCII text is written regardless of the job's locale.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header order matches the order in which the row values are written below.
    writer.writerow(["Query", "Output", "Predicted Entities", "Predicted Relations"])

    # The reference annotations carried by the dataloader are not used here.
    for batch_idx, (queries, _true_entities, _true_relations) in enumerate(dataloader):
        print(f"Processing batch {batch_idx + 1}/{len(dataloader)}...")

        # Get predictions for the batch
        predicted = get_completion_batch(queries, model, tokenizer, device)

        for query, pred_output in zip(queries, predicted):
            # Entities are the text between "Entities:" and the first
            # "Relations:"; relations are the text after that "Relations:".
            # Checking the split lengths (rather than substring membership)
            # avoids an IndexError when "Entities:" only appears after
            # "Relations:".
            parts = pred_output.split("Relations:")
            entity_parts = parts[0].split("Entities:")
            if len(parts) > 1 and len(entity_parts) > 1:
                pred_entities = entity_parts[1].strip()
                pred_relations = parts[1].strip()
            else:
                pred_entities = "NA"
                pred_relations = "NA"

            writer.writerow([query, pred_output, pred_entities, pred_relations])

print(f"Processing complete. Results saved to {output_file}.")

# Gemma 7b Inference

In [None]:
# Batched inference with the fine-tuned Gemma 7b model; one CSV row is written
# per query.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_path = "Chinmay0701/gemma7b-trained"  # Path where the model and tokenizer are saved

# Load the tokenizer. Left padding is required for batched generation with
# decoder-only models. (The keyword is `padding_side`; the earlier
# `padding_size` spelling was not a tokenizer option.)
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="gemma7b-trained", padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the 7b checkpoint explicitly so this cell does not silently reuse a
# `model` left over from an earlier cell.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    cache_dir="gemma7b-trained",
)
model.to(device)

output_file = "gemma7b_entities_relations.csv"

# Explicit UTF-8 so non-ASCII text is written regardless of the job's locale.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header order matches the order in which the row values are written below.
    writer.writerow(["Query", "Output", "Predicted Entities", "Predicted Relations"])

    # The reference annotations carried by the dataloader are not used here.
    for batch_idx, (queries, _true_entities, _true_relations) in enumerate(dataloader):
        print(f"Processing batch {batch_idx + 1}/{len(dataloader)}...")

        # Get predictions for the batch
        predicted = get_completion_batch(queries, model, tokenizer, device)

        for query, pred_output in zip(queries, predicted):
            # Entities are the text between "Entities:" and the first
            # "Relations:"; relations are the text after that "Relations:".
            # Checking the split lengths (rather than substring membership)
            # avoids an IndexError when "Entities:" only appears after
            # "Relations:".
            parts = pred_output.split("Relations:")
            entity_parts = parts[0].split("Entities:")
            if len(parts) > 1 and len(entity_parts) > 1:
                pred_entities = entity_parts[1].strip()
                pred_relations = parts[1].strip()
            else:
                pred_entities = "NA"
                pred_relations = "NA"

            writer.writerow([query, pred_output, pred_entities, pred_relations])

print(f"Processing complete. Results saved to {output_file}.")