In [13]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch import Tensor
from bitnet import replace_linears_in_hf
import logging

# Only let WARNING-and-above records through for this third-party logger.
logging.getLogger('zeta.nn.modules.flow_transformer').setLevel(logging.WARNING)


# Load a model from Hugging Face's Transformers
# The tokenizer and the sequence-classification model are both built from the same checkpoint name.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Replace Linear layers with BitLinear
# The return value is discarded, so the helper presumably mutates `model` in place — confirm against bitnet.
replace_linears_in_hf(model)

# Example text to classify
# A single string is tokenized into PyTorch tensors ("pt"), truncated to at most 512 tokens.
text = "Replace this with your text"
inputs = tokenizer(
    text, return_tensors="pt", padding=True, truncation=True, max_length=512
)


In [14]:
# Perform inference
# NOTE(review): the output saved with this cell is an AttributeError about
# 'SequenceClassifierOutput' lacking `.to`; nothing in this cell calls `.to`,
# so the saved output may be stale or raised from library code — re-run to confirm.
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    # Gradients are not needed for a forward-only pass.
    outputs = model(**inputs)
    print(outputs.logits.dtype)
    # Softmax over the last axis turns the logits into per-class probabilities.
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    print(predictions)

# Process predictions
# argmax() without `dim` indexes the flattened tensor; that equals the class id
# only while the batch holds a single example (assumed here, since `inputs`
# is built from one string — TODO confirm if batching is ever added).
predicted_class_id = predictions.argmax().item()
print(f"Predicted class ID: {predicted_class_id}")

# Optionally, map the predicted class ID to a label, if you know the classification labels
# labels = ["Label 1", "Label 2", ...]  # Define your labels corresponding to the model's classes
# print(f"Predicted label: {labels[predicted_class_id]}")

AttributeError: 'SequenceClassifierOutput' object has no attribute 'to'

In [None]:
# from zeta.nn.modules.simple_rmsnorm import SimpleRMSNorm
from transformers import DataCollatorForLanguageModeling,Trainer,TrainingArguments
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
from transformers.models.bert.modeling_bert import *
import torch.nn.functional as F
from torch import nn, Tensor
from tqdm import tqdm
from bitnet import BitLinear
import re

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Hub checkpoint used for the tokenizer, the base config, and the pretrained weights.
MODEL_CONFIG        = "MikaSie/LegalBERT_BART_fixed_V1"
# Window size in tokens: used for chunking in `tokenize` and as the
# tokenizer/generation max_length in the summarisation loop.
# (Previously assigned twice in this cell with the same value.)
CONTEXT_LENGTH      = 256
tokenizer           = AutoTokenizer.from_pretrained(MODEL_CONFIG)
HEADS               = 6
DIMENSIONS          = 768
LAYERS              = 6
INTERMEDIATE_SIZE   = 1024

# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the checkpoint's own config and override vocabulary / special-token ids
# so they agree with the tokenizer loaded above.
config = AutoConfig.from_pretrained(
    MODEL_CONFIG,
    vocab_size=len(tokenizer),
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

config.hidden_size = DIMENSIONS
# NOTE(review): the position limit is set to DIMENSIONS (the hidden width), not
# CONTEXT_LENGTH — looks like a copy-paste slip, but left as-is; confirm intent.
config.max_position_embeddings = DIMENSIONS
config.num_attention_heads = HEADS
config.num_hidden_layers = LAYERS
config.num_key_value_heads = HEADS
config.intermediate_size = INTERMEDIATE_SIZE
# NOTE(review): in the cells visible here `config` is never passed to a model
# constructor (the model is loaded with from_pretrained(MODEL_CONFIG) only).

In [6]:
def convert_to_bitnet(model, copy_weights):
    """Replace every ``nn.Linear`` submodule of ``model`` with a ``BitLinear``, in place.

    Args:
        model: Root ``nn.Module`` whose module tree is rewritten.
        copy_weights: When true, the new layer reuses the original layer's
            ``weight`` (and ``bias``, if present) Parameter objects; when false
            the new layer keeps whatever parameters ``BitLinear`` initialised.

    Returns:
        None. ``model`` is modified in place.

    Notes:
        * Each new layer is created on the device and dtype of the layer it
          replaces, so the model does not end up split across devices.
        * Layers that already are ``BitLinear`` are skipped, so running the
          cell twice does not rebuild (and re-initialise) converted layers.
        * NOTE(review): with ``copy_weights=False`` every replaced layer gets
          fresh parameters, so any weight tying that involved the old layer is
          lost — presumably why the recorded parameter count grows after
          conversion; confirm.
    """
    # Collect first, swap afterwards: the module tree is not mutated while it
    # is still being traversed.
    pending = []
    for parent in model.modules():
        for child_name, child in parent.named_children():
            # If BitLinear derives from nn.Linear, the second check is what
            # keeps already-converted layers from being replaced again.
            if isinstance(child, nn.Linear) and not isinstance(child, BitLinear):
                pending.append((parent, child_name, child))

    for parent, child_name, child in pending:
        has_bias = child.bias is not None
        bitlinear = BitLinear(
            child.in_features,
            child.out_features,
            has_bias,
        ).to(device=child.weight.device, dtype=child.weight.dtype)
        if copy_weights:
            # Assigning the Parameter objects shares storage with the original layer.
            bitlinear.weight = child.weight
            if has_bias:
                bitlinear.bias = child.bias
        setattr(parent, child_name, bitlinear)

In [7]:
# Load the pretrained seq2seq checkpoint named by MODEL_CONFIG, then swap its
# linear layers for BitLinear in place. (The `config` object built earlier is
# not passed in here.)

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CONFIG)

# Parameter count before conversion, reported in millions (10^6).
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size / 1000 ** 2:.1f}M parameters")

# copy_weights=False: the replacement layers do not take over the pretrained linear weights.
convert_to_bitnet(model, copy_weights=False)
# Parameter count after conversion, same units as above.
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size / 1000 ** 2:.1f}M parameters")

# Padding reuses the end-of-sequence token from here on.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects the collator's non-masked mode.
# NOTE(review): this is a language-modeling collator paired with a seq2seq model — confirm that is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Model size: 406.3M parameters
Model size: 457.8M parameters


In [8]:
# Hyperparameters consumed by the TrainingArguments cell.
# Passed as per_device_train_batch_size.
BATCH_SIZE = 8
# Passed as learning_rate.
LEARNING_RATE = 1.5e-3
# Passed as num_train_epochs.
EPOCHS = 2

In [9]:
def dataLoader(texts, gt_summaries):
    """Pair reference texts with their summaries as a list of records.

    Args:
        texts: Iterable of reference documents.
        gt_summaries: Iterable of summaries, positionally aligned with ``texts``.

    Returns:
        A list of ``{"reference": ..., "summary": ...}`` dicts, one per pair.
        Pairing stops at the shorter of the two inputs.
    """
    return [
        {"reference": document, "summary": target}
        for document, target in zip(texts, gt_summaries)
    ]

def extract_references_and_summaries(file_path):
    """Pull every ``"reference"`` and ``"summary"`` string value out of a text file.

    The file is scanned with regular expressions rather than parsed, so it
    works on any layout in which the pairs appear as ``"key": "value"``.

    Args:
        file_path: Path of the UTF-8 encoded file to scan.

    Returns:
        ``(texts, gt_summaries)``: two lists holding the raw (still escaped)
        contents of the ``"reference"`` and ``"summary"`` values, each in file
        order. Empty values (``""``) are not matched.

    Notes:
        The value pattern treats a backslash plus the following character as a
        unit, so an escaped quote (``\\"``) inside a value no longer ends the
        match early.
        NOTE(review): the two lists are collected independently; if a record
        lacks one of the keys (or has an empty value) they can differ in length
        or drift out of alignment — verify against the data files.
    """
    # One or more of: any character except quote/backslash, or a backslash escape.
    string_body = r'((?:[^"\\]|\\.)+)'
    # DOTALL lets the escape branch also consume a newline, keeping the old
    # pattern's ability to span line breaks.
    reference_pattern = re.compile(r'"reference"\s*:\s*"' + string_body + '"', re.DOTALL)
    summary_pattern = re.compile(r'"summary"\s*:\s*"' + string_body + '"', re.DOTALL)

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    texts = reference_pattern.findall(content)
    gt_summaries = summary_pattern.findall(content)
    return texts, gt_summaries

def tokenize(element):
    """Tokenize a batch of documents and pack them into fixed-size windows.

    All documents in ``element["text"]`` are tokenized, joined into one token
    stream with ``tokenizer.eos_token_id`` appended after each document, and
    the stream is cut into consecutive windows of ``CONTEXT_LENGTH`` tokens.

    Args:
        element: Mapping with a ``"text"`` entry holding the batch of strings.

    Returns:
        ``{"input_ids": [...]}`` with one list of exactly ``CONTEXT_LENGTH``
        token ids per window. A trailing remainder shorter than
        ``CONTEXT_LENGTH`` is dropped.
    """
    # NOTE(review): with truncation=False the max_length / overflow options
    # presumably have no effect; the call is kept unchanged — confirm.
    outputs = tokenizer(
        element["text"],
        truncation=False,
        max_length=CONTEXT_LENGTH,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Combine all tokens into one stream, separating documents with EOS.
    combined = []
    for tokenized_doc in outputs["input_ids"]:
        combined.extend(tokenized_doc)
        combined.append(tokenizer.eos_token_id)
    # Chunk. The last valid window starts at len(combined) - CONTEXT_LENGTH, so
    # the range end must include that index; the previous bound excluded it
    # and lost the final window whenever the stream length was an exact multiple.
    last_start = len(combined) - CONTEXT_LENGTH
    input_batch = [
        combined[start: start + CONTEXT_LENGTH]
        for start in range(0, last_start + 1, CONTEXT_LENGTH)
    ]
    return {"input_ids": input_batch}

In [10]:
from datasets import Dataset, DatasetDict 

# Read the reference/summary string lists for both splits from the resource files.
texts, gt_summaries = extract_references_and_summaries('resources/train.json')
test_texts, test_summaries = extract_references_and_summaries('resources/test.json')
# Dataset.from_dict builds one column per key; the two lists of a split are its "text" and "summary" columns.
data_train = Dataset.from_dict({"text": texts, "summary": gt_summaries})
data_test = Dataset.from_dict({"text": test_texts, "summary": test_summaries})

data = DatasetDict({"train": data_train,"test": data_test})

# `tokenize` reads only the "text" column and returns packed "input_ids" windows.
# remove_columns drops both original columns, so the "summary" text does not
# reach the tokenized dataset.
tokenized_data = data.map(
    tokenize,
    batched=True,
    remove_columns=data["train"].column_names,
)

Map: 100%|██████████| 1129/1129 [00:24<00:00, 46.86 examples/s]
Map: 100%|██████████| 188/188 [00:06<00:00, 31.23 examples/s]


In [16]:
# Every row produced by `tokenize` is one window of CONTEXT_LENGTH tokens,
# so rows * CONTEXT_LENGTH is the token count of a split.
total_tokens = tokenized_data["train"].num_rows * CONTEXT_LENGTH
print(f"Training on {total_tokens:_} tokens")

# Same computation for the held-out split; the label used to say "Training" here too.
total_tokens = tokenized_data["test"].num_rows * CONTEXT_LENGTH
print(f"Testing on {total_tokens:_} tokens")

Training on 15_153_664 tokens
Training on 3_896_832 tokens


In [None]:
summaries    = []

# The tokenized inputs are moved to `device` below, so the model has to live
# on the same device; nothing earlier in the notebook moves it. Idempotent.
model.to(device)
model.eval()          # Evaluation mode, set once instead of on every iteration

# One beam-search summary per training document, in the same order as `texts`.
for text in tqdm(texts, total=len(texts)):

    # Inputs are truncated to CONTEXT_LENGTH tokens before generation.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=CONTEXT_LENGTH).to(device)

    with torch.no_grad(): # Perform inference
        outputs = model.generate(**inputs, max_length=CONTEXT_LENGTH, num_beams=4, early_stopping=True)

    # outputs[0] is the single returned sequence for this one-document batch.
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summaries.append(summary)

In [None]:
from bert_score import score

# Candidates (generated summaries) first, references (ground truth) second.
# The three results are reduced with torch.mean / torch.std below.
P, R, F1 = score(summaries,gt_summaries, model_type="bert-base-uncased", lang="en")

print("\nBERTScore Model Benchmark Results:")
print(f"Precision: Mean = {torch.mean(P):.4f}, Std = {torch.std(P):.4f}")
print(f"Recall   : Mean = {torch.mean(R):.4f}, Std = {torch.std(R):.4f}")
# This line used to report the recall tensor a second time instead of F1.
print(f"F1-score : Mean = {torch.mean(F1):.4f}, Std = {torch.std(F1):.4f}")

In [None]:
# Checkpoints and trainer state are written under this directory.
output_path = "./out"
args = TrainingArguments(
    output_dir=output_path,
    per_device_train_batch_size=BATCH_SIZE,
    logging_steps=100,
    # Two micro-batches are accumulated per optimizer step.
    gradient_accumulation_steps=2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    warmup_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=LEARNING_RATE,
    # A float below 1 is read as a fraction of the total training steps.
    save_steps=0.25,
    # Mixed precision needs a CUDA device; the notebook falls back to CPU when
    # none is present, so only request fp16 when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    report_to="wandb",
)

# Only a training split is supplied; the tokenized "test" split is not used here.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
)

trainer.train()