In [8]:
import pandas as pd

# Read the dataset
data = pd.read_csv('output_data.csv')

# Select by column name rather than by position: the evaluation code further down
# looks these four columns up by name, so name-based selection keeps working if the
# CSV column order changes and raises a KeyError here if a column is missing.

# Original question/answer pairs
original_dataset = data[["Original Question", "Original Answer"]]

# Synthesized (predicted question / rewritten answer) pairs
synthesized_datasets = data[["Predicted Question", "Rewritten Answer"]]

In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import random
import numpy as np
import torch

In [10]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Work around the padding-token issue by reusing the EOS token as the pad token
tokenizer.pad_token = tokenizer.eos_token

# Load the model without quantization.
# device_map="auto" already places the weights on the available device(s), so no
# explicit .to('cuda') follows: moving a dispatched model by hand is warned against
# and can undo a multi-GPU or offloaded placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically maps the model to available GPUs
    torch_dtype=torch.float16,  # Half-precision (fp16) weights, not mixed precision
)

# Bare expression so the cell still renders the model summary
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [19]:
import torch
import numpy as np
from nltk.util import ngrams
import pandas as pd

def calculate_ngram_accuracy(reference, generated, n=2):
    """
    Fraction of the reference's distinct n-grams that also occur in the generated text.

    Both inputs are token sequences. Each n-gram is the space-joined run of ``n``
    consecutive tokens, and repeated n-grams are counted once. Returns 0.0 when the
    reference yields no n-grams.
    """
    def _distinct_ngrams(tokens):
        # One window per start position that still leaves room for n tokens.
        window_count = len(tokens) - n + 1
        return {' '.join(tokens[start:start + n]) for start in range(window_count)}

    expected = _distinct_ngrams(reference)
    produced = _distinct_ngrams(generated)

    if len(expected) > 0:
        shared = expected.intersection(produced)
        return len(shared) / len(expected)
    return 0.0

def process_text(text):
    """
    Tokenize text into whitespace-separated words.

    Missing values (``None`` or a float NaN, which is what pandas produces for an
    empty CSV cell) yield an empty token list instead of raising AttributeError.
    Any other non-string value (e.g. a numeric cell) is converted with ``str()``
    before splitting.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return str(text).split()

def evaluate_atomic_metric(original_dataset, synthesized_dataset, n=2):
    """
    Compare mean N-gram accuracy of the original texts with that of their rewrites.

    ``original_dataset`` must provide the columns "Original Question" and
    "Original Answer"; ``synthesized_dataset`` must provide "Predicted Question"
    and "Rewritten Answer". Rows are paired by position.

    Returns a dict with "Questions" and "Answers" entries, each holding the
    baseline mean (M_ori), the rewrite mean (M_ref), the decrement (∆) and the
    percentage decrease (δ).

    Raises ValueError when the two datasets differ in row count.
    """
    if len(original_dataset) != len(synthesized_dataset):
        raise ValueError("Original and synthesized datasets must have the same number of rows.")

    def _summarize(mean_original, mean_rewritten):
        # Absolute drop (∆) plus the same drop as a percentage of the baseline (δ).
        drop = mean_original - mean_rewritten
        if mean_original != 0:
            relative_drop = (drop / mean_original) * 100
        else:
            relative_drop = 0
        return {
            "M_ori": mean_original,
            "M_ref": mean_rewritten,
            "Delta (∆)": drop,
            "Delta Relative (δ)": relative_drop,
        }

    # Baseline pass: every original text is scored against itself, so a row
    # contributes 1.0 when it yields at least one n-gram and 0.0 otherwise.
    baseline_question_scores, baseline_answer_scores = [], []
    for _, record in original_dataset.iterrows():
        question_tokens = process_text(record["Original Question"])
        answer_tokens = process_text(record["Original Answer"])
        baseline_question_scores.append(
            calculate_ngram_accuracy(question_tokens, question_tokens, n=n)
        )
        baseline_answer_scores.append(
            calculate_ngram_accuracy(answer_tokens, answer_tokens, n=n)
        )

    mean_baseline_question = np.mean(baseline_question_scores)
    mean_baseline_answer = np.mean(baseline_answer_scores)

    # Rewrite pass: each original text is the reference for its synthesized counterpart.
    rewritten_question_scores, rewritten_answer_scores = [], []
    row_pairs = zip(original_dataset.iterrows(), synthesized_dataset.iterrows())
    for (_, source_record), (_, rewritten_record) in row_pairs:
        source_question = process_text(source_record["Original Question"])
        rewritten_question = process_text(rewritten_record["Predicted Question"])
        source_answer = process_text(source_record["Original Answer"])
        rewritten_answer = process_text(rewritten_record["Rewritten Answer"])

        rewritten_question_scores.append(
            calculate_ngram_accuracy(source_question, rewritten_question, n=n)
        )
        rewritten_answer_scores.append(
            calculate_ngram_accuracy(source_answer, rewritten_answer, n=n)
        )

    mean_rewritten_question = np.mean(rewritten_question_scores)
    mean_rewritten_answer = np.mean(rewritten_answer_scores)

    return {
        "Questions": _summarize(mean_baseline_question, mean_rewritten_question),
        "Answers": _summarize(mean_baseline_answer, mean_rewritten_answer),
    }

# Example usage: score the synthesized rewrites against the originals on bigrams (n=2)
results = evaluate_atomic_metric(original_dataset, synthesized_datasets, n=2)

# Print each section's metrics to four decimals; the second section is
# preceded by a blank line.
for section_name, lead in (("Questions", ""), ("Answers", "\n")):
    print(f"{lead}{section_name}:")
    for metric_name, metric_value in results[section_name].items():
        print(f"  {metric_name}: {metric_value:.4f}")



Questions:
  M_ori: 1.0000
  M_ref: 0.3133
  Delta (∆): 0.6867
  Delta Relative (δ): 68.6661

Answers:
  M_ori: 1.0000
  M_ref: 0.2834
  Delta (∆): 0.7166
  Delta Relative (δ): 71.6597


In [14]:
# NOTE(review): `calculate_atomic_metric_on_benchmarks` is not defined in any cell
# visible in this notebook, and this cell's execution count (14) is lower than that
# of the cell before it (19). Confirm the definition exists and that the notebook
# still works after Restart Kernel -> Run All.
calculate_atomic_metric_on_benchmarks(model, tokenizer, data)

(0.31333891190181756, 0.28340289504629607)