In [16]:
!pip install datasets transformers torch 

Defaulting to user installation because normal site-packages is not writeable


In [17]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import random
import numpy as np
import torch

In [33]:
from datasets import load_dataset

ds_train = load_dataset("mandarjoshi/trivia_qa", "rc", split = 'train')
ds_test = load_dataset("mandarjoshi/trivia_qa", "rc", split = 'test')
ds_train = ds_train[:200]
ds_test = ds_test[:50]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

In [19]:
for i in range(5):
    print(ds_train["question"][i])

Which American-born Sinclair won the Nobel Prize for Literature in 1930?
Where in England was Dame Judi Dench born?
In which decade did Billboard magazine first publish and American hit chart?
From which country did Angola achieve independence in 1975?
Which city does David Soul come from?


In [34]:
def preprocess_trivia_qa(data):
    """
    Prepares the MMLU dataset by randomly masking one incorrect option.
    """
    processed_data = []
    for example in data:  # Access the dataset directly
        question = example
        
        processed_data.append({
            "question": question,
        })
    return processed_data

processed_data_train = preprocess_trivia_qa(ds_train["question"])
processed_data_test = preprocess_trivia_qa(ds_test["question"])

In [22]:
import torch
import gc

# Freeing cache memory
torch.cuda.empty_cache()

# Deleting unused variables
del model
#del tensor

# Calling garbage collection
gc.collect()

# Verifying memory usage
print(f"Allocated Memory: {torch.cuda.memory_allocated() / 1e9} GB")
print(f"Reserved Memory: {torch.cuda.memory_reserved() / 1e9} GB")


Allocated Memory: 1.071694336 GB
Reserved Memory: 12.264144896 GB


In [23]:
model_name = "meta-llama/Llama-2-7b-chat-hf"  

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fix the padding token issue
tokenizer.pad_token = tokenizer.eos_token  # Set eos_token as the pad_token

# Load the model without quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically maps the model to available GPUs
    torch_dtype=torch.float16  # Use mixed precision for better performance
)

model.to('cuda')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [31]:
def predict_rephrase_update(question, tokenizer, model):
    """
    Rephrases a question and its answer separately, with a concise and immediate output.
    Stops the question at the first "?" and the answer at the first "."
    """
    # Prepare the concise input prompt for rephrasing the question
    question_prompt = (
        f"Rephrase the following question keeping the meaning the same\n"
        f"I will give you an example of how to do this. Follow this pattern to rephrase the given question.\n"
        f"Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? .\n"
        f"Rephrased Question: The sanctions against the school were a harsh setback, and they seemed to undermine the efforts the school had made to change. \n"
        "###"
        f"Question: {question}\n"
        f"Rewritten Question:"
    )

    # Tokenize the input for question
    question_inputs = tokenizer(
        question_prompt,
        return_tensors="pt",
        #padding=True,
        #truncation=True,
        #return_attention_mask=True
    ).to("cuda")  # Move input to GPU

    # Generate prediction for question with tighter configurations for immediate output
    with torch.no_grad():
        question_output_ids = model.generate(
            question_inputs.input_ids,
            #attention_mask=question_inputs.attention_mask,
            #max_length=question_inputs.input_ids.shape[1] + 100,  # Limit generation length
            temperature=0.8,  # Reduce randomness for precision
            top_k=10,
                # Lower top_p for more focused sampling
            #num_beams=1,  # Use greedy search for quicker results
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode the generated output for question
    rewritten_question = tokenizer.decode(question_output_ids[0], skip_special_tokens=True).split("Rewritten Question:")[-1].strip()

    # Stop the question at the first "?"
    #rewritten_question = f"{rewritten_question}"
    if "?" in rewritten_question:
        rewritten_question = rewritten_question.split("?")[0] + "?"

    return rewritten_question

In [36]:
import csv

with open('output_data_question_test.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    # Write the header row
    writer.writerow(["Original Question", "Rewritten Question"])

    # Iterate through processed_data and write the results
    for example in processed_data_test:
        question = example
        #answer = example["question2"]
        
        # Get the rewritten question and answer (assuming this function exists)
        rewritten_question = predict_rephrase_update(question, tokenizer, model)
        #output = predict_rephrase_update(question,answer,tokenizer,model)
        # Print the outputs (optional for logging purposes)
        print(f"Original Question: {question}")
        #print(f"Original Answer: {answer} \n")
        print(f"Rewritten Question: {rewritten_question}")
        #print(f"Rewritten Answer: {rewritten_answer}")
        print(f"\n")
        print(f"\n")
        
        # Write the results to the CSV file
        writer.writerow([question, rewritten_question])

Original Question: {'question': 'Asmara international airport is in which country?'}
Rewritten Question: The airport in question is located in which nation?




Original Question: {'question': 'At whose concert were 11 people trampled to death in Ohio in 1979?'}
Rewritten Question: Whose concert in Ohio in 1979 resulted in the deaths of 11 people due to a tragic trampling incident?




Original Question: {'question': "Andy Warhol/'s 60s exhibition featured cans of which product?"}
Rewritten Question: Andy Warhol's 60s exhibition showcased cans of which product?




Original Question: {'question': 'San Giusto international airport is in which county?'}
Rewritten Question: {'question': 'Which county is San Giusto international airport located in?




Original Question: {'question': "Who had a 60s No 1 with Travelin' Man?"}
Rewritten Question: Who had a 1960s number one hit with "Travelin' Man"?




Original Question: {'question': 'In which country did the notorious security force the Ton

In [37]:
with open('output_data_question_train.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    
    # Write the header row
    writer.writerow(["Original Question", "Rewritten Question"])

    # Iterate through processed_data and write the results
    for example in processed_data_train:
        question = example
        #answer = example["question2"]
        
        # Get the rewritten question and answer (assuming this function exists)
        rewritten_question = predict_rephrase_update(question, tokenizer, model)
        #output = predict_rephrase_update(question,answer,tokenizer,model)
        # Print the outputs (optional for logging purposes)
        print(f"Original Question: {question}")
        #print(f"Original Answer: {answer} \n")
        print(f"Rewritten Question: {rewritten_question}")
        #print(f"Rewritten Answer: {rewritten_answer}")
        print(f"\n")
        print(f"\n")
        
        # Write the results to the CSV file
        writer.writerow([question, rewritten_question])

Original Question: {'question': 'Which American-born Sinclair won the Nobel Prize for Literature in 1930?'}
Rewritten Question: Who among American-born Sinclairs won the Nobel Prize for Literature in 1930?




Original Question: {'question': 'Where in England was Dame Judi Dench born?'}
Rewritten Question: Where did Dame Judi Dench, the renowned English actress, born?




Original Question: {'question': 'In which decade did Billboard magazine first publish and American hit chart?'}
Rewritten Question: {'question': 'When did Billboard magazine first release its American hit chart?




Original Question: {'question': 'From which country did Angola achieve independence in 1975?'}
Rewritten Question: {'question': 'Which country did Angola gain independence from in 1975?




Original Question: {'question': 'Which city does David Soul come from?'}
Rewritten Question: Which city is David Soul from?




Original Question: {'question': 'Who won Super Bowl XX?'}
Rewritten Question: {'question': 

In [66]:
import pandas as pd
import re




In [67]:
import torch
import numpy as np
from nltk.util import ngrams
import pandas as pd

def calculate_ngram_accuracy(reference, generated, n=2):
    """
    Calculate N-gram accuracy between a reference and generated text.
    """
    ref_ngrams = set([' '.join(reference[i:i + n]) for i in range(len(reference) - n + 1)])
    gen_ngrams = set([' '.join(generated[i:i + n]) for i in range(len(generated) - n + 1)])
    matches = len(ref_ngrams & gen_ngrams)
    return matches / len(ref_ngrams) if len(ref_ngrams) > 0 else 0.0

def process_text(text):
    """
    Tokenize text into words.
    """
    return text.split()


In [70]:
import pandas as pd
import numpy as np

# Placeholder function for text processing (you can customize this as needed)
def process_text(text):
    return text.lower() if isinstance(text, str) else ""

# Placeholder function for n-gram accuracy calculation
def calculate_ngram_accuracy(text1, text2, n=2):
    """
    Calculates the N-gram accuracy between two texts.
    """
    def ngrams(text, n):
        tokens = text.split()
        return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    
    ngrams1 = set(ngrams(text1, n))
    ngrams2 = set(ngrams(text2, n))
    
    if not ngrams1:  # Handle empty text case
        return 0.0
    return len(ngrams1.intersection(ngrams2)) / len(ngrams1)

def evaluate_atomic_metric(original_dataset, rewritten_dataset, n=2):
    """
    Evaluate the atomic metric (N-gram accuracy) and calculate decrement (∆) and percentage decrease (δ).
    """
    if len(original_dataset) != len(rewritten_dataset):
        raise ValueError("Original and rewritten datasets must have the same number of rows.")
    
    # N-gram accuracies for the rewritten dataset
    accuracies = []
    for (_, orig_row), (_, rewritten_row) in zip(original_dataset.iterrows(), rewritten_dataset.iterrows()):
        original_question = process_text(orig_row["Original Question"])
        rewritten_question = process_text(rewritten_row["Rewritten Question"])
        
        accuracy = calculate_ngram_accuracy(original_question, rewritten_question, n=n)
        accuracies.append(accuracy)
    
    M_ref = np.mean(accuracies)

    # N-gram accuracies for the original dataset against itself (optional: full similarity)
    self_accuracies = []
    for _, row in original_dataset.iterrows():
        original_question = process_text(row["Original Question"])
        accuracy = calculate_ngram_accuracy(original_question, original_question, n=n)
        self_accuracies.append(accuracy)
    
    M_ori = np.mean(self_accuracies)

    # Calculate decrement (∆) and percentage decrease (δ)
    delta = M_ori - M_ref
    delta_relative = (delta / M_ori) * 100 if M_ori != 0 else 0

    return {
        "M_ori": M_ori,
        "M_ref": M_ref,
        "Delta (∆)": delta,
        "Delta Relative (δ)": delta_relative
    }

# Example Usage
# Load the CSV files
training_dataset = pd.read_csv('output_data_question_train.csv')  # Replace with the actual file path
testing_dataset = pd.read_csv('output_data_question_test.csv')    # Replace with the actual file path

# Evaluate metrics for training data
print("Training Dataset:")
training_results = evaluate_atomic_metric(
    training_dataset[["Original Question"]], 
    training_dataset[["Rewritten Question"]],
    n=2
)
for key, value in training_results.items():
    print(f"  {key}: {value:.4f}")

# Evaluate metrics for testing data
print("\nTesting Dataset:")
testing_results = evaluate_atomic_metric(
    testing_dataset[["Original Question"]], 
    testing_dataset[["Rewritten Question"]],
    n=2
)
for key, value in testing_results.items():
    print(f"  {key}: {value:.4f}")



Training Dataset:
  M_ori: 1.0000
  M_ref: 0.3052
  Delta (∆): 0.6948
  Delta Relative (δ): 69.4818

Testing Dataset:
  M_ori: 1.0000
  M_ref: 0.3031
  Delta (∆): 0.6969
  Delta Relative (δ): 69.6932
