# Introduction
This notebook demonstrates the process of creating finetuning data for a consistency benchmark.

In [1]:
import sys
# Put the parent directory on the import path. The project-local imports used
# further down (generators, metrics, perturbations) presumably live there —
# confirm against the repository layout. The path is relative, so this only
# works when the kernel's working directory is the notebook's own folder.
sys.path.append("../")

from dotenv import load_dotenv
# Load environment variables from ../.env. This is the cell's trailing
# expression, so its boolean return value is displayed as the cell output.
# NOTE(review): the recorded output is `False`, which suggests no variables
# were loaded from ../.env — confirm the file exists, since the OpenAI-backed
# models created later presumably rely on credentials from the environment.
load_dotenv("../.env")

False

## Setup: Import Libraries and Define Environment

In [13]:
import os
import argparse
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from datasets import load_dataset
from transformers import pipeline

from langchain.llms import HuggingFacePipeline
from langchain.chat_models import ChatOpenAI

from generators import CoGGenerator, BaseGenerator
from metrics.scorer import PairwiseScorer
from perturbations import paraphrase

# Pick the Torch device: the first CUDA GPU when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Define Parameters
Define all key parameters that were previously set via the argparse module.

In [15]:
# --- Dataset and variation strategy ---
data_name = "truthful_qa"
variation_type = "paraphrasing"

# --- Models: the main generator and the auxiliary (LLM-judge) model ---
model_name = "gpt-3.5-turbo"
aux_model_name = "gpt-3.5-turbo"

# --- Evaluation ---
# Semicolon-separated "name,threshold" pairs; parsed into tuples further down.
eval_agreements = (
    "llm,0.5;"
    "contradiction,0.5;"
    "ner,0.5"
)
# Metric names handed to the pairwise scorer. The auxiliary LLM is only
# instantiated when "llm" appears in this list.
metrics = [
    "pp",
    "entailment",
    "bertscore",
]

## Data Loading
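Load the `truthful_qa` dataset (`generation` configuration) from the Hugging Face Hub and convert its `validation` split to a pandas DataFrame.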

In [5]:
# Fetch the "generation" configuration of TruthfulQA and keep the full
# DatasetDict around in `data`; the working frame `df` is its validation split.
data = load_dataset(path="truthful_qa", name="generation")
validation_split = data["validation"]
df = validation_split.to_pandas()

README.md:   0%|          | 0.00/9.59k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/817 [00:00<?, ? examples/s]

In [6]:
# Turn the "name,threshold;name,threshold;..." string into a list of
# (name, threshold) tuples, with the threshold converted to float.
agreements = []
for agreement_spec in eval_agreements.split(";"):
    fields = agreement_spec.split(",")
    agreements.append((fields[0], float(fields[1])))

## Model Initialization
Initialize the main model, from which the finetuning samples will be generated.

In [10]:
# Build the main generation model.
# OpenAI chat models go through ChatOpenAI; any other name is treated as a
# Hugging Face model id and loaded as a local transformers pipeline.
if model_name in ["gpt-3.5-turbo", "gpt-4"]:
    model = ChatOpenAI(
        model_name=model_name,
        temperature=0.1,
        max_tokens=100,
    )
else:
    # T5-style checkpoints are sequence-to-sequence; everything else is
    # assumed to be a causal LM.
    task = "text2text-generation" if "t5" in model_name else "text-generation"
    model = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task=task,
        # Use GPU 0 when CUDA is present, otherwise -1 (CPU). A hard-coded
        # device=0 is rejected on hosts without a GPU.
        device=0 if torch.cuda.is_available() else -1,
        model_kwargs={"temperature": 0.1, "max_length": 100},
    )

## Auxiliary LLM

In [17]:
## Define the auxiliary LLM
# The auxiliary model is only needed when the "llm" metric is requested;
# otherwise it stays None and is passed to the scorer as-is.
aux_model = None
if "llm" in metrics:
    if aux_model_name in ["gpt-3.5-turbo", "gpt-4"]:
        aux_model = ChatOpenAI(
            model_name=aux_model_name,
            temperature=0.1,
            max_tokens=100,
        )
    else:
        ## If not using OpenAI models, use an instruction following model like "FlanT5"
        task = (
            "text2text-generation"
            if "t5" in aux_model_name
            else "text-generation"
        )
        aux_model = HuggingFacePipeline.from_model_id(
            model_id=aux_model_name,
            task=task,
            # Use GPU 0 when CUDA is present, otherwise -1 (CPU). A hard-coded
            # device=0 is rejected on hosts without a GPU.
            device=0 if torch.cuda.is_available() else -1,
            # Generation settings go through model_kwargs, as in the main-model
            # cell. Bare keyword arguments are forwarded to the
            # HuggingFacePipeline constructor, which has no such fields.
            model_kwargs={"temperature": 0.1, "max_length": 100},
        )

## Initialize Generators and Scorers

In [18]:
# Both generators wrap the same main `model` and share the same
# `variation_type`, so their outputs can be compared like-for-like.
# Chain of Guidance generation (the "consistent" outputs).
a2c = CoGGenerator(model, variation_type)
# Ordinary generation (the baseline outputs).
base = BaseGenerator(model, variation_type)
# Consistency scorer meta class: receives the metric names to compute and the
# optional auxiliary LLM (None unless the "llm" metric was requested).
# NOTE(review): the recorded cell output shows a DeBERTa MNLI checkpoint and
# the bertscore metric being loaded here, so this cell presumably needs the
# Hugging Face cache or network access — confirm.
scorer = PairwiseScorer(metrics=metrics, aux_model=aux_model)

Some weights of the model checkpoint at microsoft/deberta-base-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bertscore/cf4907b18f8f741f202232c0f8009a3bd49ff98802c245abcb6ea51a37a8c05b (last modified on Mon Jan 27 21:25:30 2025) since it couldn't be found locally at evaluate-metric--bertscore, or remotely on the Hugging Face Hub.


## Generation and Scoring

In [19]:
# Initialize result containers
all_input, all_input_perturb, all_output, all_output_cons = [], [], [], []
all_correct_output, all_scores, all_cons_scores = [], [], []

# Process each input in the dataset
for i in tqdm(range(len(df))):
    input_text = df.question[i]
    correct_output = df.best_answer[i]

    # Generate variations (paraphrasing or sampling)
    if variation_type == "paraphrasing":
        input_perts = [
            paraphrase.llm_prompting(input_text, method=idx) for idx in range(1, 5)
        ]
    else:
        input_perts = []

    # Generate outputs
    outputs = base.generate(input_text, input_perts)
    cons_outputs = a2c.generate(input_text, input_perts)

    # Score outputs
    score = scorer.score(input_text, outputs)
    cons_score = scorer.score(input_text, cons_outputs)

    # Store results
    all_input.extend([input_text] * len(outputs))
    all_input_perturb.extend(input_perts if input_perts else [""] * len(outputs))
    all_output.extend(outputs)
    all_output_cons.extend(cons_outputs)
    all_correct_output.extend([correct_output] * len(outputs))
    all_scores.extend([score] * len(outputs))
    all_cons_scores.extend([cons_score] * len(outputs))

  0%|          | 0/817 [00:00<?, ?it/s]

  input_pp = llm(messages, stop="\n").content
  chain = LLMChain(llm=self.model, prompt=self.question_prompt)
  output = chain.run(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Save Results to CSV

In [None]:
# Create a DataFrame with the results
res_df = pd.DataFrame(
    {
        "input": all_input,
        "input_pert": all_input_perturb,
        "outputs_correct": all_correct_output,
        "output_sampled": all_output,
        "output_consistent": all_output_cons,
        "score": all_scores,
        "score_consistent": all_cons_scores,
    }
)

# Save to CSV
output_file = f"result_{model_name.replace('/', '')}_{variation_type}.csv"
res_df.to_csv(output_file, index=False)