In [1]:
import sys
import torch
import numpy as np
import torch
import json
from pathlib import Path
project_root = Path().resolve().parent  
sys.path.append(str(project_root))

from transformers import EarlyStoppingCallback
from trl import RewardConfig, RewardTrainer
from transformers import TrainingArguments
from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "distilgpt2"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

print("Model and tokenizer are ready.")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer are ready.


In [3]:
def load_and_prepare_dataset(json_file_path):


    rca_data = []
    print(f"\nAttempting to load data from '{json_file_path}'...")
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                if not line.strip():
                    continue
                try:
                  
                    rca_data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"  [Warning] Skipping line {line_num} due to JSON decoding error: {e}")
        print(f"Successfully loaded {len(rca_data)} RCA data entries from '{json_file_path}'.")
    except FileNotFoundError:
        print(f"Error: The file '{json_file_path}' was not found.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")
        return []

   
   
    prepared_data = []
    print("\nTokenizing and preparing dataset...")
    for i, entry in enumerate(rca_data):
        prompt_text = entry['prompt']
        chosen_text = entry['chosen']
        rejected_text = entry['rejected']

        
        chosen_full_text = prompt_text + "\n\n" + chosen_text
        rejected_full_text = prompt_text + "\n\n" + rejected_text

        tokenized_pair = tokenizer(
            [chosen_full_text, rejected_full_text],
            padding='max_length',   
            truncation=True,     
            max_length=512,         
            return_tensors="pt"     
        )

        prepared_data.append({
         
            'input_ids_chosen': tokenized_pair['input_ids'][0],
            'attention_mask_chosen': tokenized_pair['attention_mask'][0],
            'input_ids_rejected': tokenized_pair['input_ids'][1],
            'attention_mask_rejected': tokenized_pair['attention_mask'][1],
        })

        
        if (i + 1) % 100 == 0:
            print(f"  Processed {i + 1}/{len(rca_data)} entries...")

    print("Dataset preparation complete.")
    return prepared_data

In [4]:
sample_file_path = "rca_dataset_1500_entries.jsonl"

prepared_data = load_and_prepare_dataset(sample_file_path)
prepared_dataset = Dataset.from_list(prepared_data)


       
print("\n--- Inspecting the first tokenized data point ---")
print(prepared_dataset[0])

print(prepared_dataset[0]["input_ids_chosen"])

print(tokenizer.decode(prepared_dataset[0]["input_ids_chosen"]))
        


print("Splitting the dataset into train and evaluation sets...")
prepared_dataset = Dataset.from_list(prepared_dataset)
split_dataset = prepared_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


print("\nDataset generated and prepared!")
print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))
print("\nSample of processed data:", train_dataset[0])


Attempting to load data from 'rca_dataset_1500_entries.jsonl'...
Successfully loaded 1500 RCA data entries from 'rca_dataset_1500_entries.jsonl'.

Tokenizing and preparing dataset...
  Processed 100/1500 entries...
  Processed 200/1500 entries...
  Processed 300/1500 entries...
  Processed 400/1500 entries...
  Processed 500/1500 entries...
  Processed 600/1500 entries...
  Processed 700/1500 entries...
  Processed 800/1500 entries...
  Processed 900/1500 entries...
  Processed 1000/1500 entries...
  Processed 1100/1500 entries...
  Processed 1200/1500 entries...
  Processed 1300/1500 entries...
  Processed 1400/1500 entries...
  Processed 1500/1500 entries...
Dataset preparation complete.

--- Inspecting the first tokenized data point ---
{'input_ids_chosen': [5122, 6060, 9253, 1080, 318, 13456, 10059, 15536, 13, 1867, 318, 262, 6808, 2728, 30, 198, 198, 464, 7103, 2728, 373, 6060, 9253, 5287, 11, 475, 262, 6808, 2728, 373, 20577, 9262, 26925, 13, 50256, 50256, 50256, 50256, 50256, 5

In [5]:
# training_args = RewardConfig(
#     output_dir="reward_model_checkpoints", 
#     num_train_epochs=3,                     
#     per_device_train_batch_size=4,         
#     learning_rate=2e-5,                    
#     logging_steps=50,
#     disable_dropout=True,
#     fp16=False,             
#     bf16=False
                         
# )


training_args = RewardConfig(
    output_dir="reward_model_checkpoints", 
    num_train_epochs=3,                     
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,         
    learning_rate=1e-5,                    
    logging_steps=50,
    eval_strategy="steps",  
    eval_steps=50,
    save_strategy="steps", 
    save_steps=50,
    load_best_model_at_end=True,
    fp16=False,
    bf16=False,
    weight_decay=0.01
)


    
def compute_metrics(eval_pred):
    predictions, _ = eval_pred
    rewards_chosen = predictions[:, 0]
    rewards_rejected = predictions[:, 1]
    
    accuracy = (rewards_chosen > rewards_rejected).mean()
    
    return {"accuracy": accuracy}

print("Metrics function defined......")


    
trainer = RewardTrainer(
   model=model,
   args=training_args,
   train_dataset=train_dataset,
   eval_dataset=eval_dataset,
   processing_class=tokenizer,
   compute_metrics=compute_metrics,
   callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


# trainer = RewardTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=processed_dataset,
#     processing_class=tokenizer
# )


trainer.train()

print("Model Training Completed.....")


metrics = trainer.evaluate()
print("Final evaluation metrics:", metrics)


with open("reward_model_eval_metrics.json", "w") as f:
    json.dump(metrics, f, indent=4) 

print("Metrics saved to reward_model_eval_metrics.json")

Metrics function defined......


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Accuracy
50,0.1564,0.00085,1.0
100,0.0003,0.000109,1.0
150,0.0001,6.9e-05,1.0
200,0.0001,5.8e-05,1.0




Model Training Completed.....




Final evaluation metrics: {'eval_loss': 5.818112913402729e-05, 'eval_accuracy': 1.0, 'eval_runtime': 245.8494, 'eval_samples_per_second': 1.22, 'eval_steps_per_second': 0.155, 'epoch': 3.0}
Metrics saved to reward_model_eval_metrics.json


In [None]:

model_path = "reward_model_checkpoints/checkpoint-150"   
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

def get_reward_score(prompt, rca_text):
   
    
    input_text = prompt + "\n\n" + rca_text
    
   
    
    inputs = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

  
    with torch.no_grad():
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )
       
        score = outputs.logits.squeeze().item()
    
    return score



prompt = "Our Data corruption system is experiencing unexpected failures. What is the root cause?"
good_rca = "The failures were caused by a corrupted index file due to unclean shutdown during a power outage."
bad_rca = "The system is failing because it does not like the data."

print("Good RCA Score:", get_reward_score(prompt, good_rca))
print("Bad RCA Score:", get_reward_score(prompt, bad_rca))


# Test case 1: Same Input Lengths
prompt1 = "What could have caused the decrease in production of the product line C and also the defects in the manufacturing quality"
good_rca1 = "Following a systematic root cause analysis of the manufacturing quality defects that affected 12% of Product Line C during the February production run, we identified multiple contributing factors that ultimately trace to fundamental gaps in our quality management system. The investigation revealed that while individual incidents appeared to involve operator error and equipment calibration issues, the underlying root cause was the absence of a comprehensive preventive maintenance and quality assurance framework. Our analysis using fishbone methodology and failure mode analysis showed that the 340 defective units resulted from a cascade of process failures: inadequate supplier qualification procedures allowed substandard raw materials to enter production, insufficient statistical process control monitoring failed to detect drift in key parameters, and missing cross-functional communication protocols prevented early escalation of emerging quality trends. The root cause extends to our organizational quality culture, specifically the lack of integrated quality management procedures that should connect supplier management, production control, and continuous improvement processes. This systemic gap enabled quality issues to compound across multiple production stages without triggering appropriate corrective responses, ultimately requiring a complete redesign of our quality assurance protocols and implementation of real-time monitoring systems."
bad_rca1 = "There are numerous complex factors contributing to the quality problems we've been experiencing in our manufacturing operations, and it's really quite difficult to pin down exactly what's going wrong because there are so many different things happening at the same time. The operators seem to be having trouble with the new machines, and there have been some complaints about the training program not being comprehensive enough to cover all the different scenarios they might encounter on the production floor. Additionally, some people think that the suppliers might not be providing materials that meet our specifications, though we haven't really been able to verify this conclusively. The maintenance team has also mentioned that some of the equipment might need updating or replacement, but budget constraints have made it challenging to invest in new machinery right away. There have also been some suggestions that better communication between shifts could help, since sometimes information about problems doesn't get passed along properly when the teams change over. Management is considering various options including hiring more quality control inspectors, implementing additional training programs, and possibly renegotiating contracts with suppliers, though it's unclear which of these approaches would be most effective or whether we need to try multiple solutions simultaneously."
print("\n\nPassing inputs of same lengths.......\n")
print("Good RCA Score:", get_reward_score(prompt1, good_rca1))
print("Bad RCA Score:", get_reward_score(prompt1, bad_rca1))


#Test Case 2: Different Input Lengths
prompt2 = "I run an e-commerce platform selling clothes, our customers we not able to "
good_rca2 = "Our comprehensive investigation into the e-commerce platform downtime on March 15th revealed a complex series of interconnected factors leading to the 4-hour service interruption. While the immediate trigger was a database connection timeout, our systematic analysis using the 5 Whys methodology uncovered the underlying root cause: inadequate change management procedures within our DevOps pipeline. Specifically, the deployment process lacked automated rollback mechanisms and pre-production load testing protocols. The incident began when a routine code update introduced a memory leak that gradually consumed available database connections over a 6-hour period. However, the deeper systemic issue was the absence of connection pool monitoring and automatic scaling procedures in our infrastructure management framework. This process gap allowed the resource exhaustion to persist undetected until critical thresholds were breached. The root cause extends beyond the technical failure to encompass insufficient monitoring processes, inadequate capacity planning procedures, and missing incident response protocols that should have prevented this escalation."
bad_rca2 = "The website went down because someone deployed bad code. The developer should have tested it better before pushing to production."
print("\n\nPassing inputs of different lengths.......\n")
print("Good RCA Score:", get_reward_score(prompt2, good_rca2))
print("Bad RCA Score:", get_reward_score(prompt2, bad_rca2))


Good RCA Score: 5.226436138153076
Bad RCA Score: -1.5571659803390503


Passing inputs of same lengths.......

Good RCA Score: 5.282775402069092
Bad RCA Score: -2.501983165740967


Passing inputs of different lengths.......

Good RCA Score: 4.961697578430176
Bad RCA Score: -0.68803870677948
