In [1]:
import os
import time
import re
import requests
from groq import Groq
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import evaluate
import numpy as np
import pandas as pd
import spacy
import gc




In [2]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bert_score = evaluate.load("bertscore")
bleurt = evaluate.load("bleurt")
nlp = spacy.load("en_core_web_sm")
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


Computing checksums: 100%|##########| 1/1 [00:14<00:00, 14.67s/it]


INFO:tensorflow:Reading checkpoint C:\MSC.Software\Adams\2018\huggingface\metrics\bleurt\default\downloads\extracted\1456ab3b3d57d5d29151304b510d08404b6fcc6aceb338619880c4b379f7228f\bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.

INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [38]:
def get_config():
    return {
        "chunk_size": 800,
        "chunk_overlap": 50,
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "llm_model": "llama3-8b-8192",
        "retriever_k": 20,
        "context_docs": 3,
        "api_key": ""
    }

In [39]:
class DocumentProcessor:
    def __init__(self, config):
        self.config = config
        self.text_splitter = CharacterTextSplitter(
            chunk_size=config["chunk_size"],
            chunk_overlap=config["chunk_overlap"]
        )
        
    def process_documents(self, pdf_folder):
        all_docs = []
        processing_stats = {"total_files": 0, "successful": 0, "failed": 0, "total_chunks": 0}
        
        for file in os.listdir(pdf_folder):
            if file.endswith(".pdf"):
                processing_stats["total_files"] += 1
                pdf_path = os.path.join(pdf_folder, file)
                try:
                    loader = PyMuPDFLoader(pdf_path)
                    documents = loader.load()
                    docs = self.text_splitter.split_documents(documents)
                    all_docs.extend(docs)
                    processing_stats["successful"] += 1
                    processing_stats["total_chunks"] += len(docs)
                except Exception as e:
                    processing_stats["failed"] += 1
                    print(f"Error processing {file}: {str(e)}")
        
        return all_docs, processing_stats

In [40]:
class PromptingStrategies:
    @staticmethod
    def zero_shot(question, context):
        return [{"role": "user", "content": f"Context: {context}\nQuestion: {question}"}]
    

    
    @staticmethod
    def few_shot(question, context):
        examples = """Q: How do transformers work?
                    A: Self-attention mechanisms.

                    Q: What is BERT's objective?
                    A: Masked language modeling."""
        return [{"role": "user", "content": f"{examples}\n\nContext: {context}\nQuestion: {question}"}]
    
    @staticmethod
    def chain_of_thought(question, context):
        return [{"role": "user", "content": f"Context: {context}\nQuestion: {question}\nLet's solve step by step:"}]

In [41]:
class EnhancedMetrics:
    def __init__(self):
        self.rouge = evaluate.load("rouge")
        self.bleu = evaluate.load("bleu")
        self.sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.nlp = spacy.load('en_core_web_sm')
        
    
    def compute_semantic_similarity(self, pred, ref):
        pred_emb = self.sentence_model.encode([pred])[0]
        ref_emb = self.sentence_model.encode([ref])[0]
        return np.dot(pred_emb, ref_emb) / (np.linalg.norm(pred_emb) * np.linalg.norm(ref_emb))

In [48]:
def compute_semantic_similarity(prediction, reference):
    pred_embedding = sentence_model.encode([prediction])[0]
    ref_embedding = sentence_model.encode([reference])[0]
    return np.dot(pred_embedding, ref_embedding) / (np.linalg.norm(pred_embedding) * np.linalg.norm(ref_embedding))

def check_factual_consistency(prediction, context):
    pred_ents = set(ent.text.lower() for ent in nlp(prediction).ents)
    context_ents = set(ent.text.lower() for ent in nlp(context).ents)
    return len(pred_ents.intersection(context_ents)) / len(pred_ents) if pred_ents else 1.0

def compute_context_utilization(prediction, context):
    pred_words = set(prediction.lower().split())
    context_words = set(context.lower().split())
    return len(pred_words.intersection(context_words)) / len(pred_words)

def compute_recall_at_k(retrieved_docs, ground_truth, k=20):

    
    ground_truth = ground_truth.lower()
    
   
    top_k_docs = retrieved_docs[:k]
    
    
    for doc in top_k_docs:
        if ground_truth in doc.page_content.lower():
            return 1.0 
    
    return 0.0



def exact_match(prediction, reference):
    return 1 if prediction.strip().lower() == reference.strip().lower() else 0


In [43]:
class EvaluationPipeline:
    def __init__(self, config):
        self.config = config
        self.api_key = config["api_key"]
        self.client = Groq(api_key=self.api_key)
        self.metrics = EnhancedMetrics()
        self.doc_processor = DocumentProcessor(config)
    
    def compute_all_metrics(self, prediction, reference, context, retrieved_docs):
        if not prediction or not reference:
            return {
                "rouge": 0,
                "bleu": 0,
                "bert_score": 0,
                "bleurt": 0,
                "semantic_similarity": 0,
                "factual_consistency": 0,
                "context_utilization": 0,
                "recall_at_k": 0,
                "exact_match": 0
            }
        
        return {
            "rouge": rouge.compute(predictions=[prediction], references=[reference])['rougeL'],
            "bleu": bleu.compute(predictions=[prediction], references=[[reference]])['bleu'],
            "bert_score": bert_score.compute(predictions=[prediction], references=[reference], lang="en")['f1'],
            "bleurt": bleurt.compute(predictions=[prediction], references=[reference])['scores'][0],
            "semantic_similarity": compute_semantic_similarity(prediction, reference),
            "factual_consistency": check_factual_consistency(prediction, context),
            "context_utilization": compute_context_utilization(prediction, context),
            "recall_at_k": compute_recall_at_k(retrieved_docs, reference),
            "exact_match": exact_match(prediction, reference)
        }


    def evaluate_single_prompt(self, vectorstore, strategy_name, strategy_func, question, reference):
        start_time = time.time()

       
        retrieved_docs = vectorstore.similarity_search(question, k=self.config["retriever_k"])
        if not retrieved_docs:
            context = "No relevant documents found. Answer based on prior knowledge."
        else:
            context = "\n".join([doc.page_content for doc in retrieved_docs[:self.config["context_docs"]]])

        
        messages = strategy_func(question, context)

       
        for attempt in range(3):
            try:
                response = self.client.chat.completions.create(
                    model=self.config["llm_model"],
                    messages=messages
                )
                output = response.choices[0].message.content
                break
            except Exception as e:
                print(f"[{strategy_name}] Error: {e}. Retrying... ({attempt + 1}/3)")
                time.sleep(2 ** attempt)
                output = "Error generating response."

        end_time = time.time()

       
        metrics = self.compute_all_metrics(output, reference, context, retrieved_docs)
        metrics["latency"] = end_time - start_time
        metrics["strategy"] = strategy_name
        metrics["question"] = question
        metrics["generated_answer"] = output
        metrics["reference"] = reference

        return metrics
    
    def extract_wait_time(self, error_message, attempt):
        match = re.search(r'Please try again in ([\d\.]+)m([\d\.]+)s', error_message)
        if match:
            minutes = float(match.group(1))
            seconds = float(match.group(2))
            return int(minutes * 60 + seconds)
        return min(30 * (2 ** attempt), 300)
    
    
    def run_evaluation(self, questions, pdf_folder):
            # Process documents
            all_docs, processing_stats = self.doc_processor.process_documents(pdf_folder)
            
           
            embedding_model = HuggingFaceEmbeddings(model_name=self.config["embedding_model"])
            vectorstore = FAISS.from_texts([doc.page_content for doc in all_docs], embedding_model)

            
            results = []

          
            for question, reference in questions:
                for strategy_name, strategy_func in [
                    ("zero-shot", PromptingStrategies.zero_shot),
                    ("few-shot", PromptingStrategies.few_shot),
                    ("chain-of-thought", PromptingStrategies.chain_of_thought)
                ]:
                    result = self.evaluate_single_prompt(
                        vectorstore, strategy_name, strategy_func, question, reference
                    )
                    results.append(result)

            
            return pd.DataFrame(results), processing_stats


In [50]:
questions = [
    
    ("What is OpenScholar and how is it used?", 
     "OpenScholar is a research platform designed to enable large language models to generate and synthesize scientific literature. It incorporates retrieval-augmented generation to improve accuracy in scholarly tasks."),
    
   
    ("What is the ELI5 dataset and what is it used for?", 
     "The ELI5 (Explain Like I'm 5) dataset contains questions and answers from Reddit that aim to simplify complex topics into layman's terms. It is widely used to evaluate language models' ability to generate explanations."),
    
   
    ("What is LLAMA and how does it differ from other language models?", 
     "LLAMA (Large Language Model Meta AI) is a family of large language models developed by Meta. It focuses on efficiency, achieving high performance with fewer parameters compared to models like GPT-3."),
    
   
    ("How is citation generation performed for medical datasets?", 
     "Citation generation for medical data typically involves fine-tuning large language models on biomedical corpora. Retrieval-augmented generation is often used to enhance accuracy by referencing scientific articles during the generation process."),

     ("What kind of Questions does WebGPT answer?","Long-Form Questions")
]


In [51]:
pipeline = EvaluationPipeline(get_config())
batch_size = 3
pdf_folder = r"C:\Users\Usman\Desktop\For Thesis\Citation_Generation\Papers"


all_results = []


for i in range(0, len(questions), batch_size):
    batch = questions[i:i + batch_size]
    results_df, stats = pipeline.run_evaluation(batch, pdf_folder)
    
    
    all_results.append(results_df)
    
   
    gc.collect()
    print(f"Batch {i // batch_size + 1} processed.")


final_results_df = pd.concat(all_results, ignore_index=True)


print("\nAll Results Summary:")
display(final_results_df)

Batch 1 processed.
Batch 2 processed.

All Results Summary:


Unnamed: 0,rouge,bleu,bert_score,bleurt,semantic_similarity,factual_consistency,context_utilization,recall_at_k,exact_match,latency,strategy,question,generated_answer,reference
0,0.117647,0.0,[0.838688850402832],-0.90138,0.560973,1.0,0.615385,0.0,0,2.764239,zero-shot,What is OpenScholar and how is it used?,I'll review the provided context and ask quest...,OpenScholar is a research platform designed to...
1,0.164384,0.0,[0.8449322581291199],-0.906185,0.39981,1.0,0.352941,0.0,0,0.738377,few-shot,What is OpenScholar and how is it used?,"I've read the context, which appears to be a p...",OpenScholar is a research platform designed to...
2,0.119205,0.0,[0.8427514433860779],-1.253345,0.699241,0.714286,0.511905,0.0,0,0.755626,chain-of-thought,What is OpenScholar and how is it used?,I'd be happy to help you verify the informatio...,OpenScholar is a research platform designed to...
3,0.262626,0.098104,[0.8890885710716248],-0.500832,0.747911,1.0,0.595745,0.0,0,0.886178,zero-shot,What is the ELI5 dataset and what is it used for?,There is no ELI5 dataset mentioned in the prov...,The ELI5 (Explain Like I'm 5) dataset contains...
4,0.126984,0.0,[0.8362592458724976],-1.149957,0.562324,0.333333,0.5,0.0,0,0.768972,few-shot,What is the ELI5 dataset and what is it used for?,The paper does not mention the ELI5 dataset. T...,The ELI5 (Explain Like I'm 5) dataset contains...
5,0.116279,0.0,[0.8290041089057922],-0.51831,0.349708,1.0,0.731183,0.0,0,0.835151,chain-of-thought,What is the ELI5 dataset and what is it used for?,I'd be happy to help you understand the contex...,The ELI5 (Explain Like I'm 5) dataset contains...
6,0.058968,0.007305,[0.7882489562034607],-1.01979,0.550364,0.857143,0.716,0.0,0,1.305808,zero-shot,What is LLAMA and how does it differ from othe...,The text you provided appears to be a set of p...,LLAMA (Large Language Model Meta AI) is a fami...
7,0.031088,0.0,[0.8284688591957092],-0.616609,0.424361,0.173913,0.411765,0.0,0,1.756505,few-shot,What is LLAMA and how does it differ from othe...,Transformer architectures are a type of neural...,LLAMA (Large Language Model Meta AI) is a fami...
8,0.061625,0.008247,[0.8182438611984253],-0.779397,0.563842,0.153846,0.405714,0.0,0,1.183125,chain-of-thought,What is LLAMA and how does it differ from othe...,The provided text appears to be a collection o...,LLAMA (Large Language Model Meta AI) is a fami...
9,0.037037,0.0,[0.824767529964447],-1.003538,0.479608,1.0,0.772727,0.0,0,0.765095,zero-shot,How is citation generation performed for medic...,"The paper being referenced is likely ""How well...",Citation generation for medical data typically...
