In [1]:
# 1. Standard Libraries
import warnings
import logging
from functools import lru_cache

# 2. Data Manipulation Libraries
import pandas as pd
import numpy as np

# 3. Natural Language Processing (NLP) Libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate import meteor_score

# 4. Scoring and Evaluation Metrics
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from bert_score import score as bert_score

# 5. Transformers Library (for Model Loading and Tokenization)
from transformers import BitsAndBytesConfig
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from sentence_transformers import SentenceTransformer  # For embeddings


# 6. ChromaDBSearcher & ChatModel
from common.chroma_db import ChromaDBSearcher
from common.chat_model import ModelQA

# 7. Logging Configuration
# Configure the root logger with a CRITICAL threshold so that lower-severity
# records (DEBUG/INFO/WARNING/ERROR) are not emitted into the notebook output.
logging.basicConfig(level=logging.CRITICAL)

# 8. Warning Configuration
# Blanket filter: every Python warning is ignored from this point on.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries imported above - narrow the filter if those should be visible.
warnings.filterwarnings("ignore")


### Data Loading and Preprocessing

This cell loads and preprocesses a manually curated CSV file containing questions and answers. The steps ensure the data is cleaned and formatted correctly for further processing.

1. **Setting up Paths**:
   - The `ProjectRoot` variable is set to an empty string initially, but it can be configured to point to the root directory of your project.
   - The `DatasetRoot` variable concatenates the `ProjectRoot` with the folder `"manuals/"`, where the dataset is stored.

2. **Loading the CSV File**:
   - The file path to the curated CSV file is constructed using `DatasetRoot` and the file name `Fraggles_X700_2022_HCM_QA_Curated.csv`.
   - The `pandas` library (`pd.read_csv()`) is used to load the CSV file into a DataFrame (`df`). The file is read using the `latin1` encoding to handle any special characters properly.

3. **Data Cleaning**:
   - **Drop missing values**: Rows where either the `question` or `answer` column is missing are removed with `dropna()`.
   - **Remove duplicates**: Rows with duplicate `para_id` values are removed using `drop_duplicates()`, keeping only the first occurrence of each `para_id`.
   - **Reset index**: After cleaning, the index is reset so the DataFrame has a consistent numbering scheme. This is done twice (`reset_index(drop=False)`), so the earlier index values are kept as the `index` and `level_0` columns and the data is cleaned properly.

This step prepares the data for further processing by ensuring that only relevant and clean records are included.


In [2]:
# Provide the project root path (empty string = paths are relative to the
# notebook's working directory).
ProjectRoot = ""
DatasetRoot = ProjectRoot + "manuals/"

# Load the manually curated CSV file.
# DatasetRoot already ends with "/", so the file name is appended without a
# leading slash (the previous form produced "manuals//Fraggles...").
file_path = DatasetRoot + 'Fraggles_X700_2022_HCM_QA_Curated.csv'
# NOTE(review): the preview output of this cell shows text such as "ï¬rmly",
# which looks like UTF-8 bytes decoded as latin1 - confirm the file's real
# encoding before changing it, since it affects the reference answers scored later.
df = pd.read_csv(file_path, encoding='latin1')

# Keep only one question per chunk:
#   1. drop rows with a missing question or answer,
#   2. keep the first row for each para_id,
#   3. reset the index twice with drop=False, which keeps the previous index
#      values as the `index` and `level_0` columns.
# Written as a chain (no inplace mutation) so re-running the cell is idempotent.
df = (
    df.dropna(subset=['question', 'answer'])
      .drop_duplicates(subset=['para_id'], keep='first')
      .reset_index(drop=False)
      .reset_index(drop=False)
)

# Check the first few rows to understand the structure
df.head(5)

Unnamed: 0,level_0,index,page_number,para_id,sentence_chunk,question,answer
0,0,0,1,e6c45229dc996b6ba2ecb9ddb251f77c5ee3f2de93cc1d...,check that the rod is ï¬rmly placed to avoid ...,What should you do to prevent the hood from ac...,"Before closing the hood, make sure the support..."
1,1,3,1,69dedd5e3d5bbf7c8e9a12176d60789e110a65cd533d26...,"If you feel any resistance, stop and check fo...",What should you do if you feel resistance when...,"f you feel any resistance, stop immediately an..."
2,2,6,1,fc843f5ed9e92f0a42868ed7813e7c197902b5f2194f87...,Pull the hood release handle: The hood release...,Where is the hood release handle typically loc...,The hood release handle is usually located in ...
3,3,9,1,0c927bca6c80fe20d270cb43d62ea95099483181509007...,The support rod is typically located on the u...,Where is the support rod typically located in ...,The support rod is typically located on the un...
4,4,12,2,560865acae00a7a1c7b35d8567b972195079a68eddcf40...,Genuine FragglesX700HCM Motor Oil (for optimal...,What are the benefits of using Genuine Fraggle...,Genuine FragglesX700HCM Motor Oil is a premium...



### Overview of Evaluation Metrics:

- **BLEU (Bilingual Evaluation Understudy)**: Measures how many n-grams in the generated text match n-grams in the reference text. Commonly used in machine translation evaluation.
  
- **ROUGE (Recall-Oriented Understudy for Gisting Evaluation)**: Measures the overlap of n-grams, word sequences, and word pairs between the reference and generated text. Used for summarization tasks.
  
- **BERTScore**: Leverages BERT embeddings for a more context-sensitive evaluation of text generation.
  
- **METEOR**: Takes into account synonymy, stemming, and word order in its evaluation of generated text.
  
- **Cosine Similarity**: A measure of similarity between two vectors, commonly used in evaluating semantic similarity between sentences.
Together, these metrics give a complementary view of how well each model is performing.


In [3]:
def evaluate_bleu(reference: str, candidate: str) -> float:
    """Return the sentence-level BLEU score of ``candidate`` against ``reference``.

    Both strings are lower-cased and split with ``nltk.word_tokenize``; the
    reference is passed to ``sentence_bleu`` as the only reference, using that
    function's default settings.

    Args:
        reference: Ground-truth answer text.
        candidate: Generated answer text to score.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    ref_tokens, cand_tokens = (
        nltk.word_tokenize(text.lower()) for text in (reference, candidate)
    )
    return sentence_bleu([ref_tokens], cand_tokens)

@lru_cache(maxsize=1)
def _get_rouge_scorer():
    """Build the ROUGE-1 / ROUGE-2 / ROUGE-L scorer (with stemming) once and reuse it."""
    return rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


def evaluate_rouge(reference: str, candidate: str) -> dict:
    """Score ``candidate`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    The scorer is created on first use and cached, so calling this function
    once per question/model pair does not rebuild it every time.

    Args:
        reference: Ground-truth answer text.
        candidate: Generated answer text to score.

    Returns:
        The mapping produced by ``RougeScorer.score``, keyed by ``'rouge1'``,
        ``'rouge2'`` and ``'rougeL'``.
    """
    return _get_rouge_scorer().score(reference, candidate)

def evaluate_bertscore(reference, generated):
    """Compute BERTScore for a single generated/reference pair.

    The two strings are wrapped in one-element lists and passed to
    ``bert_score`` with ``lang='en'`` and ``verbose=True``.

    Args:
        reference: Ground-truth answer text.
        generated: Generated answer text to score.

    Returns:
        A ``(precision, recall, f1)`` tuple of Python scalars, obtained by
        calling ``.item()`` on each returned value.
    """
    precision, recall, f1 = bert_score([generated], [reference], lang='en', verbose=True)
    return tuple(value.item() for value in (precision, recall, f1))

def evaluate_meteor(reference, generated):
    """Compute the METEOR score of ``generated`` against ``reference``.

    Both texts are tokenized with ``word_tokenize`` (no lower-casing is applied
    here) and handed to ``meteor_score.single_meteor_score``.

    Args:
        reference: Ground-truth answer text.
        generated: Generated answer text to score.

    Returns:
        The value returned by ``single_meteor_score``.
    """
    return meteor_score.single_meteor_score(
        word_tokenize(reference),
        word_tokenize(generated),
    )

def calculate_cosine_similarity(reference, generated):
    """Cosine similarity between the embeddings of two texts.

    Relies on the module-level ``embedding_model``, which is not created in
    this function and must be assigned before the first call. Each text is
    encoded on its own as a one-element batch.

    Args:
        reference: Ground-truth answer text.
        generated: Generated answer text to compare.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    ref_vec, gen_vec = (
        embedding_model.encode([text]) for text in (reference, generated)
    )
    return cosine_similarity(ref_vec, gen_vec)[0][0]

# **Model Evaluation Overview**

This process evaluates text generation models using multiple NLP metrics. The evaluation steps involve generating answers to questions and scoring the generated responses across various models.

## **Model List**
- **Llama-2**: `meta-llama/Llama-2-7b-chat-hf`
- **gemma-2**: `google/gemma-2b-it`
- **GPT2**: `openai-community/gpt2-medium`
- **phi2**: `microsoft/phi-2`

---

## **Workflow**
1. **Load Data**: Curated CSV with questions and answers.
2. **Process**: For each chunk (grouped by `para_id`), evaluate answers generated by multiple models.
3. **Metrics Evaluation**: Use BLEU, ROUGE, BERTScore, METEOR, and Cosine Similarity to score each generated answer.
4. **Result Storage**: Collect all scores in a structured format (DataFrame).

In [4]:
# Ensure the NLTK resources used by the metric helpers are available.
# 'punkt' backs word_tokenize (BLEU and METEOR helpers).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')
# METEOR's synonym matching reads the WordNet corpus; without it
# single_meteor_score fails with a LookupError on the first scored answer.
nltk.download('wordnet')
warnings.filterwarnings("ignore", message=".*weights.*initialized.*")

# Initialize the embedding model (read as a global by calculate_cosine_similarity)
# and the ChromaDB searcher (imported from common.chroma_db) handed to every ModelQA.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
searcher = ChromaDBSearcher()

# Short model name -> Hugging Face model id, checked in order by get_model_id.
_MODEL_NAME_TO_ID = (
    ("Llama-2", "meta-llama/Llama-2-7b-chat-hf"),
    ("gemma-2", "google/gemma-2b-it"),
    ('GPT1', "openai-community/openai-gpt"),
    ("GPT2", "openai-community/gpt2-medium"),
    ("phi2", "microsoft/phi-2"),
)


def get_model_id(model_name):
    """Translate a short model name into its Hugging Face model id.

    Args:
        model_name: One of ``"Llama-2"``, ``"gemma-2"``, ``"GPT1"``,
            ``"GPT2"`` or ``"phi2"``.

    Returns:
        The model id string associated with ``model_name``.

    Raises:
        ValueError: If ``model_name`` is not one of the known names.
    """
    for known_name, model_id in _MODEL_NAME_TO_ID:
        if model_name == known_name:
            return model_id
    raise ValueError(f"Unknown model name: {model_name}")

# One dict per (model, question) pair is collected here and turned into eval_df at the end.
evaluation_results = []
models = ["Llama-2", "gemma-2", "GPT2", "phi2"]
# model id -> ModelQA instance, so each model is constructed only once.
model_cache  = {}

# Walk the dataframe one para_id group at a time; chunk_index is the group's
# position in the groupby iteration order, not the para_id itself.
for chunk_index, (_, chunk_data) in enumerate(df.groupby('para_id')):

    # Every question/answer pair belonging to this chunk
    for question, reference_answer in zip(chunk_data['question'], chunk_data['answer']):

        # Ask each model the same question and score its answer
        for model_name in models:
            model_id = get_model_id(model_name)

            # Build the ModelQA wrapper on first use, then reuse it from the cache
            if model_cache.get(model_id) is None:
                model_cache[model_id] = ModelQA(model_id=model_id, searcher=searcher)
            model_qa = model_cache[model_id]

            generated_answer = model_qa.ask(document_source="document_source", query=question)

            # Metrics are computed in the same order as the record's columns:
            # BLEU, ROUGE, BERTScore, METEOR, cosine similarity.
            record = {
                "Model": model_name,
                "Chunk": chunk_index,
                "Question": question,
            }
            record["BLEU"] = evaluate_bleu(reference_answer, generated_answer)

            rouge_scores = evaluate_rouge(reference_answer, generated_answer)
            record["ROUGE-1"] = rouge_scores['rouge1'].fmeasure
            record["ROUGE-2"] = rouge_scores['rouge2'].fmeasure
            record["ROUGE-L"] = rouge_scores['rougeL'].fmeasure

            bert_p, bert_r, bert_f1 = evaluate_bertscore(reference_answer, generated_answer)
            record["BERTScore_Precision"] = bert_p
            record["BERTScore_Recall"] = bert_r
            record["BERTScore_F1"] = bert_f1

            record["METEOR"] = evaluate_meteor(reference_answer, generated_answer)
            record["Cosine_Similarity"] = calculate_cosine_similarity(reference_answer, generated_answer)

            evaluation_results.append(record)

# Build the results table in one step from the collected records
eval_df = pd.DataFrame(evaluation_results)


[nltk_data] Downloading package punkt to /home/vinoj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'SentenceTransformer' is not defined

## **Plotting the Evaluation Metrics for BLEU, ROUGE-1, ROUGE-2, and ROUGE-L**

This bar chart compares the performance of different models (e.g., Llama-2, GPT2, phi2) based on several evaluation metrics: BLEU, ROUGE-1, ROUGE-2, and ROUGE-L. Each model is evaluated on these metrics, and the results are displayed as grouped bars for easy comparison. 

- **X-Axis**: Different models.
- **Y-Axis**: Evaluation score values (BLEU, ROUGE-1, ROUGE-2, ROUGE-L).
- **Grouped Bars**: Each model has separate bars for each metric, color-coded for clarity.

This visualization allows us to quickly assess how each model performs on various NLP tasks, aiding in model comparison and selection.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Reshape to long format: one row per (Model, Metric, Score) so seaborn can
# draw one bar per metric within each model group.
melted_scores = eval_df.melt(
    id_vars="Model",
    value_vars=["BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L"],
    var_name="Metric",
    value_name="Score",
)

# Set the plotting style
sns.set(style="whitegrid")

# Create the grouped bar chart on an explicit Axes. Each bar is seaborn's
# default aggregate of the per-question scores for that model and metric.
fig, ax = plt.subplots(figsize=(24, 8))
sns.barplot(x="Model", y="Score", hue="Metric", data=melted_scores, palette="Set2", ax=ax)

# Title and labels
ax.set_title("Comparison of BLEU and ROUGE Scores for Different Models", fontsize=16)
ax.set_ylabel("Score", fontsize=12)
ax.set_xlabel("Model", fontsize=12)

# Rotate x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

# Label the bars through the bar containers rather than ax.patches: ax.patches
# holds every patch on the Axes, so non-bar patches would also get a label.
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f", padding=3, fontsize=12, color="black")

# Display the plot
ax.legend(title="Metric", loc="upper right")
fig.tight_layout()
plt.show()


# Plotting the Evaluation Metrics for BERT, METEOR &  Cosine Similarity Scores


This chart compares multiple models based on several evaluation metrics:

- **BERTScore (Precision, Recall, F1)**
- **METEOR Score**
- **Cosine Similarity**

### Steps:

1. **Data Preparation**: The `eval_df` DataFrame is reshaped into a long format, with one row per model and metric: the metric name (BERTScore, METEOR, Cosine Similarity) goes in a `Metric` column and its value in a `Score` column.
2. **Visualization**: 
   - A grouped bar chart is created using `seaborn`, with models on the x-axis and scores on the y-axis.
   - Bars are color-coded by metric, and annotations display exact values for each bar.
   - X-axis labels are rotated for readability, and a legend distinguishes metrics.

### Outcome:
The chart provides a clear visual comparison of model performance across various evaluation metrics, helping assess the strengths of each model and their answer generation quality.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Reshape to long format: one row per (Model, Metric, Score) so seaborn can
# draw one bar per metric within each model group.
melted_scores = eval_df.melt(
    id_vars="Model",
    value_vars=['BERTScore_Precision', 'BERTScore_Recall', 'BERTScore_F1', 'METEOR', 'Cosine_Similarity'],
    var_name="Metric",
    value_name="Score",
)

# Set the plotting style
sns.set(style="whitegrid")

# Create the grouped bar chart on an explicit Axes. Each bar is seaborn's
# default aggregate of the per-question scores for that model and metric.
fig, ax = plt.subplots(figsize=(24, 8))
sns.barplot(x="Model", y="Score", hue="Metric", data=melted_scores, palette="Set2", ax=ax)

# Title and labels
ax.set_title("Comparison of BERT, METEOR &  Cosine Similarity Scores for Different Models", fontsize=16)
ax.set_ylabel("Score", fontsize=12)
ax.set_xlabel("Model", fontsize=12)

# Rotate x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

# Label the bars through the bar containers rather than ax.patches: ax.patches
# holds every patch on the Axes, so non-bar patches would also get a label.
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f", padding=3, fontsize=12, color="black")

# Display the plot
ax.legend(title="Metric", loc="upper right")
fig.tight_layout()
plt.show()
