# Comparison of RAG vs No-RAG Performance
This experiment evaluates the effectiveness of incorporating Retrieval-Augmented Generation (RAG) in answering predefined cybersecurity questions from blog posts.

**Experiment Setup:**

1. With RAG: The model answers questions based on relevant passages retrieved from the blog (retrieval-augmented).

2. Without RAG: The model answers using the full blog content directly, without retrieval or context filtering.

3. Both setups were executed using the mistralai/mistral-small-3-1-24b-instruct-2503 model, a compact 24 billion parameter LLM.

4. The dataset includes 35 blogs and 350 predefined questions, ensuring a consistent and fair evaluation across both methods.

5. Answers were graded using meta-llama/llama-3-70b-instruct (70B parameters), assigning a grade from 0 (incorrect) to 1 (correct) with partial values indicating partly correct answers.

6. The evaluation emphasizes answer correctness (not exact match to ground truth), using a WatsonX-based grading system.

This comparison tests whether small, efficient models like Mistral Small can benefit from RAG to improve performance and reduce the dependency on large, expensive LLMs.


# Without RAG:
Using the LLM directly on the full article without retrieval.

In [None]:
import json
import pandas as pd

# Summary of the No-RAG run: loads the graded answers, prints blog/question
# counts, the mean grade and the evaluation-label breakdown, then shows a
# sample of answers and the per-question share of 'Correct' evaluations.
# Reads the columns 'question', 'ground_truth_answer', 'rag_answer',
# 'evaluation' and 'grade' from each record.

# Load the evaluation file
with open('/content/answers_without_rag.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Resolve the blog URL column. Prefer 'article_url', fall back to 'url', and
# only use None when the records carry neither. (Previously 'blog_url' was
# always created first, so the fallback could never run and the unique-blog
# count silently became 0 whenever 'article_url' was absent.)
if 'article_url' in df.columns:
    df['blog_url'] = df['article_url']
elif 'url' in df.columns:
    df['blog_url'] = df['url']
else:
    df['blog_url'] = None

# Basic statistics
num_questions = len(df)
num_blogs = df['blog_url'].nunique()
eval_counts = df['evaluation'].value_counts()
percentages = df['evaluation'].value_counts(normalize=True) * 100

print(f"Total unique blogs: {num_blogs}")
print(f"Total questions evaluated: {num_questions}\n")

# Overall average grade
overall_avg_grade = df['grade'].mean()
print(f"Overall average grade: {overall_avg_grade:.3f}\n")

print("Evaluation Breakdown:")
for label in eval_counts.index:
    print(f" - {label}: {eval_counts[label]} ({percentages[label]:.1f}%)")

# Show side-by-side answers. The file stores the model output under
# 'rag_answer' even for this No-RAG run, so give it a neutral name for display.
df = df.rename(columns={'rag_answer': 'model_answer'})
print("\nSample comparison of No-RAG vs Ground Truth:")
display(df[['question', 'ground_truth_answer', 'model_answer', 'evaluation']].head(10))


# Accuracy by question: fraction of rows per question text labelled 'Correct'
# (any other label counts as not correct).
accuracy_per_question = df.groupby('question')['evaluation'].apply(
    lambda x: (x == 'Correct').sum() / len(x)
).reset_index().rename(columns={'evaluation': 'accuracy'})
accuracy_per_question = accuracy_per_question.sort_values(by='accuracy', ascending=False)

print("\nAccuracy per question:")
display(accuracy_per_question)


# With RAG:
Using retrieval-augmented generation over the article content.

In [None]:
import json
import pandas as pd

# Summary of the RAG run: loads the graded answers, prints blog/question
# counts, the mean grade and the evaluation-label breakdown, then shows a
# sample of answers and the per-question share of 'Correct' evaluations.
# Reads the columns 'question', 'ground_truth_answer', 'rag_answer',
# 'evaluation' and 'grade' from each record.

# Load the evaluation file
with open('/content/answers_with_rag.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Resolve the blog URL column. Prefer 'article_url', fall back to 'url', and
# only use None when the records carry neither. (Previously 'blog_url' was
# always created first, so the fallback could never run and the unique-blog
# count silently became 0 whenever 'article_url' was absent.)
if 'article_url' in df.columns:
    df['blog_url'] = df['article_url']
elif 'url' in df.columns:
    df['blog_url'] = df['url']
else:
    df['blog_url'] = None

# Basic statistics
num_questions = len(df)
num_blogs = df['blog_url'].nunique()
eval_counts = df['evaluation'].value_counts()
percentages = df['evaluation'].value_counts(normalize=True) * 100

print(f"Total unique blogs: {num_blogs}")
print(f"Total questions evaluated: {num_questions}\n")

# Overall average grade
overall_avg_grade = df['grade'].mean()
print(f"Overall average grade: {overall_avg_grade:.3f}\n")

print("Evaluation Breakdown:")
for label in eval_counts.index:
    print(f" - {label}: {eval_counts[label]} ({percentages[label]:.1f}%)")

# Show side-by-side answers
print("\nSample comparison of RAG vs Ground Truth:")
display(df[['question', 'ground_truth_answer', 'rag_answer', 'evaluation']].head(10))

# Accuracy by question: fraction of rows per question text labelled 'Correct'
# (any other label counts as not correct).
accuracy_per_question = df.groupby('question')['evaluation'].apply(
    lambda x: (x == 'Correct').sum() / len(x)
).reset_index().rename(columns={'evaluation': 'accuracy'})
accuracy_per_question = accuracy_per_question.sort_values(by='accuracy', ascending=False)

print("\nAccuracy per question:")
display(accuracy_per_question)


# RAG vs No-RAG Performance

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Side-by-side comparison of the two runs: pairs each RAG answer with the
# corresponding No-RAG answer, shows a sample of the pairs, and plots how many
# answers received each evaluation label in each run.

# Load both evaluation files from local paths
with open('/content/answers_with_rag.json', 'r', encoding='utf-8') as f:
    with_rag = pd.DataFrame(json.load(f))

with open('/content/answers_without_rag.json', 'r', encoding='utf-8') as f:
    without_rag = pd.DataFrame(json.load(f))

# Rename answer columns to distinguish
with_rag = with_rag.rename(columns={'rag_answer': 'answer_with_rag'})
without_rag = without_rag.rename(columns={'rag_answer': 'answer_without_rag'})

# Pair rows on the question text plus the blog URL when both files carry one.
# Merging on 'question' alone multiplies rows whenever the same question is
# asked of several blogs (every RAG row would match every No-RAG row that
# shares the question text).
url_key = next(
    (col for col in ('article_url', 'url')
     if col in with_rag.columns and col in without_rag.columns),
    None,
)
merge_keys = ['question'] if url_key is None else ['question', url_key]

# If the keys still do not identify rows uniquely, the merge will fan out;
# say so instead of silently showing mismatched pairs.
if (with_rag.duplicated(subset=merge_keys).any()
        or without_rag.duplicated(subset=merge_keys).any()):
    print(f"Warning: merge keys {merge_keys} are not unique; "
          "merged rows may pair answers from different blogs.")

# Merge the two runs; shared column names get '_with' / '_without' suffixes
# (the grade columns end up as 'grade_with' and 'grade_without').
merged_df = pd.merge(
    with_rag[merge_keys + ['answer_with_rag', 'evaluation', 'grade']],
    without_rag[merge_keys + ['answer_without_rag', 'evaluation', 'grade']],
    on=merge_keys,
    suffixes=('_with', '_without')
).rename(columns={
    'evaluation_with': 'eval_with_rag',
    'evaluation_without': 'eval_without_rag'
})

# Display comparison table
display(merged_df[['question', 'answer_without_rag', 'answer_with_rag']].head(10))

# Bar plot comparing evaluation counts
eval_counts = pd.DataFrame({
    'Without RAG': without_rag['evaluation'].value_counts(),
    'With RAG': with_rag['evaluation'].value_counts()
}).fillna(0)

# Reorder to a fixed label order. fill_value=0 keeps a label that is absent
# from both runs as a zero-height bar instead of NaN. Labels other than these
# three are not plotted.
eval_counts = eval_counts.reindex(
    ['Correct', 'Partially Correct', 'Incorrect'], fill_value=0
)

# Plot
eval_counts.plot(kind='bar', color=['lightcoral', 'mediumseagreen'], edgecolor='black')
plt.title("Evaluation Results: With vs Without RAG")
plt.ylabel("Number of Answers")
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
