In [None]:
import sys
sys.path.insert(0, '../src')

from naive_rag import NaiveRAG
import pandas as pd
import matplotlib.pyplot as plt

# Load the corpus and the question/answer pairs through the project's NaiveRAG helper.
rag = NaiveRAG()
documents, qa_pairs = rag.load_dataset()

print("Dataset Loaded, yes")
print("Document Length: ", f"{len(documents)}")
# Count the QA pairs themselves here, not the documents.
print("QA Pairs: ", f"{len(qa_pairs)}")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:naive_rag:Loading RAG Mini Wikipedia dataset...
INFO:naive_rag:Loading text corpus...


In [None]:
# Word count of every document (tokens separated by whitespace).
doc_lengths = [len(doc.split()) for doc in documents]
avg_doc_length = sum(doc_lengths) / len(doc_lengths)

print("Document statistics:")
print(f"Total Documents: {len(documents)}")
# Use the same unit label ("words") as the min/max lines below.
print(f"Average length: {avg_doc_length:.1f} words")
print(f"Minimum length: {min(doc_lengths)} words")
print(f"Maximum length: {max(doc_lengths)} words")

# Visualize how document lengths are distributed across the corpus.
# The explicit fig/ax interface keeps the figure self-contained in this cell.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(doc_lengths, bins=50, edgecolor='black')
ax.set_xlabel('Document Length (words)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Document Lengths')
ax.grid(alpha=0.3)
plt.show()

In [None]:
# Preview the first few documents, each truncated to its first 250 characters.
print('Sample Documents:\n')
# Cap the preview at the corpus size so a corpus with fewer than 5 documents
# does not raise IndexError.
for i in range(min(5, len(documents))):
    print(f"Document {i+1}")
    print(documents[i][:250] + '...\n')

In [None]:
# Summarize the question/answer pairs: counts, word-length statistics, and a preview.
print("QA Pair Analysis")
print(f"\nTotal Questions: {len(qa_pairs)}")

# Answer lengths in whitespace-delimited words.
answer_lengths = [len(qa['answer'].split()) for qa in qa_pairs]
print(f"Answer length - Average: {sum(answer_lengths)/len(answer_lengths):.2f} words")
print(f"Answer length - Min: {min(answer_lengths)} words")
print(f"Answer length - Max: {max(answer_lengths)} words")

# Question lengths in whitespace-delimited words.
question_lengths = [len(qa['question'].split()) for qa in qa_pairs]
print(f"Question length - Average: {sum(question_lengths)/len(question_lengths):.2f} words")

print("\n")
print("Sample QA Pairs")
# Cap the preview at the number of available pairs so fewer than 10 pairs
# does not raise IndexError.
for i in range(min(10, len(qa_pairs))):
    print(f"\nQ{i+1}: {qa_pairs[i]['question']}")
    print(f"A{i+1}: {qa_pairs[i]['answer']}")

In [None]:
from collections import Counter  # used below to tally the sampled answers

# Data quality check
print("Data Quality Assessment")

# Single pass over the corpus collecting the indices of:
#   - documents that are empty or whitespace-only
#   - very short documents (fewer than 10 words), which may cause issues later
empty_docs = []
short_docs = []
for idx, text in enumerate(documents):
    if not text.strip():
        empty_docs.append(idx)
    if len(text.split()) < 10:
        short_docs.append(idx)

print("\nEmpty Documents: ", len(empty_docs))
print("\nVery short documents - less than 10 words: ", len(short_docs))

# Answer distribution over the first 100 QA pairs (or all of them if fewer exist).
sample_size = min(100, len(qa_pairs))
answers_sample = []
for pos in range(sample_size):
    answers_sample.append(qa_pairs[pos]['answer'])
answer_types = Counter(answers_sample)

print("\nMost common answer types (sample of 100):")
for answer_text, occurrences in answer_types.most_common(10):
    print(f" '{answer_text}' : {occurrences}")

# Count QA pairs whose question or answer is absent or falsy (e.g. None or '').
missing_questions = 0
missing_answers = 0
for pair in qa_pairs:
    if not pair.get('question'):
        missing_questions += 1
    if not pair.get('answer'):
        missing_answers += 1
print("Number of missing questions, answers: ", missing_questions, ",", missing_answers)

In [None]:
# Print a summary of the exploration. The numeric values come from variables
# computed in earlier cells (documents, qa_pairs, doc_lengths, question_lengths);
# the remaining lines are hand-written observations.
print("Key Insights\n")

print("1. Dataset Scale:")
print(f"   - {len(documents)} Wikipedia passages")
print(f"   - {len(qa_pairs)} question-answer pairs")
print(f"   - Avg document: {sum(doc_lengths)/len(doc_lengths):.0f} words")

print("\n2. Document Characteristics:")
print(f"   - Range: {min(doc_lengths)}-{max(doc_lengths)} words")
print("   - All documents are informative paragraphs from Wikipedia")
print("   - Normally structured and factual")

print("\n3. Q&A Characteristics:")
print(f"   - Average question length: {sum(question_lengths)/len(question_lengths):.1f} words")
print("   - Answers vary from single words to short phrases")
print("   - Mix of factual, yes/no, and descriptive questions")

print("\n4. Challenges:")
print("   - Shorter documents may not allow context matching for retrieval.")
print("   - Exact match metrics may not be reflective of good/ bad implementation - since 'Yes', 'yes' both exist. F1 score will be comparatively more realistic as a metric of good/ bad implementation.")
print("   - Answer length is short - so verbose LLM behaviour should be avoided.")

In [None]:
# Import os in this cell: no earlier cell imports it, so without this the cell
# raises NameError on a fresh kernel (Restart & Run All).
import os

# Show where relative paths such as '../src' and '../data' resolve from.
print("Current working directory:", os.getcwd())

In [None]:
# Persist the headline exploration statistics as JSON for documentation.
import json
import os

# Output location, relative to the notebook's working directory.
stats_dir = '../data/processed'
stats_path = '../data/processed/exploration_stats.json'

# Build the summary one entry at a time; insertion order is the order
# the keys appear in the written JSON file.
exploration_stats = {}
exploration_stats['total_documents'] = len(documents)
exploration_stats['total_qa_pairs'] = len(qa_pairs)
exploration_stats['avg_doc_length'] = sum(doc_lengths) / len(doc_lengths)
exploration_stats['min_doc_length'] = min(doc_lengths)
exploration_stats['max_doc_length'] = max(doc_lengths)
exploration_stats['avg_question_length'] = sum(question_lengths) / len(question_lengths)
exploration_stats['avg_answer_length'] = sum(answer_lengths) / len(answer_lengths)

# Create the data directory if it does not exist yet.
os.makedirs(stats_dir, exist_ok=True)

with open(stats_path, 'w') as stats_file:
    json.dump(exploration_stats, stats_file, indent=2)

print("Exploration statistics saved to data/processed/exploration_stats.json")