# BioASQ Dataset Exploration

This notebook explores the rag-mini-bioasq dataset structure and provides visualizations.

In [None]:
import ast
import statistics
import sys
from collections import Counter

sys.path.insert(0, '../src')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datasets import load_dataset

# Use seaborn's 'whitegrid' style for the figures drawn in this notebook
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output
%matplotlib inline

## 1. Load Dataset

In [None]:
# Load the question/answer portion of the dataset.
# NOTE(review): the Hub repository appears to publish more than one
# configuration ('question-answer-passages' and 'text-corpus') with no default,
# in which case load_dataset() needs the configuration name to be given
# explicitly -- confirm the name against the dataset card.
dataset = load_dataset('rag-datasets/rag-mini-bioasq', 'question-answer-passages')
print(f"Dataset splits: {list(dataset.keys())}")

In [None]:
# For every split: report its size, its declared features and, when the split
# is non-empty, the keys of its first row.
for split_name, split_data in dataset.items():
    n_rows = len(split_data)
    print(f"\n{split_name} split:")
    print(f"  Number of samples: {n_rows}")
    print(f"  Features: {split_data.features}")
    if n_rows > 0:
        first_row = split_data[0]
        print(f"  Sample keys: {list(first_row.keys())}")

## 2. Examine Sample Data

In [None]:
# Preview the first test example: strings are cut to 200 characters, lists are
# summarised by length, anything else is shown by type name only.
sample = dataset['test'][0]
print("Sample structure:")
for field, content in sample.items():
    if isinstance(content, list):
        print(f"  {field}: list of {len(content)} items")
    elif isinstance(content, str):
        if len(content) > 200:
            print(f"  {field}: {content[:200]}...")
        else:
            print(f"  {field}: {content}")
    else:
        print(f"  {field}: {type(content).__name__}")

In [None]:
# Display the first few questions with their (possibly truncated) answers
print("Sample Questions and Answers:")
print("="*60)
test_split = dataset['test']
for idx in range(min(5, len(test_split))):
    item = test_split[idx]
    question = item.get('question', item.get('query', 'N/A'))
    answer = item.get('answer', item.get('answers', 'N/A'))
    if isinstance(answer, list):
        # Several answers: show the first one, or a placeholder if there are none
        answer = answer[0] if answer else 'N/A'

    # Convert to text once, so that the length check and the slice operate on
    # the same value even when the answer is not a plain string.
    answer_text = str(answer)
    if len(answer_text) > 200:
        answer_text = f"{answer_text[:200]}..."

    print(f"\nQ{idx+1}: {question}")
    print(f"A{idx+1}: {answer_text}")

## 3. Analyze Relevant Passages

In [None]:
# Count relevant passages per question
def relevant_ids_of(item):
    """Return the relevant passage ids of one example as a list.

    Looks up 'relevant_passage_ids', then 'positive_passage_ids', then
    'relevant_passages'. Accepts a list of ids, a list of dicts carrying
    'passage_id' / 'id', a single id, or a string holding a Python-literal
    list. A missing or None value yields an empty list.
    """
    raw = item.get('relevant_passage_ids',
                   item.get('positive_passage_ids',
                            item.get('relevant_passages', [])))
    if raw is None:
        return []
    if isinstance(raw, str):
        # NOTE(review): presumably some revisions serialise the id list as text
        # such as "[1, 2, 3]" -- confirm against the loaded features. Without
        # this, len() would count characters instead of ids.
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Not a literal: treat a non-blank string as one opaque id
            raw = [raw] if raw.strip() else []
    if not isinstance(raw, (list, tuple)):
        return [raw]
    if raw and isinstance(raw[0], dict):
        return [p.get('passage_id', p.get('id')) for p in raw]
    return list(raw)


relevant_counts = [len(relevant_ids_of(item)) for item in dataset['test']]

print("Number of relevant passages per question:")
if relevant_counts:
    print(f"  Min: {min(relevant_counts)}")
    print(f"  Max: {max(relevant_counts)}")
    print(f"  Mean: {sum(relevant_counts)/len(relevant_counts):.2f}")
    # statistics.median averages the two middle values for even-sized input
    print(f"  Median: {statistics.median(relevant_counts)}")
else:
    # min()/max() and the mean are undefined for an empty split
    print("  (no questions found)")

In [None]:
# Histogram of relevant-passage counts (one bin per integer count) with the
# mean drawn as a dashed red line.
fig, ax = plt.subplots(figsize=(10, 5))
count_bins = range(0, max(relevant_counts) + 2)
ax.hist(relevant_counts, bins=count_bins, edgecolor='black', alpha=0.7)
ax.set_xlabel('Number of Relevant Passages')
ax.set_ylabel('Number of Questions')
ax.set_title('Distribution of Relevant Passages per Question')

mean_relevant = sum(relevant_counts) / len(relevant_counts)
ax.axvline(mean_relevant, color='r', linestyle='--', label=f'Mean: {mean_relevant:.2f}')
ax.legend()

fig.tight_layout()
plt.show()

## 4. Analyze Passage Corpus

In [None]:
# Collect all unique passages embedded in the test examples, keyed by id.
# NOTE(review): if this reports 0, the examples carry no 'passages' field;
# presumably the passage texts then live in a separate configuration -- verify.
passages = {}
for item in dataset['test']:
    # A missing or None 'passages' value is treated as "no passages"
    for p in item.get('passages') or []:
        pid = p.get('passage_id', p.get('id'))
        text = p.get('passage_text', p.get('text'))
        # Test the id against None rather than truthiness so that an id of 0
        # is kept; entries with empty text are still skipped.
        if pid is not None and text:
            passages[pid] = text

print(f"Total unique passages: {len(passages)}")

In [None]:
# Word-count statistics and histogram for the collected passages.
# Skipped entirely when no passages were collected.
if passages:
    passage_lengths = [len(body.split()) for body in passages.values()]

    print("Passage word counts:")
    print(f"  Min: {min(passage_lengths)}")
    print(f"  Max: {max(passage_lengths)}")
    mean_passage_words = sum(passage_lengths) / len(passage_lengths)
    print(f"  Mean: {mean_passage_words:.2f}")

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(passage_lengths, bins=50, edgecolor='black', alpha=0.7)
    ax.set_xlabel('Passage Length (words)')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Passage Lengths')
    fig.tight_layout()
    plt.show()

## 5. Question Analysis

In [None]:
# Gather the question text of every test example (falling back to 'query',
# then to an empty string) and measure each one in whitespace-separated words.
questions = []
for item in dataset['test']:
    questions.append(item.get('question', item.get('query', '')))
question_lengths = [len(text.split()) for text in questions]

print("Question word counts:")
print(f"  Min: {min(question_lengths)}")
print(f"  Max: {max(question_lengths)}")
mean_question_words = sum(question_lengths) / len(question_lengths)
print(f"  Mean: {mean_question_words:.2f}")

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(question_lengths, bins=30, edgecolor='black', alpha=0.7)
ax.set_xlabel('Question Length (words)')
ax.set_ylabel('Count')
ax.set_title('Distribution of Question Lengths')
fig.tight_layout()
plt.show()

In [None]:
# Common question patterns: tally the lower-cased first word of each question.
# Each question is tokenised once; blank questions have no first word and are
# left out instead of being counted as an empty "starter".
first_words = [tokens[0].lower() for tokens in (q.split() for q in questions) if tokens]
word_counts = Counter(first_words)

print("Most common question starters:")
for word, count in word_counts.most_common(10):
    print(f"  {word}: {count}")

## 6. Summary

In [None]:
# Recap of the headline figures computed in the earlier sections
rule = "=" * 60
print(rule, "DATASET SUMMARY", rule, sep="\n")

print(f"\nQuestions: {len(dataset['test'])}")
print(f"Unique passages: {len(passages)}")

avg_relevant = sum(relevant_counts) / len(relevant_counts)
print(f"Avg relevant passages per question: {avg_relevant:.2f}")

avg_question_words = sum(question_lengths) / len(question_lengths)
print(f"Avg question length: {avg_question_words:.1f} words")

# passage_lengths only exists when at least one passage was collected
if passages:
    avg_passage_words = sum(passage_lengths) / len(passage_lengths)
    print(f"Avg passage length: {avg_passage_words:.1f} words")