In [3]:
import pandas as pd
from datasets import load_dataset
def _passage_text(gold_passage) -> str:
    """Return the text of a gold passage given as a dict, string, or list."""
    if isinstance(gold_passage, dict):
        # Prefer 'text'; fall back to 'content' when 'text' is missing,
        # empty, or None (avoids producing the literal string "None").
        return str(gold_passage.get('text') or gold_passage.get('content') or '')
    if isinstance(gold_passage, str):
        return gold_passage
    if isinstance(gold_passage, list):
        return ' '.join(str(item) for item in gold_passage)
    return ""


def _context_text(context) -> str:
    """Flatten a turn's 'context' value (list or scalar) into one string."""
    if isinstance(context, list):
        return ' '.join(str(item) for item in context)
    return str(context)


def _load_topicqa_dataset(max_conversations: int, *, dataset=None):
    """Load TopiOCQA and group its training turns into conversations.

    Args:
        max_conversations: Maximum number of distinct conversations to read
            from the 'train' split. Values <= 0 produce an empty result.
        dataset: Optional pre-loaded mapping exposing a 'train' iterable of
            example dicts. When None, the dataset is fetched with
            ``load_dataset("McGill-NLP/TopiOCQA", ...)``.

    Returns:
        A dict with two keys:
        - 'conversations': list of dicts with 'conversation_id', 'turns'
          (each turn has 'turn_id', 'question', 'answer', 'topic',
          'context') and 'topic' (the topic of the first turn). Only
          conversations with at least two turns are kept.
        - 'corpus': dict mapping conversation number to one document string,
          taken from the first turn's gold passage, or from the first
          non-empty turn context when the passage has no text. Documents of
          10 characters or fewer (after stripping) are dropped.
    """
    print("Loading TopiOCQA dataset...")
    if dataset is None:
        # NOTE(review): trust_remote_code=True runs the dataset's loading
        # script from the Hub; confirm this is still required/acceptable.
        dataset = load_dataset("McGill-NLP/TopiOCQA", trust_remote_code=True)

    # Group turns by conversation number.
    conversation_turns = {}
    for example in dataset['train']:
        conv_no = example['Conversation_no']

        if conv_no not in conversation_turns:
            # Stop at the first turn of an extra conversation rather than
            # after a fixed turn budget, so long conversations are never cut
            # short. Assumes the turns of one conversation are stored
            # contiguously in the split -- TODO confirm.
            if len(conversation_turns) >= max_conversations:
                break
            conversation_turns[conv_no] = {}

        conversation_turns[conv_no][example['Turn_no']] = {
            'question': str(example['Question']),
            'answer': str(example['Answer']),
            'topic': str(example['Topic']),
            'context': example.get('Context', []),
            'gold_passage': example.get('Gold_passage', {}),
        }

    # Build conversations (and one corpus document each) from grouped turns.
    conversations = []
    corpus = {}

    for conv_no, turns_dict in conversation_turns.items():
        sorted_turns = sorted(turns_dict.items(), key=lambda item: item[0])

        # Single-turn conversations are skipped.
        if len(sorted_turns) < 2:
            continue

        turns = [
            {
                'turn_id': turn_no - 1,  # source turn numbers shifted down by one
                'question': turn_data['question'],
                'answer': turn_data['answer'],
                'topic': turn_data['topic'],
                'context': turn_data['context'],
            }
            for turn_no, turn_data in sorted_turns
        ]
        first_turn = sorted_turns[0][1]

        conversations.append({
            'conversation_id': conv_no,
            'turns': turns,
            'topic': first_turn['topic'],
        })

        # Corpus document: first turn's gold passage, else the first
        # non-empty turn context.
        context_text = _passage_text(first_turn['gold_passage'])
        if not context_text:
            for _, turn_data in sorted_turns:
                if turn_data['context']:
                    context_text = _context_text(turn_data['context'])
                    break

        context_text = context_text.strip()
        if len(context_text) > 10:
            corpus[conv_no] = context_text

    print("Corpus:")
    if corpus:
        for doc_id, doc_text in corpus.items():
            print(f"Document ID: {doc_id}")
            print(doc_text[:200])  # Print first 200 characters of each document
            print("-" * 50)
    else:
        print("Corpus is empty.")

    print(f"Loaded {len(conversations)} conversations with {len(corpus)} corpus documents")
    return {'conversations': conversations, 'corpus': corpus}

# Load a small sample: at most 10 conversations from the train split.
data = _load_topicqa_dataset(10)

Loading TopiOCQA dataset...
Corpus:
Document ID: 1
Australian personnel also took part in the invasion of Southern France in August 1944, and RAAF airmen continued to operate against German forces until the end of the war in May 1945. However, the rel
--------------------------------------------------
Document ID: 2
Yttrium is a chemical element with the symbol Y and atomic number 39. It is a silvery-metallic transition metal chemically similar to the lanthanides and has often been classified as a "rare-earth ele
--------------------------------------------------
Document ID: 3
The Pacific swift ("Apus pacificus") is a species of bird that is part of the Swift family. It breeds in eastern Asia. It is strongly migratory, spending the northern hemisphere's winter in Southeast 
--------------------------------------------------
Document ID: 4
PCH is a direct-marketing company that sells merchandise, magazine subscriptions and operates several prize-based websites. While best known for the