# Data Processing for Xaiqo Chatbot

This notebook processes PDF documents into training data for the chatbot model.

In [None]:
# Install required dependencies
%pip install PyPDF2>=3.0.0 tqdm

In [None]:
import os
import json
import PyPDF2
import re
from tqdm import tqdm

# Make sure the input (PDF) and output (processed data) folders exist;
# exist_ok=True makes re-running this cell harmless.
for directory in ('/content/data/pdf_documents', '/content/data/processed_data'):
    os.makedirs(directory, exist_ok=True)

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace removed. Returns "" if the file cannot be
        opened or parsed (the error is printed, not raised).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Collect per-page text and join once instead of growing a string
            # with "+=" in a loop. The `or ""` keeps a page that yields no
            # text (should extract_text() ever return None) from discarding
            # the whole document.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    except Exception as e:
        # Deliberate best-effort: report the failure and return an empty
        # string rather than raising.
        print(f"Error processing {pdf_path}: {e}")
        return ""
    return "\n".join(page_texts).strip()

def chunk_text(text, chunk_size=512, overlap=50):
    """Split text into overlapping chunks.

    Each chunk spans at most ``chunk_size`` characters. When a chunk does not
    reach the end of the text, it is shortened to end just after the last
    '.' inside the window, provided the shortened chunk is still longer than
    ``overlap``. The next chunk starts ``overlap`` characters before the end
    of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of whitespace-stripped, non-empty chunks in document order.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range (such
            values could never make forward progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Try to end at a sentence boundary, but only if the chunk stays
            # longer than the overlap; otherwise the next start would not
            # advance past the current one. (rfind returns -1 when no period
            # is found, which never satisfies this condition.)
            last_period = text.rfind('.', start, end)
            if last_period + 1 - start > overlap:
                end = last_period + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Stop once the end of the text has been emitted. Stepping back by
        # `overlap` here would re-enter the loop with the same window forever.
        if end >= text_len:
            break
        start = end - overlap
    return chunks

def create_training_pairs(chunks):
    """Build input/output training examples from text chunks.

    Every chunk produces an "explain" example whose output is the chunk
    itself. Chunks that split into more than two sentences also produce a
    "summarize" example whose output is the first two sentences followed
    by '...'.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        A list of dicts with 'input' and 'output' keys, in chunk order.
    """
    pairs = []
    for passage in chunks:
        # Direct QA example: the chunk is both the subject and the answer.
        pairs.append({
            'input': f"Please explain this text: {passage}",
            'output': passage
        })

        # Summary example: only for chunks with more than two sentences.
        parts = re.split(r'(?<=[.!?]) +', passage)
        if len(parts) <= 2:
            continue
        lead = ' '.join(parts[:2])
        pairs.append({
            'input': f"Summarize this text: {passage}",
            'output': lead + '...'
        })

    return pairs

In [None]:
def process_pdf_directory(pdf_dir):
    """Process all PDFs in a directory and create training data.

    Args:
        pdf_dir: Directory to scan (non-recursively) for files whose name
            ends in '.pdf', case-insensitively.

    Returns:
        A list with the training pairs of every PDF that yielded text,
        concatenated in sorted filename order. PDFs from which no text
        could be extracted are skipped.
    """
    # os.listdir returns entries in arbitrary order; sort so the generated
    # training data is reproducible across runs and machines.
    pdf_files = sorted(
        f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf')
    )

    all_training_pairs = []
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        text = extract_text_from_pdf(os.path.join(pdf_dir, pdf_file))
        if not text:
            # Nothing extracted (unreadable or text-free PDF): skip it.
            continue
        all_training_pairs.extend(create_training_pairs(chunk_text(text)))

    return all_training_pairs

# Build the training set from the PDF folder, then write it out as
# indented JSON and report how many pairs were produced.
output_path = '/content/data/processed_data/training_data.json'
training_pairs = process_pdf_directory('/content/data/pdf_documents')

with open(output_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(training_pairs, indent=2))

print(f"Created {len(training_pairs)} training pairs")