# Xaiqo Chatbot Training on PDF Documents

This notebook trains the Xaiqo chatbot model on PDF documents uploaded to the `/content/data/pdf_documents` directory. It extracts text from the PDFs, converts the text into question–answer training examples, and fine-tunes a GPT-2 model on them.

In [None]:
# Install required dependencies
# Every package is pinned to an exact version so a fresh runtime installs the
# same libraries each time; %pip (rather than !pip) targets the environment of
# the running kernel.
%pip install transformers==4.30.2 torch==2.0.1 PyPDF2==3.0.1 datasets==2.13.1

In [None]:
# Check if GPU is available
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU device: {gpu_name}")

In [None]:
# Mount Google Drive to save the trained model
# (the training cell writes its output under /content/drive/MyDrive/xaiqo_models)
from google.colab import drive
drive.mount('/content/drive')

# Create directories for data
#   pdf_documents  - PDFs saved by the upload cell
#   processed_data - train.json written by the data-preparation cell
#   extracted_text - one .txt file per PDF written by the extraction cell
!mkdir -p /content/data/pdf_documents
!mkdir -p /content/data/processed_data
!mkdir -p /content/data/extracted_text

In [None]:
from google.colab import files
import os

# Destination for uploaded PDFs. It is created here so this cell also works
# when the directory-setup cell has not been run in the current runtime.
pdf_upload_dir = '/content/data/pdf_documents'
os.makedirs(pdf_upload_dir, exist_ok=True)

print("Upload your PDF files")
uploaded = files.upload()

# Names of uploads that were not stored, reported once the loop finishes
skipped_uploads = []

for filename, content in uploaded.items():
    if not filename.lower().endswith('.pdf'):
        skipped_uploads.append(filename)
        continue

    # Keep only the final path component so an unusual upload name cannot
    # place the file outside the PDF directory.
    safe_name = os.path.basename(filename)
    destination = os.path.join(pdf_upload_dir, safe_name)
    with open(destination, 'wb') as f:
        f.write(content)
    print(f"Saved {safe_name} to {pdf_upload_dir}/")

if skipped_uploads:
    print(f"Skipped non-PDF files: {', '.join(skipped_uploads)}")


In [None]:
# PDF Text Extraction
import os
import PyPDF2
import re
import json

def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF file as one whitespace-normalized string.

    Pages are read in order with PyPDF2. A page whose text cannot be
    extracted (extract_text() raises, or returns nothing) is skipped with a
    printed message instead of discarding the text of the whole document.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all readable pages, with every run of whitespace
        collapsed to a single space and leading/trailing whitespace removed.
        Returns "" when the file cannot be opened or parsed, or when no page
        yields any text.
    """
    page_texts = []

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            for page_num in range(len(pdf_reader.pages)):
                try:
                    page_text = pdf_reader.pages[page_num].extract_text()
                except Exception as page_error:
                    # One unreadable page should not cost us the whole file
                    print(f"Error extracting page {page_num + 1} of {pdf_path}: {page_error}")
                    continue

                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

    # Pages are joined with a blank line; the substitution below then
    # collapses every whitespace run (including those separators) to one space.
    text = "\n\n".join(page_texts)
    return re.sub(r'\s+', ' ', text).strip()

def process_all_pdfs(pdf_dir, output_dir):
    """Extract text from every PDF in a directory and save one .txt file per PDF.

    Args:
        pdf_dir: Directory scanned (non-recursively) for files whose name
            ends in ".pdf", compared case-insensitively.
        output_dir: Directory that receives the text files; created if it
            does not exist.

    Returns:
        A list holding the extracted text of each PDF that produced text, in
        sorted filename order. Empty when pdf_dir contains no PDF files.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the order of the returned texts does not depend on the
    # arbitrary order in which os.listdir reports entries.
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return []

    extracted_texts = []

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)

        # Swap only the final extension, whatever its letter case, so that
        # "REPORT.PDF" becomes "REPORT.txt" and inner ".pdf" fragments of a
        # name are left alone.
        stem = os.path.splitext(pdf_file)[0]
        output_file = os.path.join(output_dir, stem + '.txt')

        # Extract text from PDF
        text = extract_text_from_pdf(pdf_path)

        if not text:
            print(f"Failed to process {pdf_file}")
            continue

        # Save to text file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)

        extracted_texts.append(text)
        print(f"Processed {pdf_file} -> {output_file}")

    return extracted_texts

# Process all PDFs in the directory
# pdf_dir, output_dir and extracted_texts are notebook-level variables:
# extracted_texts is read by the data-preparation cell, and output_dir is
# reassigned to the model directory by the training cell.
pdf_dir = '/content/data/pdf_documents'
output_dir = '/content/data/extracted_text'
extracted_texts = process_all_pdfs(pdf_dir, output_dir)

print(f"Extracted text from {len(extracted_texts)} PDF documents")

In [None]:
# Prepare training data in the format expected by the model
import json
import os
import re

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks, preferring to cut at sentence ends.

    Each chunk holds at most chunk_size characters. When a chunk does not
    reach the end of the text, its end is moved back to just after the last
    ". " found in its final 100 characters, if there is one. The following
    chunk starts `overlap` characters before the end of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must not be negative. If honouring the overlap would not move
            the start forward (overlap too large for the chunk just
            produced), the next chunk starts where the previous one ended.

    Returns:
        A list of chunk strings in document order; empty for empty text.

    Raises:
        ValueError: If chunk_size is not positive or overlap is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    chunks = []
    text_length = len(text)
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)

        # Try to end at a sentence boundary
        if end < text_length:
            # Look for sentence end within the last 100 characters
            search_end = max(end - 100, start)
            sentence_end = text.rfind('. ', search_end, end)
            if sentence_end > search_end:
                end = sentence_end + 1

        chunks.append(text[start:end])

        # The chunk that reaches the end of the text is the last one.
        # Stepping back by `overlap` here would re-enter the loop with the
        # same start forever.
        if end >= text_length:
            break

        next_start = end - overlap
        if next_start <= start:
            # Guarantee forward progress even when the overlap is at least
            # as long as the chunk that was just emitted.
            next_start = end
        start = next_start

    return chunks

def create_qa_pairs(chunks):
    """Build question/answer training dicts from a sequence of text chunks.

    Every chunk produces one generic pair whose question quotes the first
    100 characters of the chunk and whose answer is the whole chunk. A chunk
    that splits into more than three sentences produces a second pair whose
    answer is the first three sentences followed by "...".

    Args:
        chunks: Iterable of text strings.

    Returns:
        A list of dicts, each with a 'user' (question) and a 'bot' (answer)
        entry, in chunk order.
    """
    pairs = []
    # A sentence ends at '.', '!' or '?' followed by one or more spaces
    sentence_boundary = re.compile(r'(?<=[.!?]) +')

    for passage in chunks:
        preview = passage[:100]
        pairs.append({
            'user': f"What information can you provide about this text: {preview}...",
            'bot': passage,
        })

        # Deliberately simple heuristic: the leading sentences stand in for
        # a summary of the passage.
        parts = sentence_boundary.split(passage)
        if len(parts) <= 3:
            continue

        lead = ' '.join(parts[:3])
        pairs.append({
            'user': "Can you summarize this information?",
            'bot': f"Here's a summary: {lead}...",
        })

    return pairs

# Process all extracted texts
# Reads extracted_texts, the notebook-level list produced by the PDF
# extraction cell, and accumulates every generated pair in all_qa_pairs.
all_qa_pairs = []

for text in extracted_texts:
    chunks = chunk_text(text)
    qa_pairs = create_qa_pairs(chunks)
    all_qa_pairs.extend(qa_pairs)

# Save the training data
# The training cell loads its dataset from this same path.
training_data_path = '/content/data/processed_data/train.json'
os.makedirs(os.path.dirname(training_data_path), exist_ok=True)

with open(training_data_path, 'w', encoding='utf-8') as f:
    json.dump(all_qa_pairs, f, indent=2)

print(f"Created {len(all_qa_pairs)} training examples from PDF content")
print(f"Training data saved to {training_data_path}")

In [None]:
# Train the model on the processed data
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    AdamW, 
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer
)
import json
import logging

# Set up logging
# Records at INFO level and above are formatted as
# "<timestamp> - <level> - <logger name> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
# Module-level logger; ConversationDataset uses it to warn about a missing data file
logger = logging.getLogger(__name__)

class ConversationDataset(Dataset):
    """Dataset for training the chatbot on conversation data.

    The JSON file at data_path must hold a list whose entries are either

    * a list of (user message, bot message) pairs, or
    * a dict with 'user' and 'bot' keys.

    Entries of any other shape are ignored. Each entry becomes one training
    string of the form

        <bos>User: ...<sep>Bot: ...<sep>[more turns]<eos>

    The end-of-sequence token is appended so the model sees the token that
    generation later uses as its stop signal.

    Args:
        data_path: Path of the JSON training file. If it does not exist a
            warning is logged and the dataset is empty.
        tokenizer: Tokenizer providing bos_token / sep_token / eos_token and
            callable with (text, truncation, max_length, padding).
        max_length: Length every example is truncated or padded to.
    """

    # Label value that marks positions to be left out of the loss
    IGNORE_INDEX = -100

    def __init__(self, data_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []

        if not os.path.exists(data_path):
            logger.warning(f"Data file {data_path} not found. Using empty dataset.")
            return

        # Load data from file
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Process conversations
        for conversation in data:
            if isinstance(conversation, list):
                # Format: list of message pairs
                turns = conversation
            elif isinstance(conversation, dict) and 'user' in conversation and 'bot' in conversation:
                # Format: dict with 'user' and 'bot' keys
                turns = [(conversation['user'], conversation['bot'])]
            else:
                continue

            self.examples.append(self._format_turns(turns))

    def _format_turns(self, turns):
        """Render (user, bot) message pairs as a single training string."""
        formatted_text = self.tokenizer.bos_token
        for user_msg, bot_msg in turns:
            formatted_text += f"User: {user_msg}{self.tokenizer.sep_token}"
            formatted_text += f"Bot: {bot_msg}{self.tokenizer.sep_token}"

        eos_token = getattr(self.tokenizer, 'eos_token', None)
        if eos_token:
            formatted_text += eos_token
        return formatted_text

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example idx and return a dict of tensors including 'labels'."""
        text = self.examples[idx]
        encodings = self.tokenizer(text, truncation=True, max_length=self.max_length, padding="max_length")

        input_ids = encodings["input_ids"]
        if "attention_mask" in encodings:
            keep_flags = encodings["attention_mask"]
        else:
            # No mask returned: treat every pad-token position as padding
            pad_id = self.tokenizer.pad_token_id
            keep_flags = [int(token_id != pad_id) for token_id in input_ids]

        # Labels mirror input_ids for language modeling, except on padding,
        # which is set to IGNORE_INDEX so the loss is not computed on the
        # filler that makes up most of a short example.
        encodings["labels"] = [
            token_id if keep else self.IGNORE_INDEX
            for token_id, keep in zip(input_ids, keep_flags)
        ]

        # Convert to tensors
        item = {key: torch.tensor(val) for key, val in encodings.items()}
        return item

# Define training parameters
# Everything this cell produces (checkpoints, logs, final model) is written
# below this Google Drive directory.
output_dir = '/content/drive/MyDrive/xaiqo_models'
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add special tokens
# The embedding matrix is resized afterwards so the newly added tokens get
# their own rows.
special_tokens = {
    'pad_token': '<PAD>',
    'bos_token': '<BOS>',
    'eos_token': '<EOS>',
    'sep_token': '<SEP>'
}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

# Load your dataset
full_dataset = ConversationDataset('/content/data/processed_data/train.json', tokenizer)

# Fail early with a clear message instead of an obscure error from the trainer
if len(full_dataset) == 0:
    raise ValueError(
        "No training examples found in /content/data/processed_data/train.json; "
        "run the PDF extraction and data preparation cells first."
    )

# Split into train and validation if dataset is large enough
if len(full_dataset) > 10:  # Only split if we have enough examples
    train_size = int(0.9 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    # Fixed seed so a re-run produces the same train/validation split
    split_generator = torch.Generator().manual_seed(42)
    train_dataset, val_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, val_size], generator=split_generator
    )
else:
    train_dataset = full_dataset
    val_dataset = None

# The evaluation settings are chosen before TrainingArguments is built, so
# evaluation and best-model reloading are switched off together when there
# is no validation set (rather than editing the arguments after creation).
has_validation = val_dataset is not None

# NOTE(review): warmup_steps=500 can exceed the total number of optimizer
# steps on a small dataset - confirm this is intended.
training_args = TrainingArguments(
    output_dir=f'{output_dir}/results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{output_dir}/logs',
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    evaluation_strategy="steps" if has_validation else "no",
    eval_steps=500 if has_validation else None,
    load_best_model_at_end=has_validation,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the model
# The tokenizer is stored next to the model so the added special tokens are
# available when the model is loaded again.
model_save_path = f'{output_dir}/final_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model successfully trained and saved to {model_save_path}")

In [None]:
# Test the trained model
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

class ImprovedChatbot:
    """Minimal question-answering wrapper around a saved GPT-2 model, for testing.

    Args:
        model_path: Directory holding the saved model and tokenizer.
    """

    def __init__(self, model_path):
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        print(f"Using device: {self.device}")

        # Load tokenizer and model, then switch the model to inference mode
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
        self.model = loaded_model.to(self.device)
        self.model.eval()

    def answer_question(self, question, max_length=100):
        """Generate a reply to a question.

        Args:
            question: The user's question text.
            max_length: Number of tokens the output may extend beyond the
                prompt.

        Returns:
            The decoded continuation with special tokens removed, stripped,
            and cut off before the first "User:" marker if one appears.
        """
        tok = self.tokenizer

        # Prompt layout: <bos>User: question<sep>Bot:
        prompt = f"{tok.bos_token}User: {question}{tok.sep_token}Bot:"
        prompt_ids = tok.encode(prompt, return_tensors='pt').to(self.device)
        prompt_len = prompt_ids.shape[1]

        sampling_options = dict(
            max_length=prompt_len + max_length,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2,
            num_return_sequences=1,
        )

        with torch.no_grad():
            generated = self.model.generate(prompt_ids, **sampling_options)

        # Decode only the tokens produced after the prompt
        continuation = generated[0][prompt_len:]
        reply = tok.decode(continuation, skip_special_tokens=True).strip()

        # Drop anything from a following "User:" turn onwards
        head, marker, _ = reply.partition("User:")
        if marker:
            reply = head.strip()

        return reply

# Initialize chatbot with the trained model
# This is the directory the training cell saves the final model and tokenizer to.
model_path = '/content/drive/MyDrive/xaiqo_models/final_model'
chatbot = ImprovedChatbot(model_path)

# Sample questions used as a quick manual check of the fine-tuned model
test_questions = [
    "What information can you provide based on the documents you were trained on?",
    "Can you summarize the key points from the documents?",
]

# Responses are sampled (do_sample=True in answer_question), so the printed
# text can differ between runs.
for question in test_questions:
    print(f"\nQuestion: {question}")
    response = chatbot.answer_question(question)
    print(f"Response: {response}")

## Conclusion

This notebook has demonstrated how to:

1. Extract text from PDF documents
2. Process the extracted text into a format suitable for training
3. Fine-tune a GPT-2 model on the processed data
4. Test the trained model with sample questions
