In [1]:
import boto3
import tabula
import faiss
import json
import base64
import pymupdf
import requests
import os
import logging
import numpy as np
import warnings
import pandas as pd
import cv2
import pytesseract
from sentence_transformers import SentenceTransformer
from PIL import Image
from tqdm import tqdm
from botocore.exceptions import ClientError
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_aws import ChatBedrock
from typing import List, Dict, Optional, Tuple
import re
from pathlib import Path
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
# Point JAVA_HOME at the local JDK before any Java-backed library is used
# (presumably required by tabula - confirm for your environment).
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-24'  # Machine-specific path: change it to your own JDK install directory

In [3]:
# Configure logging: INFO level on the root logger, plus a module-level logger
# that the processor methods below use for warnings and errors.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# NOTE: this silences every Python warning for the whole process.
warnings.filterwarnings("ignore")

class FinancialPDFProcessor:
    """
    A comprehensive PDF processor specialized for financial documents with multi-modal RAG capabilities.
    Handles both text-based and image-based tables with high precision.
    """
    
    # def __init__(self, embedding_dimension: int = 384, chunk_size: int = 700, chunk_overlap: int = 200):
    #     self.embedding_dimension = embedding_dimension
    #     self.text_splitter = RecursiveCharacterTextSplitter(
    #         chunk_size=chunk_size, 
    #         chunk_overlap=chunk_overlap, 
    #         length_function=len
    #     )
    #     self.bedrock_client = boto3.client(service_name="bedrock-runtime", region_name='us-east-1')
    #     self.model_id = "amazon.titan-embed-image-v1"
    #     self.llm_model_id = "amazon.nova-pro-v1:0"
    #     self.index = None
    #     self.items = []
    #     self.current_pdf_info = {}
    
    def __init__(self, embedding_dimension: int = 384, chunk_size: int = 700, chunk_overlap: int = 200):
        """Create the text splitter, load the local models and reset document state.

        Args:
            embedding_dimension: Vector size later handed to the FAISS index
                (the default presumably matches 'all-MiniLM-L6-v2' - confirm).
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Number of characters shared by neighbouring chunks.
        """
        self.embedding_dimension = embedding_dimension

        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "length_function": len,
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

        # Locally hosted models used in place of the earlier AWS clients.
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.generator = pipeline("text2text-generation", model="google/flan-t5-base")
        self.qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

        # Per-document state, populated by process_pdf().
        self.index = None
        self.items = []
        self.current_pdf_info = {}
        
    def select_pdf_source(self) -> Tuple[str, str]:
        """Ask the user where the PDF should come from and fetch/select it.

        Returns:
            A ``(filepath, source_type)`` tuple as produced by the chosen helper.
            Any answer other than 1-3 falls back to local file selection.
        """
        menu = (
            "\n=== PDF Source Selection ===",
            "1. Download from URL",
            "2. Use local file",
            "3. Select from common financial document URLs",
        )
        for menu_line in menu:
            print(menu_line)

        answer = input("Choose option (1-3): ").strip()

        # Dispatch table: menu key -> bound helper.
        handlers = {
            "1": self._download_from_url,
            "2": self._select_local_file,
            "3": self._select_from_common_urls,
        }
        handler = handlers.get(answer)
        if handler is None:
            print("Invalid choice. Using local file selection.")
            handler = self._select_local_file
        return handler()
    
    def _download_from_url(self) -> Tuple[str, str]:
        """Prompt for a URL and a target filename, then download into ``data/``.

        The actual transfer (and the fallback to local file selection when it
        fails) is delegated to ``_download_specific_url`` so both download
        paths behave the same way.

        Returns:
            ``(filepath, "url")`` on success, otherwise whatever
            ``_select_local_file`` returns.
        """
        url = input("Enter PDF URL: ").strip()
        filename = input("Enter filename (with .pdf extension): ").strip()

        # Keep only the last path component so that input such as
        # "../../x.pdf" cannot place the file outside the data directory.
        filename = os.path.basename(filename.replace("\\", "/"))
        if not filename:
            # An empty answer would otherwise produce a file literally named ".pdf".
            filename = "download"
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'

        filepath = os.path.join("data", filename)
        return self._download_specific_url(url, filepath)
    
    def _select_local_file(self) -> Tuple[str, str]:
        """Ask for a local PDF path, offering PDFs from the current directory as a fallback.

        Returns:
            ``(filepath, "local")``.

        Raises:
            FileNotFoundError: If the entered path is not a file and either no
                PDF exists in the current directory or the follow-up selection
                is not a valid number from the displayed list.
        """
        filepath = input("Enter path to local PDF file: ").strip()

        # isfile (not exists) so that a directory name is not accepted as a PDF.
        if not os.path.isfile(filepath):
            print(f"✗ File not found: {filepath}")
            # List available PDFs in current directory (case-insensitive, stable order)
            pdf_files = sorted(f for f in os.listdir('.') if f.lower().endswith('.pdf'))
            if not pdf_files:
                raise FileNotFoundError("No PDF files found")

            print("\nAvailable PDF files in current directory:")
            for i, pdf in enumerate(pdf_files, 1):
                print(f"{i}. {pdf}")

            try:
                choice = int(input("Select file number: "))
            except ValueError:
                raise FileNotFoundError("No valid PDF file selected") from None

            # Explicit range check: without it, entering 0 would become index -1
            # and silently pick the last file in the list.
            if not 1 <= choice <= len(pdf_files):
                raise FileNotFoundError("No valid PDF file selected")
            filepath = pdf_files[choice - 1]

        print(f"✓ Selected: {filepath}")
        return filepath, "local"
    
    def _select_from_common_urls(self) -> Tuple[str, str]:
        """Offer a short list of predefined document URLs and download the chosen one.

        Option "3" and any unrecognised answer hand over to ``_download_from_url``.

        Returns:
            The ``(filepath, source_type)`` tuple of the helper that ran.
        """
        # NOTE(review): entry "1" points at an .htm page and entry "2" is marked
        # as a placeholder, yet both are saved with a .pdf name - confirm.
        common_urls = {
            "1": ("Sample Annual Report", "https://www.sec.gov/Archives/edgar/data/320193/000032019323000077/aapl-20230930.htm"),
            "2": ("Financial Statement Sample", "https://arxiv.org/pdf/1706.03762.pdf"),
            "3": ("Custom URL", ""),
        }

        print("\nCommon Financial Documents:")
        for key, entry in common_urls.items():
            print(f"{key}. {entry[0]}")

        choice = input("Select option: ").strip()

        # Custom URL requested, or the answer is not one of the listed keys.
        if choice == "3" or choice not in common_urls:
            return self._download_from_url()

        name, url = common_urls[choice]
        filename = f"{name.lower().replace(' ', '_')}.pdf"
        filepath = os.path.join("data", filename)
        return self._download_specific_url(url, filepath)
    
    def _download_specific_url(self, url: str, filepath: str) -> Tuple[str, str]:
        """Download ``url`` to ``filepath``; fall back to local selection on failure.

        The body is streamed into a temporary ``.part`` file and only moved to
        ``filepath`` once the transfer completed, so a failed download never
        leaves a truncated PDF (or overwrites an existing good one).

        Args:
            url: Address to fetch.
            filepath: Destination path; its parent directory is created if needed.

        Returns:
            ``(filepath, "url")`` on success, otherwise the result of
            ``_select_local_file``.
        """
        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
        partial_path = filepath + ".part"

        try:
            # The context manager releases the streamed connection in every case.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()

                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            file.write(chunk)

            os.replace(partial_path, filepath)
        except Exception as e:
            # Deliberately broad: any failure here leads to the interactive fallback.
            print(f"✗ Download failed: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                pass  # nothing was written, or it is already gone
            return self._select_local_file()

        print(f"✓ Downloaded: {filepath}")
        return filepath, "url"
    
    def create_directories(self, base_dir: str):
        """Create necessary directories for processing"""
        directories = ["images", "text", "tables", "page_images", "extracted_tables"]
        for directory in directories:
            os.makedirs(os.path.join(base_dir, directory), exist_ok=True)
    
    def extract_table_from_image(self, image_path: str, page_num: int) -> Optional[str]:
        """OCR an image file and return the lines that look like table rows.

        Args:
            image_path: Path of the image to read with OpenCV.
            page_num: Page the image came from (not used by the extraction itself).

        Returns:
            The matching lines joined by newlines; the raw OCR text when no
            line matches; ``None`` when the image cannot be loaded, the OCR
            output is blank, or an error occurs (the error is logged).
        """
        try:
            image = cv2.imread(image_path)
            if image is None:
                return None

            # Grayscale -> Otsu binarisation -> morphological opening.
            grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            binary = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
            structuring_element = np.ones((1, 1), np.uint8)
            cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, structuring_element, iterations=1)

            # Tesseract settings: restricted character set for tabular content.
            ocr_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,()$%-+ '
            ocr_text = pytesseract.image_to_string(cleaned, config=ocr_config)

            # Keep non-empty lines that match the numeric/currency pattern.
            stripped_lines = (raw.strip() for raw in ocr_text.split('\n'))
            numeric_rows = [
                row for row in stripped_lines
                if row and re.search(r'[\d,]+\.?\d*[%$]?|^\s*\d', row)
            ]

            if numeric_rows:
                return '\n'.join(numeric_rows)
            if ocr_text.strip():
                return ocr_text
            return None

        except Exception as e:
            logger.error(f"Error extracting table from image {image_path}: {e}")
            return None
    
    def process_tables_advanced(self, doc, page_num: int, base_dir: str, items: List[Dict]):
        """Extract tables from one page with tabula and with OCR on embedded images.

        Every table found is written to a text file under ``base_dir`` and
        appended to ``items`` as a ``"table"`` entry. Failures of either
        method are logged and do not stop processing.

        Args:
            doc: Open PyMuPDF document.
            page_num: Zero-based page index into ``doc``.
            base_dir: Output root containing the ``tables``, ``images`` and
                ``extracted_tables`` folders.
            items: List that receives the extracted entries (mutated in place).
        """
        filepath = self.current_pdf_info['filepath']
        pdf_name = os.path.basename(filepath)

        # Method 1: Tabula extraction (for text-based tables).
        # tabula is given page_num + 1 (presumably 1-based page numbers).
        try:
            tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
            if tables:
                for table_idx, table in enumerate(tables):
                    table_text = self._format_financial_table(table)
                    table_file_name = f"{base_dir}/tables/{pdf_name}_table_{page_num}_{table_idx}.txt"

                    with open(table_file_name, 'w', encoding='utf-8') as f:
                        f.write(table_text)

                    items.append({
                        "page": page_num,
                        "type": "table",
                        "text": table_text,
                        "path": table_file_name,
                        "extraction_method": "tabula"
                    })
        except Exception as e:
            logger.warning(f"Tabula extraction failed for page {page_num}: {e}")

        # Method 2: Image-based table extraction
        page = doc[page_num]
        images = page.get_images()

        for idx, image in enumerate(images):
            try:
                xref = image[0]
                pix = pymupdf.Pixmap(doc, xref)
                # PNG cannot store CMYK data: convert such pixmaps to RGB first,
                # otherwise saving fails and the image is skipped.
                if pix.n - pix.alpha > 3:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                image_path = f"{base_dir}/images/{pdf_name}_image_{page_num}_{idx}_{xref}.png"
                pix.save(image_path)

                # Try to extract table data from this image
                table_text = self.extract_table_from_image(image_path, page_num)

                if table_text and self._is_likely_financial_table(table_text):
                    table_file_name = f"{base_dir}/extracted_tables/{pdf_name}_img_table_{page_num}_{idx}.txt"
                    with open(table_file_name, 'w', encoding='utf-8') as f:
                        f.write(table_text)

                    items.append({
                        "page": page_num,
                        "type": "table",
                        "text": table_text,
                        "path": table_file_name,
                        "extraction_method": "ocr",
                        "source_image": image_path
                    })

            except Exception as e:
                logger.error(f"Error processing image {idx} on page {page_num}: {e}")
    
    def _format_financial_table(self, table: pd.DataFrame) -> str:
        """Format pandas DataFrame as financial table with proper alignment"""
        try:
            # Clean and format the dataframe
            table = table.fillna('')
            
            # Convert to string with proper formatting
            formatted_rows = []
            
            # Add headers
            headers = [str(col) for col in table.columns]
            formatted_rows.append(" | ".join(headers))
            formatted_rows.append("-" * len(" | ".join(headers)))
            
            # Add data rows
            for _, row in table.iterrows():
                formatted_row = []
                for value in row:
                    if pd.isna(value):
                        formatted_row.append("")
                    elif isinstance(value, (int, float)):
                        # Format numbers properly
                        if abs(value) >= 1000000:
                            formatted_row.append(f"{value:,.0f}")
                        elif abs(value) >= 1000:
                            formatted_row.append(f"{value:,.2f}")
                        else:
                            formatted_row.append(f"{value}")
                    else:
                        formatted_row.append(str(value))
                formatted_rows.append(" | ".join(formatted_row))
            
            return "\n".join(formatted_rows)
        except Exception as e:
            logger.error(f"Error formatting table: {e}")
            return str(table)
    
    def _is_likely_financial_table(self, text: str) -> bool:
        """Determine if extracted text likely contains financial table data"""
        financial_keywords = [
            'revenue', 'income', 'profit', 'loss', 'assets', 'liabilities',
            'equity', 'cash', 'flow', 'balance', 'statement', 'fiscal',
            'quarter', 'annual', 'million', 'billion', 'thousand', '$',
            'expenses', 'costs', 'net', 'gross', 'total', 'year', 'ytd'
        ]
        
        text_lower = text.lower()
        
        # Check for financial keywords
        keyword_score = sum(1 for keyword in financial_keywords if keyword in text_lower)
        
        # Check for numerical patterns typical in financial data
        number_patterns = len(re.findall(r'\$?[\d,]+\.?\d*[MBK]?', text))
        percentage_patterns = len(re.findall(r'\d+\.?\d*%', text))
        
        return keyword_score >= 2 or number_patterns >= 3 or percentage_patterns >= 1
    
    def process_text_chunks(self, text: str, page_num: int, base_dir: str, items: List[Dict]):
        """Process text chunks with financial context awareness"""
        chunks = self.text_splitter.split_text(text)
        for i, chunk in enumerate(chunks):
            text_file_name = f"{base_dir}/text/{os.path.basename(self.current_pdf_info['filepath'])}_text_{page_num}_{i}.txt"
            
            with open(text_file_name, 'w', encoding='utf-8') as f:
                f.write(chunk)
            
            # Enhance chunk with financial context
            chunk_type = "financial_text" if self._is_likely_financial_table(chunk) else "text"
            
            items.append({
                "page": page_num,
                "type": chunk_type,
                "text": chunk,
                "path": text_file_name
            })
    
    def process_images(self, page, page_num: int, base_dir: str, items: List[Dict]):
        """Process images with base64 encoding"""
        doc = page.parent
        images = page.get_images()
        
        for idx, image in enumerate(images):
            try:
                xref = image[0]
                pix = pymupdf.Pixmap(doc, xref)
                image_name = f"{base_dir}/images/{os.path.basename(self.current_pdf_info['filepath'])}_image_{page_num}_{idx}_{xref}.png"
                pix.save(image_name)
                
                with open(image_name, 'rb') as f:
                    encoded_image = base64.b64encode(f.read()).decode('utf8')
                
                items.append({
                    "page": page_num,
                    "type": "image",
                    "path": image_name,
                    "image": encoded_image
                })
                
            except Exception as e:
                logger.error(f"Error processing image {idx} on page {page_num}: {e}")
    
    def process_page_images(self, page, page_num: int, base_dir: str, items: List[Dict]):
        """Process full page images"""
        try:
            pix = page.get_pixmap()
            page_path = os.path.join(base_dir, f"page_images/page_{page_num:03d}.png")
            pix.save(page_path)
            
            with open(page_path, 'rb') as f:
                page_image = base64.b64encode(f.read()).decode('utf8')
            
            items.append({
                "page": page_num,
                "type": "page",
                "path": page_path,
                "image": page_image
            })
            
        except Exception as e:
            logger.error(f"Error processing page image {page_num}: {e}")
    
    # def generate_multimodal_embeddings(self, prompt: str = None, image: str = None) -> Optional[List[float]]:
    #     """Generate embeddings using Amazon Titan"""
    #     if not prompt and not image:
    #         raise ValueError("Please provide either text prompt, base64 image, or both")
        
    #     body = {"embeddingConfig": {"outputEmbeddingLength": self.embedding_dimension}}
        
    #     if prompt:
    #         body["inputText"] = prompt
    #     if image:
    #         body["inputImage"] = image
        
    #     try:
    #         response = self.bedrock_client.invoke_model(
    #             modelId=self.model_id,
    #             body=json.dumps(body),
    #             accept="application/json",
    #             contentType="application/json"
    #         )
            
    #         result = json.loads(response.get("body").read())
    #         return result.get("embedding")
            
    #     except ClientError as err:
    #         logger.error(f"Couldn't invoke Titan embedding model: {err.response['Error']['Message']}")
    #         return None
    
    def generate_multimodal_embeddings(self, prompt: str = None, image: str = None) -> Optional[List[float]]:
        if prompt:
            return self.embedding_model.encode(prompt).tolist()
        return None  # Skip image embeddings for now
    
    def process_pdf(self, filepath: str) -> Dict:
        """Extract tables, text and images from a PDF, embed them and build the index.

        Output files are written below ``data_<pdf stem>`` in the current
        working directory.

        Args:
            filepath: Path of the PDF to process.

        Returns:
            Dict with ``filepath``, ``items_count``, ``pages`` and ``base_dir``.
        """
        self.current_pdf_info = {
            'filepath': filepath,
            'filename': os.path.basename(filepath)
        }

        print(f"\n=== Processing PDF: {self.current_pdf_info['filename']} ===")

        # Drop the index of any previously processed PDF up front, so a failure
        # half-way cannot leave an old index paired with the new item list.
        self.index = None

        doc = pymupdf.open(filepath)
        try:
            num_pages = len(doc)
            base_dir = f"data_{Path(filepath).stem}"

            self.create_directories(base_dir)
            self.items = []

            # Process each page
            for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
                page = doc[page_num]
                text = page.get_text()

                # Enhanced table processing
                self.process_tables_advanced(doc, page_num, base_dir, self.items)

                # Process text chunks
                self.process_text_chunks(text, page_num, base_dir, self.items)

                # Process images
                self.process_images(page, page_num, base_dir, self.items)

                # Process page images
                self.process_page_images(page, page_num, base_dir, self.items)
        finally:
            # Release the document handle even when a page raises.
            doc.close()

        # Generate embeddings
        self._generate_all_embeddings()

        # Create FAISS index
        self._create_faiss_index()

        print(f"✓ Processed {len(self.items)} items from {num_pages} pages")
        self._print_processing_summary()

        return {
            'filepath': filepath,
            'items_count': len(self.items),
            'pages': num_pages,
            'base_dir': base_dir
        }
    
    def _generate_all_embeddings(self):
        """Attach an ``'embedding'`` to every item while showing a progress bar.

        Text-like items (``text``, ``financial_text``, ``table``) are embedded
        from their ``'text'``; all other items are passed on via their
        ``'image'`` field.
        """
        print("\n=== Generating Embeddings ===")

        # Totals per display bucket; 'financial_text' is counted as 'text'.
        item_counts = {'text': 0, 'table': 0, 'image': 0, 'page': 0}
        for entry in self.items:
            kind = entry['type']
            if kind in ('text', 'financial_text'):
                item_counts['text'] += 1
            elif kind in item_counts:
                item_counts[kind] += 1

        counters = {bucket: 0 for bucket in item_counts}

        with tqdm(total=len(self.items), desc="Generating embeddings") as pbar:
            for entry in self.items:
                kind = entry['type']

                if kind in ['text', 'table', 'financial_text']:
                    embedding = self.generate_multimodal_embeddings(prompt=entry['text'])
                    bucket = 'table' if kind == 'table' else 'text'
                else:
                    embedding = self.generate_multimodal_embeddings(image=entry['image'])
                    bucket = kind
                counters[bucket] += 1

                entry['embedding'] = embedding

                status = (
                    f"Text: {counters['text']}/{item_counts['text']}, "
                    f"Table: {counters['table']}/{item_counts['table']}, "
                    f"Image: {counters['image']}/{item_counts['image']}"
                )
                pbar.set_postfix_str(status)
                pbar.update(1)
    
    def _create_faiss_index(self):
        """Build ``self.index`` (FAISS flat L2) from all items that have an embedding.

        Items without an embedding (currently images and page renders) are
        skipped, so row ``i`` of the index corresponds to the ``i``-th item
        that does have one, in ``self.items`` order.

        Raises:
            ValueError: If the collected embeddings do not form a 2-D array.
        """
        vectors = [item['embedding'] for item in self.items if item.get('embedding')]

        if vectors:
            matrix = np.asarray(vectors, dtype=np.float32)
            if matrix.ndim != 2:
                raise ValueError("Embeddings must all have the same length")
            actual_dimension = matrix.shape[1]
            if actual_dimension != self.embedding_dimension:
                # The model decides the vector size; follow it, because a
                # mismatching index dimension cannot accept the vectors.
                logger.warning(
                    f"Embedding dimension is {actual_dimension}, not the configured "
                    f"{self.embedding_dimension}; using {actual_dimension}"
                )
                self.embedding_dimension = actual_dimension
        else:
            # Nothing to index (e.g. a scanned PDF without extractable text).
            matrix = np.empty((0, self.embedding_dimension), dtype=np.float32)

        self.index = faiss.IndexFlatL2(self.embedding_dimension)
        if len(matrix):
            self.index.add(matrix)

        print(f"✓ Created FAISS index with {self.index.ntotal} embeddings")
    
    def _print_processing_summary(self):
        """Print summary of processed items"""
        summary = {}
        for item in self.items:
            item_type = item['type']
            summary[item_type] = summary.get(item_type, 0) + 1
        
        print("\n=== Processing Summary ===")
        for item_type, count in summary.items():
            print(f"{item_type.capitalize()}: {count}")
    
    def query_documents(self, query: str, k: int = 15) -> str:
        """Query the processed documents using RAG"""
        if not self.index or not self.items:
            return "No documents processed yet. Please process a PDF first."
        
        print(f"\n=== Querying: {query} ===")
        
        # Generate query embedding
        query_embedding = self.generate_multimodal_embeddings(prompt=query)
        if not query_embedding:
            return "Failed to generate query embedding."
        
        # Search for similar items
        distances, result = self.index.search(
            np.array(query_embedding, dtype=np.float32).reshape(1, -1), k=k
        )
        
        # Get matched items
        matched_items = []
        for idx in result.flatten():
            if idx < len(self.items):
                item = {k: v for k, v in self.items[idx].items() if k != 'embedding'}
                matched_items.append(item)
        
        # Generate response using Nova
        response = self._invoke_nova_multimodal(query, matched_items)
        
        print(f"✓ Found {len(matched_items)} relevant items")
        return response
    
    # def _invoke_nova_multimodal(self, prompt: str, matched_items: List[Dict]) -> str:
    #     context = "\n".join([f"[Page {item['page']}] {item['text']}" for item in matched_items if item['type'] in ['text', 'table']])
        
    #     try:
    #         result = self.qa_pipeline(question=prompt, context=context[:2000])  # Limit context length
    #         return f"{result['answer']} (confidence: {result['score']:.2f})"
    #     except:
    #         return f"Based on the documents:\n{context[:500]}..."
        
    def _invoke_nova_multimodal(self, prompt: str, matched_items: List[Dict]) -> str:
        # Build and clean context
        context_parts = []
        
        for item in matched_items:
            if item['type'] in ['text', 'table', 'financial_text']:
                # Clean table formatting
                text = item['text']
                text = re.sub(r'\s*\|\s*', ' ', text)  # Remove pipe separators
                text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
                context_parts.append(f"Page {item['page']}: {text}")
        
        full_context = "\n".join(context_parts)
        
        # Extract specific relevant information
        prompt_lower = prompt.lower()
        relevant_info = []
        confidence_score = 0.0
        
        # Search for direct answers
        for line in full_context.split('\n'):
            line_lower = line.lower()
            
            # Check for year patterns in query
            year_match = re.search(r'20\d{2}', prompt)
            if year_match:
                year = year_match.group()
                if year in line and any(keyword in line_lower for keyword in ['revenue', 'ebitda', 'profit', 'sales', 'income']):
                    relevant_info.append(line.strip())
                    confidence_score += 0.3
            
            # Check for financial metrics
            if any(metric in prompt_lower for metric in ['revenue', 'ebitda', 'profit', 'income']):
                if re.search(r'\d+\.?\d*', line) and any(metric in line_lower for metric in ['revenue', 'ebitda', 'profit', 'income', 'sales']):
                    relevant_info.append(line.strip())
                    confidence_score += 0.2
        
        # Prepare context for generation
        if relevant_info:
            context_for_generation = "\n".join(relevant_info[:10])
            confidence_score = min(confidence_score, 1.0)
        else:
            context_for_generation = full_context[:1500]
            confidence_score = 0.1
        
        # Generate answer with better prompt
        if "how many years" in prompt_lower:
            # Extract all years mentioned
            years = sorted(set(re.findall(r'20\d{2}', full_context)))
            if years:
                answer = f"The document contains data for {len(years)} years: {', '.join(years)}"
                confidence_score = 0.9
            else:
                answer = "Could not determine the number of years covered in the document."
                confidence_score = 0.2
        
        elif "what is this pdf about" in prompt_lower:
            # Look for title/header information
            input_text = f"Summarize what this document is about based on the following content:\n\n{context_for_generation[:1000]}\n\nSummary:"
            try:
                response = self.generator(input_text, max_new_tokens=100, min_length=20, do_sample=False)
                answer = response[0]['generated_text']
                confidence_score = 0.7
            except:
                answer = "This appears to be a financial/sustainability report containing performance metrics and data."
                confidence_score = 0.5
        
        else:
            # For specific queries
            input_text = f"""Based on the following data, answer the question precisely. If there are numbers, include them with units.

    Data:
    {context_for_generation}

    Question: {prompt}

    Direct answer:"""
            
            try:
                response = self.generator(input_text, max_new_tokens=50, min_length=5, do_sample=False)
                generated = response[0]['generated_text'].strip()
                
                # Extract the most relevant number if found
                numbers_in_context = re.findall(r'\d+\.?\d*', context_for_generation)
                if numbers_in_context and generated:
                    answer = generated
                    confidence_score = max(confidence_score, 0.6)
                else:
                    answer = generated if generated else "No specific data found."
                    confidence_score = max(confidence_score, 0.3)
                    
            except Exception as e:
                # Fallback: extract the most relevant line
                if relevant_info:
                    answer = relevant_info[0]
                    confidence_score = 0.4
                else:
                    answer = "Could not extract specific information for this query."
                    confidence_score = 0.1
        
        # Format final response with confidence
        confidence_text = "High" if confidence_score > 0.7 else "Medium" if confidence_score > 0.4 else "Low"
        
        return f"""Answer: {answer}

    Confidence: {confidence_text} ({confidence_score:.1%})
    Source pages: {', '.join(str(item['page']) for item in matched_items[:3])}

    {f"Additional context: {relevant_info[1]}" if len(relevant_info) > 1 else ""}"""

def main():
    """Run the interactive console menu for the financial PDF processor.

    Creates one ``FinancialPDFProcessor`` and loops over a three-option menu:

    1. Select a PDF (``select_pdf_source``) and process it (``process_pdf``),
       then print the ``filepath``, ``items_count`` and ``pages`` entries of
       the returned result.
    2. Ask for a question and print the value returned by
       ``query_documents``. Refused until ``processor.items`` is non-empty.
    3. Exit.

    Errors raised while processing a PDF or answering a query are reported
    on the console and the menu is shown again, so a single failure never
    terminates the session. A closed stdin (``EOFError``) or a user interrupt
    (``KeyboardInterrupt``) at a prompt ends or cancels the current action
    cleanly instead of surfacing a traceback.
    """
    processor = FinancialPDFProcessor()

    print("=== Financial PDF Data Extractor with RAG ===")
    print("Specialized for precise financial data extraction from PDFs")

    while True:
        print("\n=== Main Menu ===")
        print("1. Process new PDF")
        print("2. Query current PDF")
        print("3. Exit")

        try:
            choice = input("Choose option (1-3): ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is coming (stdin closed) or the user interrupted
            # the prompt: treat it like choosing "Exit".
            print("\nGoodbye!")
            break

        if choice == "1":
            try:
                # The second tuple element (the source type) is not needed here.
                filepath, _ = processor.select_pdf_source()
                result = processor.process_pdf(filepath)
                print(f"\n✓ Successfully processed: {result['filepath']}")
                print(f"  - {result['items_count']} items extracted")
                print(f"  - {result['pages']} pages processed")

            except Exception as e:
                # Top-level UI boundary: report and return to the menu.
                print(f"✗ Error processing PDF: {e}")

        elif choice == "2":
            if not processor.items:
                print("No PDF processed yet. Please process a PDF first.")
                continue

            try:
                query = input("\nEnter your question: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Abandon this query; the menu prompt decides whether to exit.
                print("\nQuery cancelled.")
                continue

            if query:
                try:
                    response = processor.query_documents(query)
                except Exception as e:
                    # Same policy as the processing branch: a failed query
                    # must not tear down the whole interactive session.
                    print(f"✗ Error answering query: {e}")
                else:
                    print(f"\n=== Response ===\n{response}")

        elif choice == "3":
            print("Goodbye!")
            break

        else:
            print("Invalid choice. Please try again.")

# Start the interactive menu only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Device set to use cpu
Device set to use cpu


=== Financial PDF Data Extractor with RAG ===
Specialized for precise financial data extraction from PDFs

=== Main Menu ===
1. Process new PDF
2. Query current PDF
3. Exit

=== PDF Source Selection ===
1. Download from URL
2. Use local file
3. Select from common financial document URLs
✓ Selected: D:\DSAI\ESG\ESG-SCG_documents\EconomicPerformance2023.pdf

=== Processing PDF: EconomicPerformance2023.pdf ===


Processing PDF pages: 100%|██████████| 2/2 [00:04<00:00,  2.26s/it]



=== Generating Embeddings ===


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s] ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 105.80it/s]06,  1.68it/s, Text: 0/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 100.69it/s]06,  1.68it/s, Text: 1/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 101.85it/s]05,  1.68it/s, Text: 2/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 88.15it/s]:05,  1.68it/s, Text: 3/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 86.69it/s]:04,  1.68it/s, Text: 4/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 105.55it/s]04,  1.68it/s, Text: 5/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.78it/s]:03,  1.68it/s, Text: 6/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 124.30it/s]00, 10.62it/s, Text: 7/9, Table: 1/1, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 110.24it/s]00, 10.62it/s, Text: 8/9, Table: 1/1, Image

✓ Created FAISS index with 10 embeddings
✓ Processed 12 items from 2 pages

=== Processing Summary ===
Table: 1
Financial_text: 9
Page: 2

✓ Successfully processed: D:\DSAI\ESG\ESG-SCG_documents\EconomicPerformance2023.pdf
  - 12 items extracted
  - 2 pages processed

=== Main Menu ===
1. Process new PDF
2. Query current PDF
3. Exit

=== Querying: what is the revenue of 2021 ===


Batches: 100%|██████████| 1/1 [00:00<00:00, 55.95it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (2385 > 512). Running this sequence through the model will result in indexing errors


✓ Found 15 relevant items

=== Response ===
Answer: 2021 EBITDA (Billion Baht) 32.0 34.1 47.2 21.4 25.9 GRI 201-1 Profit for the year (Billion Baht) 32.0 34.1 47.2 21.4 25.9

    Confidence: High (100.0%)
    Source pages: 0, 0, 0

    Additional context: Page 0: Performance Data 2019 2020 2021 2022 2023 GRI Standards Unnamed: 0 SASB --------------------------------------------------------------------------------------- Revenue from sales (Billion Baht) 438.0 399.9 530.1 569.6 499.6 GRI 201-1 Profit for the year (Billion Baht) 32.0 34.1 47.2 21.4 25.9 GRI 201-1 EBITDA (Billion Baht) 75.1 74.6 91.9 61.9 54.1 GRI 201-1 Employee compensation comprising salary, wage, welfare, 48,139 46,796 47,921 50,732 50,190 GRI 201-1 and regular contributions (Million Baht) Dividend to shareholders (Million Baht) 16,800 16,800 22,200 9,600 7,200 GRI 201-1 Interest and financial expenses to lender (Million Baht) 6,442 7,082 6,758 7,523 10,297 GRI 201-1 Taxes to government and local government authorities

Batches: 100%|██████████| 1/1 [00:00<00:00, 98.61it/s]


✓ Found 15 relevant items

=== Response ===
Answer: (Billion Baht) 438.0 399.9 530.1 569.6 499.6 GRI 201-1 Profit for the year (Billion Baht) 32.0 34.1 47.2 21.4 25.9

    Confidence: High (100.0%)
    Source pages: 0, 0, 0

    Additional context: Page 0: Performance Data 2019 2020 2021 2022 2023 GRI Standards Unnamed: 0 SASB --------------------------------------------------------------------------------------- Revenue from sales (Billion Baht) 438.0 399.9 530.1 569.6 499.6 GRI 201-1 Profit for the year (Billion Baht) 32.0 34.1 47.2 21.4 25.9 GRI 201-1 EBITDA (Billion Baht) 75.1 74.6 91.9 61.9 54.1 GRI 201-1 Employee compensation comprising salary, wage, welfare, 48,139 46,796 47,921 50,732 50,190 GRI 201-1 and regular contributions (Million Baht) Dividend to shareholders (Million Baht) 16,800 16,800 22,200 9,600 7,200 GRI 201-1 Interest and financial expenses to lender (Million Baht) 6,442 7,082 6,758 7,523 10,297 GRI 201-1 Taxes to government and local government authorities such a

Batches: 100%|██████████| 1/1 [00:00<00:00, 80.80it/s]


✓ Found 15 relevant items

=== Response ===
Answer: GRI 201-1 Employee compensation comprising salary, wage, welfare, and regular contributions (Million Baht) 48,139 46,796 47,921 50,732 50,190 GRI 201-1 Dividend

    Confidence: Medium (60.0%)
    Source pages: 0, 0, 0

    

=== Main Menu ===
1. Process new PDF
2. Query current PDF
3. Exit

=== PDF Source Selection ===
1. Download from URL
2. Use local file
3. Select from common financial document URLs
✓ Selected: D:\DSAI\ESG\ESG-SCG_documents\SocialPerformance2023.pdf

=== Processing PDF: SocialPerformance2023.pdf ===


Processing PDF pages: 100%|██████████| 6/6 [00:04<00:00,  1.44it/s]



=== Generating Embeddings ===


Batches: 100%|██████████| 1/1 [00:00<00:00, 90.92it/s] ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.34it/s] ?it/s, Text: 0/30, Table: 1/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 94.97it/s]:01, 25.17it/s, Text: 1/30, Table: 1/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 97.01it/s]:01, 34.75it/s, Text: 2/30, Table: 1/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 84.24it/s]:00, 40.81it/s, Text: 3/30, Table: 1/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 100.05it/s]00, 43.85it/s, Text: 4/30, Table: 1/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 94.14it/s]:00, 55.21it/s, Text: 5/30, Table: 1/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 94.92it/s]:00, 55.21it/s, Text: 5/30, Table: 2/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.15it/s]:00, 55.21it/s, Text: 6/30, Table: 2/5, Image: 0/0]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.93it/s]:00, 55.21it/s, Text: 7/30, Table: 2/5, Imag

✓ Created FAISS index with 35 embeddings
✓ Processed 41 items from 6 pages

=== Processing Summary ===
Table: 5
Financial_text: 30
Page: 6

✓ Successfully processed: D:\DSAI\ESG\ESG-SCG_documents\SocialPerformance2023.pdf
  - 41 items extracted
  - 6 pages processed

=== Main Menu ===
1. Process new PDF
2. Query current PDF
3. Exit

=== Querying: hours worked by Employee of 2020 ===


Batches: 100%|██████████| 1/1 [00:00<00:00, 119.12it/s]


✓ Found 15 relevant items

=== Response ===
Answer: 0 ------------------------------------------------------------------------ Occupational Illness Frequency Rate (Cases/1,000,000 Hours Worked) • Employee

    Confidence: Medium (60.0%)
    Source pages: 0, 0, 3

    

=== Main Menu ===
1. Process new PDF
2. Query current PDF
3. Exit
Goodbye!
