In [None]:
import os
import re
import gc
import tempfile
import pdfplumber
import camelot
import pandas as pd
from pathlib import Path
from typing import Dict, Any, Optional, Type, List
from dateutil import parser
from collections import defaultdict


class XacBank25Parser():
    """
    Parser for XacBank account-statement PDFs (parser id ``xacbank_25``).

    Customer details are pulled from the first page's text with regular
    expressions; transactions are pulled from the tables Camelot detects in
    ``stream`` mode and are then stitched back together, because a single
    transaction's description can be spread over several table rows.

    NOTE(review): the progress messages still carry a "TDB Old" prefix that
    looks copied from another parser; they are left unchanged here so that
    existing log output stays stable.
    """

    # Number of cells a normalised statement row must have. Shorter rows are
    # padded with empty strings so later positional access cannot fail.
    _COLUMN_COUNT = 6

    # Rows whose first (raw) cell equals one of these are header/meta rows.
    _SKIP_FIRST_CELL = ('Дансны төрөл :', 'Хугацаа :', 'Огноо')
    # Rows whose third (raw) cell equals one of these are opening/closing
    # balance rows, not transactions.
    _SKIP_THIRD_CELL = ('Эхний үлдэгдэл', 'Эцсийн үлдэгдэл :')
    # Rows containing one of these disclaimer lines in any cell are skipped.
    _SKIP_ANY_CELL = (
        'Энэхүү дансны хуулга нь мэдээллийг орхигдуулсан болон алдаатай мэдээлэл агуулж байвал энэ талаар дансны хуулга авснаас хойш ажлын 10',
        'өдөрт багтаан салбарын захирал, эсвэл нягтлан бодогчид бичгээр мэдэгдэнэ үү.',
    )
    # First cell of the totals row; nothing after it in a table is read.
    _TOTALS_MARKER = 'Нийт'

    def __init__(
        self,
        sources_dir: str = "sources",
        results_dir: str = "results",
        name_pattern: str = r'Дансны нэр :\s*([^\n]+)',
        account_pattern: str = r'Дансны дугаар :\s*(\d+)\s*',
        account_type_pattern: str = r'Валют :\s*([^\n]+)',
        table_start_pattern: str = r'Цаг Салбар',
        table_end_pattern: str = r'Огноо Гүйлгээ хийсэн банк',
    ):
        """
        Store the regular expressions used to read the statement header.

        Args:
            sources_dir: Accepted for interface compatibility; not used by
                this class.
            results_dir: Accepted for interface compatibility; not used by
                this class.
            name_pattern: Regex whose group 1 is the account holder's name.
            account_pattern: Regex whose group 1 is the account number.
            account_type_pattern: Regex whose group 1 is the account type
                (the default pattern captures the text after "Валют :").
            table_start_pattern: Stored on the instance; not read by any
                method of this class.
            table_end_pattern: Stored on the instance; not read by any
                method of this class.
        """
        self.name_pattern = name_pattern
        self.account_pattern = account_pattern
        self.account_type_pattern = account_type_pattern
        self.table_start_pattern = table_start_pattern
        self.table_end_pattern = table_end_pattern
        self.statement_parser = 'xacbank_25'

    def extract_customer_details(self, text: str) -> Dict[str, str]:
        """
        Extract customer details from the statement's header text.

        Args:
            text: Plain text of the page that holds the account header.

        Returns:
            Dict with ``first_name``, ``last_name``, ``account_number``,
            ``account_type`` and ``statement_parser``. Fields that cannot be
            found stay empty strings; extraction errors are printed, not
            raised.
        """
        details = {
            'first_name': '',
            'last_name': '',
            'account_number': '',
            'account_type': '',
            'statement_parser': self.statement_parser
        }

        try:
            name_match = re.search(self.name_pattern, text)
            if name_match:
                # The first word is stored as the last name and the second
                # as the first name; single-word names are left empty.
                name_parts = name_match.group(1).strip().split()
                if len(name_parts) >= 2:
                    details['last_name'] = name_parts[0]
                    details['first_name'] = name_parts[1]

            account_match = re.search(self.account_pattern, text)
            if account_match:
                details['account_number'] = account_match.group(1)

            account_type_match = re.search(self.account_type_pattern, text)
            if account_type_match:
                # group(1) is the captured value; group(0) would also
                # include the "Валют :" label itself.
                details['account_type'] = account_type_match.group(1).strip()

        except Exception as e:
            # Best effort: a bad pattern or non-string input must not abort
            # the whole parse.
            print(f"Error extracting customer details: {str(e)}")

        return details

    def extract_statement_details(
        self,
        pdf_file: Any,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """
        Extract transactions from the statement tables of a PDF.

        Args:
            pdf_file: Either a file path or a file-like object. File-like
                objects are copied to a temporary file for Camelot.
            start_page: Starting page number (1-indexed). If None, starts
                from page 1.
            end_page: Ending page number (1-indexed, inclusive). If None,
                processes through the last page.

        Returns:
            List of transaction dicts with the keys ``type``, ``date``,
            ``income``, ``expense``, ``balance_end``, ``related_account``,
            ``related_account_name`` and ``description``. An empty list is
            returned when no table rows could be extracted.
        """
        try:
            page_range = self._build_page_range(start_page, end_page)
            tables = self._read_tables(pdf_file, page_range)
        except Exception as e:
            print(f"Error extracting tables from PDF: {str(e)}")
            return []

        print(f"Found {len(tables)} tables")

        rows = self._collect_rows(tables)
        print("Extracted clean data count: ", len(rows))

        # Nothing to group: return early instead of indexing into an empty
        # grouping further down.
        if not rows:
            return []

        self._assign_groups(rows)
        return self._build_transactions(rows)

    @staticmethod
    def _build_page_range(start_page: Optional[int], end_page: Optional[int]) -> str:
        """Translate the optional 1-indexed page bounds into Camelot's ``pages`` string."""
        if start_page is not None and end_page is not None:
            # Explicit comma-separated list of every page in the range.
            page_range = ','.join(str(i) for i in range(start_page, end_page + 1))
        elif start_page is not None:
            page_range = f"{start_page}-end"
        elif end_page is not None:
            page_range = f"1-{end_page}"
        else:
            print("TDB Old processing all pages")
            return 'all'

        print(f"TDB Old processing pages: {page_range}")
        return page_range

    @staticmethod
    def _run_camelot(pdf_path: Any, page_range: str) -> Any:
        """Run Camelot in stream mode; return an empty list if it fails."""
        try:
            return camelot.read_pdf(pdf_path, pages=page_range, flavor='stream')
        except SystemExit as e:
            # SystemExit is not an Exception subclass, so it needs its own
            # clause to keep a failing backend from terminating the caller.
            print(f"Camelot SystemExit error: {e}. Falling back to empty result.")
        except Exception as e:
            print(f"Camelot processing error: {e}. Falling back to empty result.")
        return []

    def _read_tables(self, pdf_file: Any, page_range: str) -> Any:
        """Read tables from a path, or from a file-like object via a temporary copy."""
        if not hasattr(pdf_file, 'read'):
            return self._run_camelot(pdf_file, page_range)

        temp_file_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
                temp_file_path = temp_file.name
                if hasattr(pdf_file, 'chunks'):
                    # Upload-style objects expose chunks() for streaming.
                    for chunk in pdf_file.chunks():
                        temp_file.write(chunk)
                else:
                    # Plain file objects only offer read().
                    temp_file.write(pdf_file.read())

            print("temp file", temp_file_path)
            return self._run_camelot(temp_file_path, page_range)
        finally:
            # Remove the copy even when writing it or reading it failed.
            if temp_file_path and os.path.exists(temp_file_path):
                os.unlink(temp_file_path)

    def _collect_rows(self, tables: Any) -> List[List[Any]]:
        """Flatten Camelot tables into stripped rows, dropping header, balance and disclaimer rows."""
        rows = []

        for table in tables:
            for row_idx, row in enumerate(table.data):
                # The first row of every table is treated as a header.
                if row_idx == 0:
                    continue

                # Guarded lookups: a short row must not raise IndexError.
                first_cell = row[0] if len(row) > 0 else ''
                third_cell = row[2] if len(row) > 2 else ''

                if (
                    third_cell in self._SKIP_THIRD_CELL
                    or first_cell in self._SKIP_FIRST_CELL
                    or any(cell in self._SKIP_ANY_CELL for cell in row)
                ):
                    continue

                # The totals row ends this table; later tables are still read.
                if first_cell == self._TOTALS_MARKER:
                    break

                cells = [str(val).strip() for val in row]

                if len(cells) == 7:
                    # A seven-cell row carries one surplus cell at index 3 or
                    # 4; drop index 3 when index 4 holds a value, else index 4.
                    del cells[3 if cells[4] else 4]

                # Pad short rows (no-op for rows that are already long enough).
                cells.extend([''] * (self._COLUMN_COUNT - len(cells)))
                rows.append(cells)

        return rows

    @staticmethod
    def _assign_groups(rows: List[List[Any]]) -> None:
        """
        Prefix every row with a group id, in place.

        A row with a non-empty first cell starts a new group; rows with an
        empty first cell are continuation rows. After the prefix is added the
        cells are: 0 group id, 1 original first cell, 2 original second cell
        (reused as a 'Used' marker), 3 description, 4 income, 5 expense,
        6 balance.
        """
        group_counter = 0
        for row in rows:
            if row[0] != '':
                group_counter += 1
            row.insert(0, group_counter)

        total = len(rows)
        for idx, row in enumerate(rows):
            if row[1] == '':
                continue

            # Find the next row that starts a group.
            next_idx = idx + 1
            while next_idx < total and rows[next_idx][1] == '':
                next_idx += 1

            if next_idx >= total or next_idx == idx + 1 or rows[next_idx - 1][2] == 'Used':
                continue

            # Continuation rows lie between the two group starts: the row just
            # before the next start is handed to that next group, and so is
            # the row just after it when there is one.
            group_id = rows[next_idx][0]
            rows[next_idx - 1][0] = group_id
            if next_idx + 1 < total:
                rows[next_idx + 1][0] = group_id
                rows[next_idx - 1][2] = 'Used'
                rows[next_idx + 1][2] = 'Used'

    @staticmethod
    def _parse_amount(cell: str) -> Optional[float]:
        """Convert a cell such as ``'1,234.50'`` to a float; an empty cell gives None."""
        return float(cell.replace(',', '')) if cell else None

    def _build_transactions(self, rows: List[List[Any]]) -> List[Dict[str, Any]]:
        """Merge the grouped rows into one transaction dict per group."""
        groups = defaultdict(list)
        for row in rows:
            groups[row[0]].append(row)

        # Continuation rows seen before the first group start sit in group 0;
        # fold them into group 1. Group 0 is absent when the first row already
        # starts a group, so look it up without creating the key.
        leading = groups.get(0)
        if leading and not leading[0][1]:
            groups[1] = leading + groups[1]
            groups.pop(0)

        processed_list = []

        for position, members in enumerate(groups.values()):
            # NOTE(review): the first group is always dropped, presumably
            # because it holds statement header lines rather than a
            # transaction — confirm, since this applies to every page range.
            if position == 0:
                continue

            descriptions = []
            anchor = None
            for member in members:
                descriptions.append(member[3])
                if member[1] != '':
                    # The last row with a non-empty first cell supplies the amounts.
                    anchor = member

            if anchor is None:
                # Without such a row there are no amounts to read.
                print("Skipping row group without a dated transaction row")
                continue

            income = self._parse_amount(anchor[4])
            expense = self._parse_amount(anchor[5])
            # Gate the balance on its own cell, not on the expense cell.
            balance = self._parse_amount(anchor[6])

            processed_list.append({
                'type': 'income' if income is not None and income > 0 else 'expense',
                # NOTE(review): the date is never filled in by this parser.
                'date': None,
                'income': income,
                'expense': expense,
                'balance_end': balance if balance is not None else 0,
                'related_account': '',
                'related_account_name': '',
                'description': ' '.join(descriptions)
            })

        return processed_list

    def parse_pdf(
        self,
        pdf_path: Path,
        register_number: str,
        request=None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        chunk_size: int = 10,
    ) -> Dict[str, Any]:
        """
        Parse one statement PDF and report what was extracted.

        Args:
            pdf_path: Path of the PDF (``str`` or ``Path``).
            register_number: Currently unused; it was consumed by a
                persistence call that is disabled in this version.
            request: Currently unused, for the same reason.
            start_page: First page to process (1-indexed); defaults to 1.
            end_page: Last page to process (1-indexed, inclusive); defaults
                to the last page.
            chunk_size: Number of pages handed to Camelot per call. Values
                below 1 are treated as 1.

        Returns:
            Dict with ``id`` (always empty here), ``name`` (last name, a
            space, first name), ``account`` and ``created_transactions``.
            Errors are printed and the result collected so far is returned.
        """
        result = {
            'id': "",
            'name': "",  # full name
            'account': "",
            'created_transactions': 0
        }
        # Bound up front so the summary below never reads an unset name.
        total_transactions = 0

        try:
            # A non-positive chunk size would never advance the loop below.
            chunk_size = max(1, chunk_size)

            with pdfplumber.open(pdf_path) as pdf:
                # extract_text() may yield None; use '' so the regex
                # searches simply find nothing.
                first_page_text = pdf.pages[0].extract_text() or ''

                customer_details = self.extract_customer_details(first_page_text)
                # Persistence is disabled: the extracted details stand in for
                # both the customer and the account record.
                account = customer_details
                customer = customer_details

                if account:
                    print(f"Successfully saved customer details for account {account['account_number']}")

                    # Clamp the requested range to the pages that exist.
                    total_pages = len(pdf.pages)
                    start_idx = max(0, (start_page - 1) if start_page is not None else 0)
                    end_idx = min(total_pages, end_page if end_page is not None else total_pages)

                    if start_idx >= end_idx:
                        print(f"Invalid page range: start_page={start_page}, end_page={end_page}, total_pages={total_pages}")
                        return result

                    print(f"TDB Old processing pages {start_idx + 1} to {end_idx} of {total_pages} total pages in chunks of {chunk_size}")

                    # Process pages in chunks to prevent memory issues.
                    chunk_start = start_idx
                    while chunk_start < end_idx:
                        chunk_end = min(chunk_start + chunk_size, end_idx)
                        print(f"TDB Old processing chunk: pages {chunk_start + 1} to {chunk_end}")

                        chunk_transactions = self.extract_statement_details(pdf_path, chunk_start + 1, chunk_end)
                        print(f"TDB Old chunk {chunk_start + 1}-{chunk_end}: found {len(chunk_transactions)} transactions")

                        if chunk_transactions:
                            # Transactions are only counted; saving them is
                            # disabled in this version.
                            total_transactions += len(chunk_transactions)
                            print(f"TDB Old chunk {chunk_start + 1}-{chunk_end}: successfully saved {len(chunk_transactions)} transactions")

                        # Release the chunk before the next Camelot call.
                        del chunk_transactions
                        gc.collect()

                        chunk_start = chunk_end

                    print(f"TDB Old successfully processed {total_transactions} total transactions from pages {start_idx + 1}-{end_idx}")
                else:
                    # Works for Path objects and plain strings alike.
                    display_name = getattr(pdf_path, 'name', pdf_path)
                    print(f"Failed to save customer details for {display_name}")

                result['name'] = customer['last_name'] + " " + customer['first_name']
                result['account'] = account['account_number']
                result['created_transactions'] = total_transactions

        except Exception as e:
            print(f"Error parsing {pdf_path}: {str(e)}")

        return result


In [114]:
# Statement PDF to parse; a bare file name, so it is resolved against the
# current working directory.
file = "пос хуулга.pdf"

# Parser instance built with its default header patterns.
parser_class = XacBank25Parser()

# Run the parse; `result` holds the summary dict returned by parse_pdf.
result = parser_class.parse_pdf(pdf_path=file, register_number='УО02303134')

Successfully saved customer details for account 5005719683
TDB Old processing pages 1 to 15 of 15 total pages in chunks of 10
TDB Old processing chunk: pages 1 to 10
TDB Old processing pages: 1,2,3,4,5,6,7,8,9,10
Found 10 tables
Extracted clean data count:  701
TDB Old chunk 1-10: found 258 transactions
TDB Old chunk 1-10: successfully saved 258 transactions
TDB Old processing chunk: pages 11 to 15
TDB Old processing pages: 11,12,13,14,15
Found 5 tables
Extracted clean data count:  321
TDB Old chunk 11-15: found 110 transactions
TDB Old chunk 11-15: successfully saved 110 transactions
TDB Old successfully processed 368 total transactions from pages 1-15
