In [None]:
from datetime import datetime

import camelot.io as camelot
import pandas as pd

In [None]:
# --- Notebook configuration ---------------------------------------------
# Relative path of the bank-statement PDF that the extraction cell reads.
PDF_FILE_PATH = "../docs/providus-statement.pdf"

# Day/month/year strftime pattern. In this notebook it is only referenced by
# the (commented-out) CSV export; date *parsing* uses its own format.
DATE_FORMAT = "%d/%m/%Y"

# Column headers from the document, in the order they are assigned to the
# extracted table.
COLUMN_NAMES = ["TXN DATE", "VAL DATE", "REMARKS", "DEBIT", "CREDIT", "BALANCE"]

def _clean_numeric_col(series: pd.Series) -> pd.Series:
    """Convert a column of amount strings (e.g. ``"1,234.50"``) to numbers.

    Commas are removed first, then every character other than digits, ``.``
    and ``-`` is stripped. Anything that still fails to parse (including
    empty cells) becomes NaN.
    """
    digits_only = (
        series.astype(str)
        .str.replace(',', '', regex=False)
        .str.replace(r'[^\d.-]', '', regex=True)
    )
    return pd.to_numeric(digits_only, errors='coerce')


def _merge_continuation_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Fold continuation rows into the record they belong to.

    A row whose BALANCE and DEBIT are both missing is treated as a
    continuation: its REMARKS text is appended to the current record and its
    other cells are discarded. Any other row starts a new record. If the
    frame begins with continuation rows, the first of them becomes the
    current record.

    NOTE(review): CREDIT is not part of the continuation test, so a row that
    carries a CREDIT but no BALANCE is folded into the previous record.
    Confirm this is intended for the statement layout.
    """
    merged_rows = []
    current_row = None

    for _, row in df.iterrows():
        is_continuation = pd.isna(row['BALANCE']) and pd.isna(row['DEBIT'])

        if is_continuation and current_row is not None:
            # Skip missing cells so a NaN remark never leaks into the text as
            # the literal string "nan".
            parts = [
                str(part)
                for part in (current_row['REMARKS'], row['REMARKS'])
                if pd.notna(part)
            ]
            current_row['REMARKS'] = " ".join(parts).strip()
            continue

        # Either a real transaction row, or the very first row of the frame.
        if current_row is not None:
            merged_rows.append(current_row)
        current_row = row.copy()

    # Flush the record that was still being built when the loop ended.
    if current_row is not None:
        merged_rows.append(current_row)

    # Passing the columns keeps the schema even when no rows were collected;
    # restoring the dtypes undoes the object upcast caused by iterrows().
    rebuilt = pd.DataFrame(merged_rows, columns=df.columns)
    return rebuilt.astype(df.dtypes.to_dict())


def extract_and_clean_transactions(pdf_path: str, date_format: str = "%d-%m-%Y") -> pd.DataFrame:
    """Extract, build, and clean transaction data from a PDF.

    Every page of ``pdf_path`` is read with camelot's ``stream`` flavor, the
    per-page tables are stacked, and the result is labelled with
    ``COLUMN_NAMES``. Text cells are normalised, the DEBIT/CREDIT/BALANCE
    columns are converted to numbers, both date columns are parsed, multi-line
    remarks are merged into their transaction row, and rows without a
    parseable VAL DATE are dropped.

    Args:
        pdf_path: Path of the statement PDF.
        date_format: strptime pattern used to parse TXN DATE and VAL DATE.
            Values that do not match become NaT.

    Returns:
        A DataFrame with the ``COLUMN_NAMES`` columns and a fresh 0..n-1
        index, containing only rows whose VAL DATE parsed successfully.

    Raises:
        ValueError: If no table is extracted, or if the extracted table does
            not have exactly ``len(COLUMN_NAMES)`` columns.
    """
    print("Starting extraction...")
    tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream', row_tol=10)

    if len(tables) == 0:
        # pd.concat would fail on an empty list with a far less helpful message.
        raise ValueError(f"No tables could be extracted from '{pdf_path}'.")

    # Stack the per-page tables and discard columns that are entirely NaN.
    transactions_df = pd.concat(
        [table.df for table in tables], ignore_index=True
    ).dropna(axis=1, how='all')

    print(f"extracted {len(tables)} tables with shape: {transactions_df.shape}")
    display(transactions_df.head(50))

    # Checked after the preview above so the raw extraction can be inspected.
    expected_columns = len(COLUMN_NAMES)
    if transactions_df.shape[1] != expected_columns:
        raise ValueError(
            f"Expected {expected_columns} columns {COLUMN_NAMES} but the "
            f"extracted table has {transactions_df.shape[1]}."
        )

    print("Performing final data cleaning...")
    cleaned_df = transactions_df.copy()
    cleaned_df.columns = list(COLUMN_NAMES)

    # Clean up newline characters and leading/trailing spaces. The second
    # check also covers pandas' dedicated string dtypes.
    for col in cleaned_df.columns:
        column = cleaned_df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            cleaned_df[col] = column.str.replace('\n', ' ', regex=False).str.strip()

    # Plain column assignment replaces the column, so the parsed dtype is
    # kept; `.loc[:, col] = ...` writes into the existing object column on
    # recent pandas versions and can leave the dtype as object.
    for col in ('DEBIT', 'CREDIT', 'BALANCE'):
        cleaned_df[col] = _clean_numeric_col(cleaned_df[col])

    for col in ('TXN DATE', 'VAL DATE'):
        cleaned_df[col] = pd.to_datetime(
            cleaned_df[col],
            format=date_format,
            errors='coerce'
        )

    cleaned_df = _merge_continuation_rows(cleaned_df)

    print("Cleaned DataFrame:")
    display(cleaned_df.head(50))

    # Rows without a parseable value date are dropped.
    cleaned_df = cleaned_df[cleaned_df['VAL DATE'].notna()].reset_index(drop=True)

    print("Extraction and cleaning complete.")
    return cleaned_df

In [None]:
if __name__ == "__main__":
    # Run the full extraction pipeline on the configured statement.
    final_transactions_df = extract_and_clean_transactions(PDF_FILE_PATH)

    # Summary header followed by the number of cleaned transactions.
    print("\n--- Extracted Transactions ---")
    print(f"Total transactions extracted: {final_transactions_df.shape[0]}")

    # Preview both ends of the result: up to 50 rows from the top ...
    print("\nFirst 50 Transactions:")
    display(final_transactions_df.head(50))

    # ... and up to 50 rows from the bottom.
    print("\nLast 50 Transactions:")
    display(final_transactions_df.tail(50))

    # Optional: save to CSV (uncomment to enable). The timestamp is built on
    # its own line so the f-string does not nest identical quote characters,
    # which is a syntax error before Python 3.12.
    # try:
    #     timestamp = datetime.now().strftime("%d%m%Y%H%M%S")
    #     output_csv_path = f"../output/providus_extracted_transactions{timestamp}.csv"
    #     final_transactions_df.to_csv(output_csv_path, date_format=DATE_FORMAT, index=False)
    #     print(f"\nSuccessfully saved cleaned data to '{output_csv_path}'")
    # except Exception as e:
    #     print(f"\nCould not save to CSV: {e}")