In [None]:
from datetime import datetime

import camelot
import pandas as pd

In [None]:
# --- Notebook settings -------------------------------------------------------
# Relative path of the statement PDF that gets parsed.
PDF_FILE_PATH = '../docs/statement-4.pdf'
# strftime pattern handed to `to_csv(date_format=...)` when the result is saved.
DATE_FORMAT = "%m/%d/%Y"

# Headers of the statement's transaction table. They are assigned to the
# extracted columns by position, so the order here must match the document.
COLUMN_NAMES = ["Posted Date", "Value Date", "Description", "Debit", "Credit", "Balance"]

def _clean_numeric_col(series: pd.Series) -> pd.Series:
    """Turn a column of amount strings into numbers; unparseable cells become NaN."""
    # Keeping only digits, '.' and '-' also removes thousands separators,
    # letters and currency symbols in a single pass.
    stripped = series.astype(str).str.replace(r'[^\d.-]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')


def _concatenate_rows_until_balance(df: pd.DataFrame) -> pd.DataFrame:
    """Fold every row without a balance into the description of the row before it."""
    merged_rows = []
    current_row = None

    for _, row in df.iterrows():
        if current_row is None or pd.notna(row['Balance']):
            # A row with a balance (or the very first row) starts a new record.
            if current_row is not None:
                merged_rows.append(current_row)
            current_row = row.copy()
        else:
            # Continuation line: only its description text is kept.
            current_row['Description'] = f"{current_row['Description']} {row['Description']}".strip()

    if current_row is not None:
        merged_rows.append(current_row)

    # Passing the columns explicitly keeps the schema when there are no rows.
    return pd.DataFrame(merged_rows, columns=df.columns)


def extract_and_clean_transactions(
    pdf_path: str,
    pages: str = '1-3',
    input_date_format: str = "%d-%b-%y",
) -> pd.DataFrame:
    """Extract the transaction table from a statement PDF and return it cleaned.

    The tables read by camelot are filtered to those with exactly
    ``len(COLUMN_NAMES)`` columns, stacked, and labelled with ``COLUMN_NAMES``.
    Text cells are stripped of newlines and surrounding whitespace, the
    amount columns are converted to numbers, the two date columns are parsed,
    rows without a balance are merged into the preceding row's description,
    and finally rows whose value date could not be parsed are dropped.

    Args:
        pdf_path: Path of the PDF passed to ``camelot.read_pdf``.
        pages: Page selection passed to ``camelot.read_pdf``.
        input_date_format: ``strptime`` pattern of the dates in the PDF.

    Returns:
        The cleaned transactions with a fresh 0..n-1 index.

    Raises:
        ValueError: If no extracted table has ``len(COLUMN_NAMES)`` columns.
    """
    print("Starting extraction...")
    tables = camelot.read_pdf(pdf_path, pages=pages, flavor='stream', row_tol=15)

    num_columns = len(COLUMN_NAMES)
    matching_frames = [table.df for table in tables if table.df.shape[1] == num_columns]
    if not matching_frames:
        # pd.concat would raise a ValueError here anyway; say what went wrong.
        raise ValueError(
            f"No table with {num_columns} columns found on pages {pages!r} of {pdf_path!r}"
        )

    # Every kept frame has exactly num_columns columns, so the headers can be
    # assigned by position right away.
    cleaned_df = pd.concat(matching_frames, ignore_index=True)
    cleaned_df.columns = list(COLUMN_NAMES)

    print("Performing final data cleaning...")

    # Clean up newline characters and leading/trailing spaces in text columns.
    for col in cleaned_df.columns:
        column = cleaned_df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            cleaned_df[col] = column.str.replace('\n', ' ', regex=False).str.strip()

    # Plain column assignment (not `.loc[:, col] = ...`) replaces the column,
    # so the converted dtype is adopted instead of being written into the
    # existing text column.
    for col in ('Debit', 'Credit', 'Balance'):
        cleaned_df[col] = _clean_numeric_col(cleaned_df[col])

    for col in ('Posted Date', 'Value Date'):
        cleaned_df[col] = pd.to_datetime(
            cleaned_df[col],
            format=input_date_format,
            errors='coerce'
        )

    cleaned_df = _concatenate_rows_until_balance(cleaned_df)

    print(f"Extracted DataFrame shape: {cleaned_df.shape}")
    # Diagnostic preview of the merged rows before unparseable rows are
    # filtered out. `display` is not imported in this notebook; it is the
    # helper IPython makes available in notebook kernels.
    display(cleaned_df.head(50))

    # Rows without a parseable value date are dropped.
    cleaned_df = cleaned_df.loc[cleaned_df['Value Date'].notna()].reset_index(drop=True)

    print("Extraction and cleaning complete.")
    return cleaned_df

In [None]:
if __name__ == '__main__':
    # --- Execute the script ---
    final_transactions_df = extract_and_clean_transactions(PDF_FILE_PATH)

    # --- Display Results ---
    print("\n--- Extracted Transactions ---")
    print(f"Total transactions extracted: {len(final_transactions_df)}")

    # Display the first 50 and last 50 transactions
    print("\nFirst 50 Transactions:")
    display(final_transactions_df.head(50))

    print("\nLast 50 Transactions:")
    display(final_transactions_df.tail(50))

    # --- Optional: Save to CSV ---
    # The timestamp is formatted outside the f-string: reusing the enclosing
    # quote character inside an f-string expression is a SyntaxError on
    # Python versions before 3.12.
    timestamp = datetime.now().strftime('%d%m%Y%H%M%S')
    output_csv_path = f'../output/access_extracted_transactions{timestamp}.csv'
    try:
        final_transactions_df.to_csv(output_csv_path, date_format=DATE_FORMAT, index=False)
        print(f"\nSuccessfully saved cleaned data to '{output_csv_path}'")
    except Exception as e:
        # Saving is best-effort: report the failure instead of aborting.
        print(f"\nCould not save to CSV: {e}")