In [37]:
from datetime import datetime

import camelot
import pandas as pd

In [38]:
# Configuration
PDF_FILE_PATH = '../docs/statement-2.pdf'
DATE_FORMAT = "%m/%d/%Y"

# Column Headers from the Document
COLUMN_NAMES = [
    "Trans. Time",
    "Value Date",
    "Description",
    "Debit/Credit(N)",
    "Balance(N)",
    "Channel",
    "Transaction Reference"
]

def is_valid_date(date_string: str) -> bool:
    """Check if a string matches 'YYYY Mon DD HH:MM:SS'."""
    try:
        datetime.strptime(date_string, "%Y %b %d %H:%M:%S")
        return True
    except ValueError:
        return False

def build_transactions_with_validation(raw_df: pd.DataFrame) -> pd.DataFrame:
    """
    The `build_transactions_with_validation` function processes a raw DataFrame of transaction data
    and constructs a cleaned DataFrame of transactions by identifying valid transaction boundaries
    based on date validation. The logic is as follows:

    1. **Initialize Variables**:
        - `transactions`: A list to store completed transactions.
        - `transaction_in_progress`: A dictionary to hold the current transaction being built.
        - `valid_transaction`: A flag to indicate if the current transaction is valid.

    2. **Iterate Through Rows**:
        - For each row, check if the first column contains a valid timestamp using the `is_valid_date` function.
        - If a valid timestamp is found:
            - Append the current `transaction_in_progress` to the `transactions` list (if it exists).
            - Start a new transaction with the current row.
            - Set `valid_transaction` to `True`.
        - If no valid timestamp is found:
            - If no transaction is in progress, initialize `transaction_in_progress` with the current row.
            - Otherwise, concatenate the current row's data into the existing `transaction_in_progress`.
            - If the concatenated data forms a valid timestamp, set `valid_transaction` to `True`.

    3. **Handle Remaining Transaction**:
        - After iterating through all rows, append the last `transaction_in_progress` to the `transactions` list.

    4. **Return Result**:
        - Convert the `transactions` list into a DataFrame and return it.

    Reasoning:
    This function is due to the fact that when opay transactions spills to the next line at the end of the page, it doesn't
    start recording most of the columns on the first line causing an issue when using the `concatenate_rows_until_balance` function
    as done for other banks. Instead, we will check for valid timestamps to determine the partitioning of transactions.
    """
    transactions = []
    transaction_in_progress = {}
    valid_transaction = False

    display(raw_df.head(50))

    for _, row in raw_df.iterrows():
        timestamp_candidate = str(row.iloc[0]).strip()
        if is_valid_date(timestamp_candidate):
            if transaction_in_progress:
                transactions.append(transaction_in_progress)
            transaction_in_progress = row.to_dict()
            valid_transaction = True
        elif not transaction_in_progress:
            transaction_in_progress = row.to_dict()
        else:
            if valid_transaction:
                merged_date = transaction_in_progress[raw_df.columns[0]] + timestamp_candidate
                if not is_valid_date(merged_date):
                    transactions.append(transaction_in_progress)
                    transaction_in_progress = row.to_dict()
                    valid_transaction = False
                    continue
            for col in raw_df.columns:
                existing_val = transaction_in_progress.get(col, "")
                transaction_in_progress[col] = f"{existing_val}{row[col]}".strip()

            if is_valid_date(transaction_in_progress[raw_df.columns[0]]):
                valid_transaction = True

    if transaction_in_progress:
        transactions.append(transaction_in_progress)

    return pd.DataFrame(transactions)

def extract_and_clean_transactions(pdf_path: str) -> pd.DataFrame:
    """Extract, build, and clean transaction data from a PDF."""
    print("Starting extraction...")
    tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream', row_tol=10)

    num_columns = len(COLUMN_NAMES)
    raw_df = pd.concat([table.df for table
                                 in tables
                                 if table.df.shape[1] == num_columns], ignore_index=True)
    raw_df.dropna(axis=1, how='all', inplace=True)

    raw_df.columns = COLUMN_NAMES[:num_columns]

    print("Building transactions with validation logic...")
    transactions_df = build_transactions_with_validation(raw_df)

    print("Performing final data cleaning...")
    cleaned_df = transactions_df.copy()

    # Clean up newline characters and leading/trailing spaces
    for col in cleaned_df.columns:
        if cleaned_df[col].dtype == 'object':
            cleaned_df.loc[:, col] = (
                cleaned_df[col]
                .str.replace('\n', ' ', regex=False)
                .str.strip()
            )

    def clean_numeric_col(series: pd.Series) -> pd.Series:
        # Remove commas, letters, and any non-numeric characters except for '.' and '-'
        cleaned_series = series.astype(str).str.replace(',', '', regex=False)
        cleaned_series = cleaned_series.str.replace(r'[^\d.-]', '', regex=True)
        return pd.to_numeric(cleaned_series, errors='coerce')

    cleaned_df.loc[:, 'Debit/Credit(N)'] = clean_numeric_col(cleaned_df['Debit/Credit(N)'])
    cleaned_df.loc[:, 'Balance(N)'] = clean_numeric_col(cleaned_df['Balance(N)'])

    cleaned_df.loc[:, 'Trans. Time'] = pd.to_datetime(
        cleaned_df['Trans. Time'],
        format="%Y %b %d %H:%M:%S",
        errors='coerce'
    )
    cleaned_df.loc[:, 'Value Date'] = pd.to_datetime(
        cleaned_df['Value Date'],
        format="%d %b %Y",
        errors='coerce'
    )

    cleaned_df = cleaned_df[cleaned_df['Trans. Time'].notna()]
    cleaned_df.reset_index(drop=True, inplace=True)

    print("Extraction and cleaning complete.")
    return cleaned_df

In [39]:
if __name__ == '__main__':
    # --- Execute the script ---
    final_transactions_df = extract_and_clean_transactions(PDF_FILE_PATH)

    # --- Display Results ---
    print("\n--- Extracted Transactions ---")
    print(f"Total transactions extracted: {len(final_transactions_df)}")

    # Display the first 50 and last 50 transactions
    print("\nFirst 50 Transactions:")
    display(final_transactions_df.head(50))

    print("\nLast 50 Transactions:")
    display(final_transactions_df.tail(50))

    # --- Optional: Save to CSV ---
    try:
        output_csv_path = f'../output/opay_extracted_transactions{datetime.now().strftime('%d%m%Y%H%M%S')}.csv'
        final_transactions_df.to_csv(output_csv_path, date_format=DATE_FORMAT, index=False)
        print(f"\nSuccessfully saved cleaned data to '{output_csv_path}'")
    except Exception as e:
        print(f"\nCould not save to CSV: {e}")

Starting extraction...
Building transactions with validation logic...


  if self._document_has_no_text():


Unnamed: 0,Trans. Time,Value Date,Description,Debit/Credit(N),Balance(N),Channel,Transaction Reference
0,Trans. Time,Value Date,Description,Debit/Credit(₦),Balance(₦),Channel,Transaction Reference
1,2025 Jan 02 12:30:18,02 Jan 2025,OPay Card Payment,-7927.50,242859.33,E-Channel,250102330100645707036016
2,2025 Jan 02 19:34:,02 Jan 2025,OPay Card Payment,-8000.00,234859.33,POS,250102330100651099721981
3,57,,,,,,
4,2025 Jan 04 07:55:,04 Jan 2025,SMS Subscription,-60.00,234799.33,E-Channel,250104140200667984787233
5,18,,,,,,
6,2025 Jan 05 10:28:,05 Jan 2025,OPay Card Payment,-4515.00,230284.33,E-Channel,250105330100682686169895
7,01,,,,,,
8,2025 Jan 05 19:01:13,05 Jan 2025,OPay Card Payment,-33319.92,196964.41,POS,250105330100688931690018
9,2025 Jan 06 07:51:,06 Jan 2025,Transfer to jude nwori,-3800.00,193164.41,E-Channel,250106010100692793802279



Total completed transactions: 452


Unnamed: 0,Trans. Time,Value Date,Description,Debit/Credit(N),Balance(N),Channel,Transaction Reference
0,Trans. Time,Value Date,Description,Debit/Credit(₦),Balance(₦),Channel,Transaction Reference
1,2025 Jan 02 12:30:18,02 Jan 2025,OPay Card Payment,-7927.50,242859.33,E-Channel,250102330100645707036016
2,2025 Jan 02 19:34:57,02 Jan 2025,OPay Card Payment,-8000.00,234859.33,POS,250102330100651099721981
3,2025 Jan 04 07:55:18,04 Jan 2025,SMS Subscription,-60.00,234799.33,E-Channel,250104140200667984787233
4,2025 Jan 05 10:28:01,05 Jan 2025,OPay Card Payment,-4515.00,230284.33,E-Channel,250105330100682686169895
5,2025 Jan 05 19:01:13,05 Jan 2025,OPay Card Payment,-33319.92,196964.41,POS,250105330100688931690018
6,2025 Jan 06 07:51:39,06 Jan 2025,Transfer to jude nwori,-3800.00,193164.41,E-Channel,250106010100692793802279
7,2025 Jan 06 13:03:13,06 Jan 2025,OPay Card Payment,-7428.25,185736.16,E-Channel,250106330100697012151221
8,2025 Jan 06 17:21:08,06 Jan 2025,Transfer from ADEOLU BAJOMO,+1000000.00,1185736.16,E-Channel,000012250106182106998609553918
9,2025 Jan 06 17:21:14,06 Jan 2025,Electronic Money Transfer Levy,-50.00,1185686.16,E-Channel,250106140200700331738613


Performing final data cleaning...
Extraction and cleaning complete.

--- Extracted Transactions ---
Total transactions extracted: 451

First 50 Transactions:


Unnamed: 0,Trans. Time,Value Date,Description,Debit/Credit(N),Balance(N),Channel,Transaction Reference
0,2025-01-02 12:30:18,2025-01-02 00:00:00,OPay Card Payment,-7927.5,242859.33,E-Channel,250102330100645707036016
1,2025-01-02 19:34:57,2025-01-02 00:00:00,OPay Card Payment,-8000.0,234859.33,POS,250102330100651099721981
2,2025-01-04 07:55:18,2025-01-04 00:00:00,SMS Subscription,-60.0,234799.33,E-Channel,250104140200667984787233
3,2025-01-05 10:28:01,2025-01-05 00:00:00,OPay Card Payment,-4515.0,230284.33,E-Channel,250105330100682686169895
4,2025-01-05 19:01:13,2025-01-05 00:00:00,OPay Card Payment,-33319.92,196964.41,POS,250105330100688931690018
5,2025-01-06 07:51:39,2025-01-06 00:00:00,Transfer to jude nwori,-3800.0,193164.41,E-Channel,250106010100692793802279
6,2025-01-06 13:03:13,2025-01-06 00:00:00,OPay Card Payment,-7428.25,185736.16,E-Channel,250106330100697012151221
7,2025-01-06 17:21:08,2025-01-06 00:00:00,Transfer from ADEOLU BAJOMO,1000000.0,1185736.16,E-Channel,000012250106182106998609553918
8,2025-01-06 17:21:14,2025-01-06 00:00:00,Electronic Money Transfer Levy,-50.0,1185686.16,E-Channel,250106140200700331738613
9,2025-01-06 20:05:30,2025-01-06 00:00:00,OPay Card Payment,-58052.0,1127634.16,POS,250106330100702983351015



Last 50 Transactions:


Unnamed: 0,Trans. Time,Value Date,Description,Debit/Credit(N),Balance(N),Channel,Transaction Reference
401,2025-05-18 18:37:18,2025-05-18 00:00:00,OPay Card Payment,-20000.0,357370.94,POS,250518330100775950787467
402,2025-05-18 20:44:58,2025-05-18 00:00:00,OPay Card Payment,-44800.0,312570.94,POS,250518330100778271647455
403,2025-05-19 09:32:07,2025-05-19 00:00:00,Airtime,-35000.0,277570.94,E-Channel,250519100100783879761034
404,2025-05-19 10:23:49,2025-05-19 00:00:00,Transfer to ZAINAB OMOTAYO JIMOH,-1750.0,275820.94,E-Channel,100004250519102353133029874198
405,2025-05-20 07:19:54,2025-05-20 00:00:00,Transfer from RUNSEWE OLADAPO OREOLUWA,1000000.0,1275820.94,E-Channel,000013250520081847000169406232
406,2025-05-20 07:19:55,2025-05-20 00:00:00,Electronic Money Transfer Levy,-50.0,1275770.94,E-Channel,250520140200799042664346
407,2025-05-20 11:46:03,2025-05-20 00:00:00,OPay Card Payment,-1350.0,1274420.94,POS,250520330100803305694300
408,2025-05-20 11:52:10,2025-05-20 00:00:00,OPay Card Payment,-20999.96,1253420.98,POS,250520330100802921175193
409,2025-05-20 18:53:41,2025-05-20 00:00:00,OPay Card Payment,-10212.5,1243208.48,E-Channel,250520330100810606214331
410,2025-05-22 17:02:05,2025-05-22 00:00:00,OPay Card Payment,-5435.0,1237773.48,POS,250522330100843071257712



Successfully saved cleaned data to '../output/opay_extracted_transactions15062025172417.csv'
