**Credit Card Statement Parser**

Build a PDF parser that extracts 5 key data points from credit card statements across 5 major credit card issuers.

This notebook uses pdfplumber, a library that can read and extract text, tables, and layout information from PDF files.

In [1]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pdfplumber
import re
import os

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are visited in order via pdfplumber. Each page whose
    ``extract_text()`` result is non-empty contributes that text followed by
    a newline; pages that yield nothing are skipped.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            content = page.extract_text()
            if content:
                chunks.append(content + "\n")
    return "".join(chunks)


class BaseParser:
    """Common base for the issuer-specific statement parsers.

    Keeps the issuer label in ``bank_name``; concrete parsers override
    ``parse``.
    """

    def __init__(self, bank_name):
        # Issuer label supplied by the concrete parser's constructor.
        self.bank_name = bank_name

    def parse(self, text):
        """Extract statement fields from ``text``; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        message = "Each bank parser must implement its own parse method."
        raise NotImplementedError(message)


class YesBankParser(BaseParser):
    """Parser for statements that ``identify_bank`` routes on "Yes Bank"."""

    def __init__(self):
        # The label matches the class name and the keyword used to select this
        # parser; it previously read "HDFC Bank", which mislabelled results.
        super().__init__("Yes Bank")

    def parse(self, text):
        """Extract the statement fields from ``text``.

        Args:
            text: Plain text of the statement.

        Returns:
            dict with the keys "Bank", "Card Last 4", "Billing Cycle",
            "Due Date", "Total Due" and "Name". A field whose pattern does
            not match is reported as "Not Found"; matched values have
            surrounding whitespace removed.
        """
        matches = {
            "Card Last 4": re.search(r"Card\s*No\.*\s*(\d{4})", text),
            "Billing Cycle": re.search(r"Statement Period\s*:\s*(.*)", text),
            "Due Date": re.search(r"Payment Due Date\s*:\s*(\d{2}/\d{2}/\d{4})", text),
            "Total Due": re.search(r"Total Amount Due\s*:?[\s₹]*([\d,]+\.\d{2})", text),
            "Name": re.search(r"Statement of\s+(.*)", text),
        }
        # The bank label is a plain string, not a match object, so it is kept
        # out of the post-processing below; calling .group() on it raised
        # AttributeError on every call.
        result = {"Bank": self.bank_name}
        for field, match in matches.items():
            result[field] = match.group(1).strip() if match else "Not Found"
        return result


class SaraswatParser(BaseParser):
    """Parser for statements that ``identify_bank`` routes on "Saraswat"."""

    def __init__(self):
        # The label matches the class name and the keyword used to select this
        # parser; it previously read "SBI Card", which mislabelled results.
        super().__init__("Saraswat Bank")

    def parse(self, text):
        """Extract the statement fields from ``text``.

        Args:
            text: Plain text of the statement.

        Returns:
            dict with the keys "Bank", "Card Last 4", "Billing Cycle",
            "Due Date", "Total Due" and "Name". A field whose pattern does
            not match is reported as "Not Found"; matched values have
            surrounding whitespace removed.
        """
        matches = {
            "Card Last 4": re.search(r"Card Number\s*:\s*\*+(\d{4})", text),
            "Billing Cycle": re.search(r"Statement Period\s*:\s*(.*)", text),
            "Due Date": re.search(r"Payment Due Date\s*:\s*(\d{2}-\w{3}-\d{4})", text),
            "Total Due": re.search(r"Total Amount Due\s*₹?\s*([\d,]+\.\d{2})", text),
            "Name": re.search(r"Statement of Account\s*-\s*(.*)", text),
        }
        # The bank label is a plain string, not a match object, so it is kept
        # out of the post-processing below; calling .group() on it raised
        # AttributeError on every call.
        result = {"Bank": self.bank_name}
        for field, match in matches.items():
            result[field] = match.group(1).strip() if match else "Not Found"
        return result


class CitiBankParser(BaseParser):
    """Parser for statements that ``identify_bank`` routes on "CitiBank"."""

    def __init__(self):
        # The label matches the class name and the keyword used to select this
        # parser; it previously read "Axis Bank", which mislabelled results.
        super().__init__("CitiBank")

    def parse(self, text):
        """Extract the statement fields from ``text``.

        Args:
            text: Plain text of the statement.

        Returns:
            dict with the keys "Bank", "Card Last 4", "Billing Cycle",
            "Due Date", "Total Due" and "Name". A field whose pattern does
            not match is reported as "Not Found"; matched values have
            surrounding whitespace removed.
        """
        matches = {
            "Card Last 4": re.search(r"XXXX\s*(\d{4})", text),
            # NOTE(review): "Billing Cycle" is taken from the "Statement Date"
            # line here - confirm that is the intended source field.
            "Billing Cycle": re.search(r"Statement Date\s*:\s*(.*)", text),
            "Due Date": re.search(r"Due Date\s*:\s*(\d{2}/\d{2}/\d{4})", text),
            "Total Due": re.search(r"Total Amount Due\s*₹?\s*([\d,]+\.\d{2})", text),
            "Name": re.search(r"Cardholder\s*Name\s*:\s*(.*)", text),
        }
        # The bank label is a plain string, not a match object, so it is kept
        # out of the post-processing below; calling .group() on it raised
        # AttributeError on every call.
        result = {"Bank": self.bank_name}
        for field, match in matches.items():
            result[field] = match.group(1).strip() if match else "Not Found"
        return result


class ICICIParser(BaseParser):
    """Parser for statements that ``identify_bank`` routes on "ICICI"."""

    def __init__(self):
        super().__init__("ICICI Bank")

    def parse(self, text):
        """Extract the statement fields from ``text``.

        Args:
            text: Plain text of the statement.

        Returns:
            dict with the keys "Bank", "Card Last 4", "Billing Cycle",
            "Due Date", "Total Due" and "Name". A field whose pattern does
            not match is reported as "Not Found"; matched values have
            surrounding whitespace removed.
        """
        matches = {
            "Card Last 4": re.search(r"Card Ending\s*:\s*(\d{4})", text),
            "Billing Cycle": re.search(r"Billing Period\s*:\s*(.*)", text),
            "Due Date": re.search(r"Payment Due Date\s*:\s*(\d{2}\s\w+\s\d{4})", text),
            "Total Due": re.search(r"Total Amount Due\s*₹?\s*([\d,]+\.\d{2})", text),
            "Name": re.search(r"Statement for\s*(.*)", text),
        }
        # The bank label is a plain string, not a match object, so it is kept
        # out of the post-processing below; calling .group() on it raised
        # AttributeError on every call.
        result = {"Bank": self.bank_name}
        for field, match in matches.items():
            result[field] = match.group(1).strip() if match else "Not Found"
        return result


class HDFCParser(BaseParser):
    """Parser for American Express statements.

    NOTE(review): despite the class name, ``identify_bank`` returns this
    parser for text containing "American Express" or "AMEX", and results are
    labelled "American Express". The class name is left unchanged because
    ``identify_bank`` refers to it.
    """

    def __init__(self):
        super().__init__("American Express")

    def parse(self, text):
        """Extract the statement fields from ``text``.

        Args:
            text: Plain text of the statement.

        Returns:
            dict with the keys "Bank", "Card Last 4", "Billing Cycle",
            "Due Date", "Total Due" and "Name". A field whose pattern does
            not match is reported as "Not Found"; matched values have
            surrounding whitespace removed.
        """
        matches = {
            "Card Last 4": re.search(r"Card ending\s*(\d{4})", text),
            "Billing Cycle": re.search(r"Statement Period\s*:\s*(.*)", text),
            "Due Date": re.search(r"Payment Due\s*:\s*(\d{2}\s\w+\s\d{4})", text),
            "Total Due": re.search(r"Total Due\s*₹?\s*([\d,]+\.\d{2})", text),
            "Name": re.search(r"Account Summary for\s*(.*)", text),
        }
        # The bank label is a plain string, not a match object, so it is kept
        # out of the post-processing below; calling .group() on it raised
        # AttributeError on every call.
        result = {"Bank": self.bank_name}
        for field, match in matches.items():
            result[field] = match.group(1).strip() if match else "Not Found"
        return result



def identify_bank(text):
    """Pick the parser for a statement by looking for an issuer keyword.

    The keywords are checked in a fixed order ("Yes Bank", "Saraswat",
    "CitiBank", "ICICI", then "American Express"/"AMEX") with case-sensitive
    substring tests; the first hit decides.

    Returns:
        A new parser instance for the first keyword found in ``text``, or
        ``None`` when no keyword is present.
    """
    if "Yes Bank" in text:
        return YesBankParser()
    if "Saraswat" in text:
        return SaraswatParser()
    if "CitiBank" in text:
        return CitiBankParser()
    if "ICICI" in text:
        return ICICIParser()
    if any(marker in text for marker in ("American Express", "AMEX")):
        return HDFCParser()
    return None


def parse_statement(pdf_path):
    """Read the PDF at ``pdf_path``, parse it, and print the extracted fields.

    The text comes from ``extract_text_from_pdf`` and the parser from
    ``identify_bank``. When no parser is returned, a notice is printed
    instead. Nothing is returned in either case.
    """
    statement_text = extract_text_from_pdf(pdf_path)
    bank_parser = identify_bank(statement_text)

    if not bank_parser:
        print(" Could not identify bank type. Please check your PDF.")
        return

    # Parse before printing the header so a parsing failure leaves no
    # partial output behind.
    fields = bank_parser.parse(statement_text)
    print("\nExtracted Data:")
    for label, value in fields.items():
        print(f"{label}: {value}")

if __name__ == "__main__":
    pdf_file = "American_Express_sample_statement_7890"

    # The sample name carries no extension, so also accept the same name with
    # ".pdf" appended; either file on disk will be parsed.
    candidates = [pdf_file]
    if not os.path.splitext(pdf_file)[1]:
        candidates.append(pdf_file + ".pdf")

    # isfile (rather than exists) keeps a directory of the same name from
    # being handed to the PDF reader.
    found = next((path for path in candidates if os.path.isfile(path)), None)

    if found:
        parse_statement(found)
    else:
        print(f"Please place your credit card statement as '{pdf_file}' in this folder.")


Please place your credit card statement as 'American_Express_sample_statement_7890' in this folder.
