<a href="https://colab.research.google.com/github/zaranasanghavi/AnalyticsDashboard/blob/main/SURE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install PyMuPDF pdfplumber

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import re
import fitz
import pdfplumber

def clean_text_generic(value):
    """Strip long digit runs, slash-dates and colons from a captured string.

    Args:
        value: Text captured by a regex group, or a falsy value
            (``None`` / ``""``) when nothing was captured.

    Returns:
        ``None`` for falsy input; otherwise the text with the substitutions
        below applied in order and surrounding whitespace trimmed. The result
        can be an empty string when everything was removed.
    """
    if not value:
        return None

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'\d{6,}', ''),                    # runs of six or more digits
        (r'\d{1,2}/\d{1,2}/\d{2,4}', ''),   # d/m/y style dates
        (r'[:]+', ' '),                     # colons become a single space
        (r'\s+', ' '),                      # collapse whitespace runs
    )
    text = value
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def clean_text_hdfc(value):
    """Clean a captured HDFC field by dropping statement labels and noise.

    Args:
        value: Text captured by a regex group, or a falsy value
            (``None`` / ``""``) when nothing was captured.

    Returns:
        ``None`` for falsy input; otherwise the text with known statement
        labels, slash-dates, digit runs of six or more and colons removed,
        whitespace collapsed and the ends trimmed.
    """
    if not value:
        return None

    # Header labels that can bleed into a capture; matched case-insensitively
    # as whole words/phrases.
    label_pattern = r'\b(AAN|Statement Date|Total Dues|Minimum Amount Due|Payment Due Date|Statement for HDFC Bank Credit Card)\b'
    text = re.sub(label_pattern, '', value, flags=re.IGNORECASE)

    # Remaining clean-up steps, applied in this order.
    for pattern, replacement in (
        (r'\d{1,2}/\d{1,2}/\d{2,4}', ''),
        (r'\d{6,}', ''),
        (r'[:]+', ' '),
        (r'\s+', ' '),
    ):
        text = re.sub(pattern, replacement, text)
    return text.strip()



def extract_bob(pdf_path):
    """Extract summary fields from a BOB credit card statement PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        dict with the keys ``"Statement Date"``, ``"Due Date"``,
        ``"Card Number"``, ``"Billing Cycle"`` and ``"Credit Limit"``.
        Each value is the first match found in the document text, or
        ``None`` when nothing matched. The credit limit has its thousands
        separators removed.
    """
    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") for page in doc)

    data = {}

    labelled_statement_dates = re.findall(r"Statement Date:\s*(\d{2}/\d{2}/\d{4})", text)
    # Positional fallback: only when the label is missing, take every
    # dd/mm/yyyy date in reading order.
    unlabelled_dates = (
        []
        if labelled_statement_dates
        else re.findall(r"\b\d{2}/\d{2}/\d{4}\b", text)
    )
    statement_dates = labelled_statement_dates or unlabelled_dates
    data["Statement Date"] = statement_dates[0] if statement_dates else None

    due_dates = re.findall(r"Due Date:\s*(\d{2}/\d{2}/\d{4})", text)
    # The "second date is the due date" guess is only valid for the unlabelled
    # list. Applying it to labelled statement dates would report a repeated
    # statement date (e.g. one per page header) as the due date.
    if not due_dates and len(unlabelled_dates) > 1:
        due_dates = [unlabelled_dates[1]]
    data["Due Date"] = due_dates[0] if due_dates else None

    # Masked card number: X's followed by asterisks, or X's alone, then the
    # last four digits.
    card_numbers = re.findall(r"X{4,}\*+\d{4}|X{6,}\d{4}", text)
    data["Card Number"] = card_numbers[0] if card_numbers else None

    # findall returns (ordinal, suffix) tuples here; keep the full ordinal.
    billing_cycle = re.findall(r"statement date is (\d{1,2}(st|nd|rd|th)) of every month", text, flags=re.IGNORECASE)
    data["Billing Cycle"] = billing_cycle[0][0] if billing_cycle else None

    credit_limit = re.findall(r"Credit Limit\s*₹\s*([\d,]+\.\d{2}|[\d,]+)", text)
    data["Credit Limit"] = credit_limit[0].replace(",", "") if credit_limit else None

    return data


def extract_indusind(pdf_path):
    """Extract summary fields from an IndusInd credit card statement PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        dict with one entry per pattern below. Each value is the first
        capture group of a case-insensitive search over the whole document,
        passed through ``clean_text_generic``; it is ``None`` when the
        pattern did not match.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Pages without extractable text count as empty; every page is
        # terminated with a newline.
        page_texts = [(page.extract_text() or '') + '\n' for page in pdf.pages]
    full_text = ''.join(page_texts)

    field_patterns = {
        # Only the "MR" salutation is recognised by this pattern.
        "Account Holder Name": r"Purchases & Cash Transactions for\s+(MR [A-Z\s]+)\s*\(Credit Card No",
        "Card Number": r"\(Credit Card No\.? (\d{4}X{4,}X{4,}\d{4,})",
        "Purchases & Other Charges": r"Purchases & Other Charges\s*₹?\s*([\d,]+\.\d{2})",
        "Minimum Amount Due": r"Minimum Amount Due\s*₹?\s*([\d,]+\.\d{2})",
        # First variant keyword appearing anywhere in the text.
        "Card Variant": r"(Platinum|Gold|Titanium|Classic|Signature|World|Select|RuPay|Visa|MasterCard)"
    }

    def capture(regex):
        """Return group 1 of the first case-insensitive hit, else None."""
        found = re.search(regex, full_text, re.IGNORECASE)
        if found is None:
            return None
        return found.group(1)

    data = {}
    for field, regex in field_patterns.items():
        data[field] = clean_text_generic(capture(regex))
    return data


def extract_hdfc(pdf_path):
    """Extract summary fields from an HDFC credit card statement PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        dict with one entry per pattern below. Each value is the first
        capture group of a case-insensitive, dot-matches-newline search over
        the whole document, passed through ``clean_text_hdfc``; it is
        ``None`` when the pattern did not match.
    """
    data = {}
    with pdfplumber.open(pdf_path) as pdf:
        # Pages without extractable text count as empty strings.
        text = '\n'.join((page.extract_text() or '') for page in pdf.pages)

    patterns = {
        "Account Holder Name": r"Name\s*:\s*([A-Z][A-Z ]+)",
        "Email": r"Email\s*:\s*([\w\.-]+@[\w\.-]+)",
        # DOTALL lets the address span several lines up to the GST line.
        "Address": r"Address\s*:\s*(.*?)\nGST",
        # Only spaces, tabs and hyphens may separate the digit groups. The
        # previous class used \s, which also matches newlines and let the
        # capture run on into digits at the start of the following line.
        "Card Number": r"Card No\s*:\s*([0-9Xx][0-9Xx \t-]*)",
        "Available Cash Limit": r"Available Cash Limit\s*\n([\d,]+)",
        # First variant keyword appearing anywhere in the text.
        "Card Variant": r"(Visa|MasterCard|Platinum|Gold|Signature|Regalia|Business|Titanium|World|Select)",
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        value = match.group(1) if match else None
        data[key] = clean_text_hdfc(value)

    return data


def extract_sbi(pdf_path):
    """Extract summary fields from an SBI credit card statement PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        dict with one entry per pattern below. Each value is the stripped
        first capture group of a case-sensitive search over the whole
        document, or ``None`` when the pattern did not match.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text may yield None; treat it as empty
        # so the join cannot raise TypeError.
        full_text = '\n'.join((page.extract_text() or '') for page in pdf.pages)

    # The "( ` )" in the amount labels is matched literally, as it appears in
    # the extracted text.
    # NOTE(review): with DOTALL, the lazy ".*?" in the limit patterns takes the
    # first amount found after the label, even on a later line. The recorded
    # run shows identical values for "Available Credit Limit" and
    # "Available Cash Limit", which may be this effect. Confirm against the
    # statement layout.
    patterns = {
        "Customer Name": r"([A-Z\s]+)\s+Credit Card Number",
        "Card Number": r"Credit Card Number\s+(XXXX\s+XXXX\s+XXXX\s+\w{2}\d{2})",
        "Total Amount Due": r"Total Amount Due\s*\( ` \)\s*([\d,]+\.\d{2})",
        "Credit Limit": r"Credit Limit\s*\( ` \).*?([\d,]+\.\d{2})",
        "Available Credit Limit": r"Available Credit Limit\s*\( ` \).*?([\d,]+\.\d{2})",
        "Available Cash Limit": r"Available Cash Limit\s*\( ` \).*?([\d,]+\.\d{2})",
    }

    data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, full_text, re.MULTILINE | re.DOTALL)
        data[key] = match.group(1).strip() if match else None

    return data


def extract_icici(pdf_path):
    """Extract summary fields from an ICICI credit card statement PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        dict with the keys ``"Statement Date"``, ``"Payment Due Date"``,
        ``"Card Variant"``, ``"Card Number"``, ``"Invoice No"`` and
        ``"State Code"``. Each value is the first capture group of the
        corresponding search, or ``None`` when nothing matched. The two
        dates are stripped of surrounding whitespace.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Join pages with a newline so the last line of one page cannot fuse
        # with the first line of the next and pollute a capture.
        text = '\n'.join((page.extract_text() or '') for page in pdf.pages)

    def first_group(pattern, flags=0):
        """Run one search over the document; return group 1 or None."""
        found = re.search(pattern, text, flags)
        return found.group(1) if found else None

    statement_date = first_group(r'STATEMENT DATE[:\s]*([A-Za-z0-9 ,]+)', re.IGNORECASE)
    payment_due_date = first_group(r'PAYMENT DUE DATE[:\s]*([A-Za-z0-9 ,]+)', re.IGNORECASE)

    data = {}
    data["Statement Date"] = statement_date.strip() if statement_date is not None else None
    data["Payment Due Date"] = payment_due_date.strip() if payment_due_date is not None else None
    # First variant keyword appearing anywhere in the text, case-insensitive.
    data["Card Variant"] = first_group(r'(Visa|MasterCard|Platinum|Gold|Signature|Titanium|Infinite|World|Select)', re.IGNORECASE)
    # Masked number: four digits, six to eight X's, then two to four digits.
    data["Card Number"] = first_group(r'(\d{4}X{6,8}\d{2,4})')
    data["Invoice No"] = first_group(r'Invoice No:\s*₹?([\d,]+\.\d{2}|\d+)')
    data["State Code"] = first_group(r'State Code:\s*₹?([\d,]+\.\d{2}|\d+)')

    return data


def extract_credit_card_data(pdf_path, bank_name):
    """Run the extractor that belongs to ``bank_name`` on ``pdf_path``.

    Args:
        pdf_path: Path of the statement PDF, forwarded to the extractor.
        bank_name: One of ``"bob"``, ``"indusind"``, ``"hdfc"``, ``"sbi"``
            or ``"icici"``. Matching ignores case and surrounding
            whitespace.

    Returns:
        The dict produced by the selected extractor, or
        ``{"Error": "Bank not supported"}`` for any other name.
    """
    extractors = {
        "bob": extract_bob,
        "indusind": extract_indusind,
        "hdfc": extract_hdfc,
        "sbi": extract_sbi,
        "icici": extract_icici,
    }

    # Typed input often carries stray spaces or a newline; strip them so
    # "hdfc " is not reported as unsupported.
    extractor = extractors.get(bank_name.strip().lower())
    if extractor is None:
        return {"Error": "Bank not supported"}
    return extractor(pdf_path)


def print_extracted_data(data, bank_name):
    """Print the extracted fields as an aligned two-column listing.

    Args:
        data: Mapping of field name to extracted value. Falsy values
            (``None``, ``""``) are shown as ``(Not Found)``.
        bank_name: Bank identifier; printed upper-cased in the banner.
    """
    rule = "-" * 40

    print(f"\n{bank_name.upper()} Credit Card Extracted Data")
    print(rule)
    for field, value in data.items():
        shown = value or '(Not Found)'
        # Field names are padded to 25 columns so the colons line up.
        print(f"{field:25}: {shown}")
    print(rule)


if __name__ == "__main__":
    # input() keeps any spaces typed around the name; strip them so neither
    # the "<bank>.pdf" path nor the bank lookup is thrown off.
    bank = input("Enter the name of the bank(bob, indusind, hdfc, sbi, icici): ").strip()
    pdf_file_path = f"{bank}.pdf"  # The PDF must be named after the bank, e.g. 'bob.pdf'

    extracted_data = extract_credit_card_data(pdf_file_path, bank)
    print_extracted_data(extracted_data, bank)


Enter the name of the bank(bob, indusind, hdfc, sbi, icici): bob

BOB Credit Card Extracted Data
----------------------------------------
Statement Date           : 01/10/2025
Due Date                 : 20/10/2025
Card Number              : XXXXXX********0534
Billing Cycle            : 1st
Credit Limit             : 10000
----------------------------------------


In [5]:
# input() keeps any spaces typed around the name; strip them so neither the
# "<bank>.pdf" path nor the bank lookup is thrown off.
bank = input("Enter the name of the bank (bob, indusind, hdfc, sbi, icici): ").strip()
pdf_file_path = f"{bank}.pdf"  # Ensure your PDF file name matches, e.g., 'hdfc.pdf'

extracted_data = extract_credit_card_data(pdf_file_path, bank)
print_extracted_data(extracted_data, bank)

Enter the name of the bank (bob, indusind, hdfc, sbi, icici): indusind

INDUSIND Credit Card Extracted Data
----------------------------------------
Account Holder Name      : MR NARESH SANGHVI
Card Number              : 3561XXXXXXXX9396
Purchases & Other Charges: 4,775.38
Minimum Amount Due       : 100.00
Card Variant             : PLATINUM
----------------------------------------


In [6]:
# input() keeps any spaces typed around the name; strip them so neither the
# "<bank>.pdf" path nor the bank lookup is thrown off.
bank = input("Enter the name of the bank (bob, indusind, hdfc, sbi, icici): ").strip()
pdf_file_path = f"{bank}.pdf"  # Ensure your PDF file name matches, e.g., 'hdfc.pdf'

extracted_data = extract_credit_card_data(pdf_file_path, bank)
print_extracted_data(extracted_data, bank)

Enter the name of the bank (bob, indusind, hdfc, sbi, icici): hdfc

HDFC Credit Card Extracted Data
----------------------------------------
Account Holder Name      : NARESH BHKHALAL SANGHAVI
Email                    : H.P.EXPORT501@GMAIL.COM
Address                  : ROOM NO E/218 WALCHAND COMPLEX 90 FEET ROAD BHAYANDER WEST THANE- MHS
Card Number              : 4572 62XX XXXX 6397 0
Available Cash Limit     : 5,92,000
Card Variant             : Visa
----------------------------------------


In [7]:
# input() keeps any spaces typed around the name; strip them so neither the
# "<bank>.pdf" path nor the bank lookup is thrown off.
bank = input("Enter the name of the bank (bob, indusind, hdfc, sbi, icici): ").strip()
pdf_file_path = f"{bank}.pdf"  # Ensure your PDF file name matches, e.g., 'hdfc.pdf'

extracted_data = extract_credit_card_data(pdf_file_path, bank)
print_extracted_data(extracted_data, bank)

Enter the name of the bank (bob, indusind, hdfc, sbi, icici): sbi

SBI Credit Card Extracted Data
----------------------------------------
Customer Name            : NARESH SANGHVI
Card Number              : XXXX XXXX XXXX XX92
Total Amount Due         : 589.00
Credit Limit             : 3,00,000.00
Available Credit Limit   : 2,99,411.45
Available Cash Limit     : 2,99,411.45
----------------------------------------


In [8]:
# input() keeps any spaces typed around the name; strip them so neither the
# "<bank>.pdf" path nor the bank lookup is thrown off.
bank = input("Enter the name of the bank (bob, indusind, hdfc, sbi, icici): ").strip()
pdf_file_path = f"{bank}.pdf"  # Ensure your PDF file name matches, e.g., 'hdfc.pdf'

extracted_data = extract_credit_card_data(pdf_file_path, bank)
print_extracted_data(extracted_data, bank)

Enter the name of the bank (bob, indusind, hdfc, sbi, icici): icici

ICICI Credit Card Extracted Data
----------------------------------------
Statement Date           : January 18, 2023
Payment Due Date         : February 5, 2023
Card Variant             : Visa
Card Number              : 4501XXXXXXXX2004
Invoice No               : 1574180100773147
State Code               : 27
----------------------------------------
