<a href="https://colab.research.google.com/github/vishdevs/receipt-agent/blob/main/Receipt_Agent_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup (shell escapes): refresh the apt index quietly.
!apt-get update -qq
# Install the tesseract-ocr system package; all output is discarded.
!apt-get install -y tesseract-ocr > /dev/null 2>&1
# Python packages used by the notebook (versions are not pinned).
# NOTE(review): PyMuPDF is installed here but is not imported by the import cell below.
!pip install pytesseract opencv-python-headless pillow pandas matplotlib PyMuPDF --quiet

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import io, os, re, datetime
from PIL import Image
import pytesseract
import cv2
import numpy as np
import pandas as pd
from google.colab import files
import matplotlib.pyplot as plt

# Working directory for the notebook; created up front, and exist_ok=True
# makes re-running this cell harmless.
# NOTE(review): WORK_DIR is not referenced by any later cell visible here
# (the CSV export writes to the current directory) — confirm whether it is still needed.
WORK_DIR = "/content/receipt_agent"
os.makedirs(WORK_DIR, exist_ok=True)

def preprocess(img_bytes):
    """Decode an encoded image and binarise it for OCR.

    Pipeline: decode with PIL as RGB -> grayscale -> 3x3 median blur ->
    Gaussian adaptive threshold (block size 11, constant 2, max value 255).

    Args:
        img_bytes: Raw bytes of an image file in any format PIL can open.

    Returns:
        The single-channel array produced by ``cv2.adaptiveThreshold`` with
        ``THRESH_BINARY`` (pixels are either 0 or 255).
    """
    img = Image.open(io.BytesIO(img_bytes)).convert('RGB')
    # PIL yields RGB channel order, so convert straight from RGB; this gives
    # the same result as reversing the channels and using COLOR_BGR2GRAY,
    # without creating a negatively-strided view.
    gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
    # Small median filter removes salt-and-pepper noise before thresholding.
    blur = cv2.medianBlur(gray, 3)
    # The module is imported as `cv2`; the previous `cv.` prefix raised NameError.
    th = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    return th

def ocr_text_from_bytes(img_bytes):
    """Return the English OCR text recognised in an encoded image.

    The bytes are first run through ``preprocess``; the resulting array is
    wrapped in a PIL image and handed to ``pytesseract.image_to_string``.
    """
    binarised = preprocess(img_bytes)
    return pytesseract.image_to_string(Image.fromarray(binarised), lang='eng')

In [3]:
# Ask the user to upload receipt images, OCR each uploaded file and collect
# one record (filename + raw OCR text) per file in `records`.
print("Upload receipt images")
uploaded = files.upload()
# Rebuilt from scratch on every run, so re-executing the cell does not
# accumulate duplicate entries.
records=[]
for fn,content in uploaded.items():
    text = ocr_text_from_bytes(content)
    records.append({"filename":fn, "raw_text":text})
    # Show a short preview (first 300 characters) of what OCR produced.
    print("\n---", fn, "---")
    print(text[:300])

Upload receipt images


KeyboardInterrupt: 

In [None]:
def extract_amount(text):
    """Extract a monetary amount from OCR text as a plain numeric string.

    Search order:
      1. the first number following a rupee sign (``₹``);
      2. the first number following ``Rs`` / ``Rs.`` (case-insensitive),
         where ``Rs`` must not be the tail of a longer word;
      3. the last number in the text that has a decimal part.

    Args:
        text: Raw OCR text of a receipt.

    Returns:
        The amount with thousands separators removed (e.g. ``"1234.50"``),
        or ``""`` when nothing suitable is found.
    """
    # An amount must start with a digit, so a stray comma after the currency
    # marker cannot produce an empty "match" that blocks the later fallbacks.
    number = r'(\d[\d,]*(?:\.\d{1,2})?)'
    currency_patterns = (
        r'₹\s*' + number,
        # The lookbehind stops words such as "Hours 10" or "Orders 12" from
        # being read as "Rs 10" / "Rs 12".
        r'(?<![A-Za-z])Rs\.?\s*' + number,
    )
    for pattern in currency_patterns:
        m = re.search(pattern, text, re.IGNORECASE)
        if m:
            return m.group(1).replace(',', '')

    # No currency marker: fall back to the last decimal number in the text.
    decimals = re.findall(r'(\d[\d,]*\.\d{1,2})', text)
    if decimals:
        return decimals[-1].replace(',', '')
    return ""

def extract_date(text):
    """Return the first date-like token found in OCR text.

    Recognised shapes (separator ``/`` or ``-``):
      * year first:  ``YYYY-M-D`` / ``YYYY/MM/DD``
      * day or month first: ``D-M-YY`` ... ``DD/MM/YYYY``

    The token is returned exactly as written; no normalisation is applied.

    Args:
        text: Raw OCR text of a receipt.

    Returns:
        The matched date string, or ``""`` if none is found.
    """
    # The digit lookarounds keep the match from starting or ending inside a
    # longer digit run; without them "2023-05-12" was returned as "23-05-12".
    pattern = (
        r'(?<!\d)('
        r'\d{4}[/\-]\d{1,2}[/\-]\d{1,2}'      # year-first (ISO-like)
        r'|\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}'   # day/month-first
        r')(?!\d)'
    )
    m = re.search(pattern, text)
    return m.group(1) if m else ""

def extract_vendor(text):
    """Return the first non-blank line of the text, stripped, or "" if there is none."""
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if candidate:
            return candidate
    return ""

# Enrich every OCR record in place with the fields parsed from its raw text.
for r in records:
    t = r["raw_text"]
    r.update(
        vendor=extract_vendor(t),
        date=extract_date(t),
        amount=extract_amount(t),
    )

# Last expression of the cell: show the records as a table.
pd.DataFrame(records)

In [None]:
# Category name -> lowercase keywords that indicate it. Dict order is the
# match priority: the first category with a matching keyword wins.
CATEGORIES={
    "Food":["restaurant","hotel","cafe"],
    "Groceries":["grocery","kirana","supermarket"],
    "Bills":["bill","utility","gst"],
    "Transport":["uber","ola","fuel","petrol"],
}

def categorize(text):
    """Assign a spending category to receipt text by keyword lookup.

    The text is lower-cased and checked against ``CATEGORIES`` in definition
    order. A keyword matches only at the start of a word, so plural and
    suffixed forms still match ("hotels", "cafeteria", "gstin") while
    keywords buried inside unrelated words do not ("ola" in "cola" or
    "chocolate", "uber" in "Schubert").

    Args:
        text: Raw OCR text of a receipt.

    Returns:
        The first matching category name, or ``"Other"`` when none matches.
    """
    t = text.lower()
    for cat, keys in CATEGORIES.items():
        for k in keys:
            # \b anchors the keyword to a word start; re.escape keeps any
            # future keyword containing regex metacharacters literal.
            if re.search(r'\b' + re.escape(k), t):
                return cat
    return "Other"

# Tag every record in place with the category inferred from its OCR text.
for r in records:
    r.update(category=categorize(r["raw_text"]))

# Last expression of the cell: show the updated records as a table.
pd.DataFrame(records)

In [None]:
# Snapshot the enriched records (one dict per uploaded file) as a DataFrame
# for export and plotting.
df=pd.DataFrame(records)

def to_float(x):
    """Convert an extracted amount to ``float``.

    The value is stringified and stripped of thousands separators before
    parsing.

    Args:
        x: Amount as produced by the extraction step (typically a string,
            possibly empty).

    Returns:
        The parsed float, or ``None`` when the text is not a valid number.
    """
    try:
        return float(str(x).replace(",", ""))
    except ValueError:
        # Only a failed parse means "no amount". A bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit and hide real errors.
        return None

# Numeric copy of the extracted amount strings; values that cannot be parsed
# come back from to_float as None.
df["amount_val"]=df["amount"].apply(to_float)

# Write the table to expenses.csv in the current working directory (without
# the index column) and hand the file to the Colab download helper.
df.to_csv("expenses.csv", index=False)
files.download("expenses.csv")

In [None]:
# Bar chart of total spend per category. Uses an explicit figure/axes pair and
# labels everything so the figure is readable on its own.
fig, ax = plt.subplots()
df.groupby("category")["amount_val"].sum().plot.bar(ax=ax)
ax.set(title="Expenses by category", xlabel="Category", ylabel="Total amount")
plt.show()