In [1]:
# First run only: uncomment to install the Python packages this notebook imports.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE: pytesseract is only a wrapper; the Tesseract OCR engine must be installed separately.
# %pip install opencv-python pytesseract pillow numpy pandas

In [16]:
import re
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image

In [3]:
# Path to the receipt image to run OCR on (relative to the notebook's working directory).
# Every later cell reads the image through this variable.
image_path = "walmart-receipt.png"

### Load and view image

1. Using OpenCV

In [4]:
# Load image with OpenCV (a BGR pixel array, or None when the file cannot be read)
img = cv2.imread(image_path)
if img is None:
    # cv2.imread does not raise on a missing/unreadable file, so fail here with a
    # clear message instead of an obscure error in a later preprocessing cell.
    raise FileNotFoundError(f"Could not read image: {image_path!r}")

# Render inline as the cell output. cv2.imshow + cv2.waitKey(0) opens a native
# window and blocks the kernel until a key is pressed, which breaks "Run All"
# and does not work on headless kernels. OpenCV stores pixels as BGR while
# Pillow expects RGB, hence the colour conversion.
Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

2. Using PIL

In [5]:
# Open the same file with Pillow (Image is already imported in the imports cell
# at the top of the notebook, so it is not imported again here).
img1 = Image.open(image_path)

# A bare PIL image as the last expression is rendered inline by Jupyter;
# img1.show() would instead launch an external viewer on the kernel's machine.
img1

In [6]:
# Convert the BGR image to a single-channel grayscale image
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Render inline instead of cv2.imshow/waitKey, which blocks the kernel until a
# key is pressed in a native window.
Image.fromarray(gray)

In [7]:
# Apply adaptive thresholding to make the text stand out from the background
thresh = cv2.adaptiveThreshold(
    gray,
    255,                             # value given to pixels above the local threshold
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  # local threshold = Gaussian-weighted neighbourhood mean - C
    cv2.THRESH_BINARY,
    31,                              # blockSize: side of the (odd-sized) pixel neighbourhood
    2,                               # C: constant subtracted from the weighted mean
)

# Render inline instead of cv2.imshow/waitKey, which blocks the kernel until a
# key is pressed in a native window.
Image.fromarray(thresh)

In [8]:
# Optional: noise removal (morphological opening = erosion followed by dilation)
# NOTE(review): with a 1x1 kernel both erosion and dilation leave every pixel
# unchanged, so `processed` equals `thresh`. Use a larger kernel, e.g. (2, 2),
# if speckle removal is actually wanted -- and re-check the OCR output afterwards.
kernel = np.ones((1, 1), np.uint8)
processed = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

# Render inline instead of cv2.imshow/waitKey, which blocks the kernel until a
# key is pressed in a native window.
Image.fromarray(processed)

In [9]:
# Save the preprocessed image to disk so it can be inspected outside the notebook.
# The boolean returned by cv2.imwrite becomes the cell output (True = file written).
cv2.imwrite("processed_receipt.png", processed)

True

In [10]:
# Wrap the preprocessed NumPy array in a PIL Image; this is the object passed
# to pytesseract in the OCR cell.
pil_img = Image.fromarray(processed)

In [26]:
# OCR extraction: run Tesseract on the preprocessed image with its default settings
# and keep the recognised text as one string.
# Optional tuning, currently disabled (it would have to be passed via `config=`):
#   --oem 3 = default OCR engine mode, --psm 6 = assume a single uniform block of text
# custom_config = r'--oem 3 --psm 6'  # OEM 3 = default LSTM, PSM 6 = assume block of text
ocr_text = pytesseract.image_to_string(pil_img)

In [27]:
# Print the raw OCR text under a heading. An f-string is used because passing
# ocr_text as a second print() argument inserts a separator space, which
# indented the first OCR line in the output.
print(f"OCR Output\n============\n{ocr_text}")

OCR Output
 Walmart >,<.

Save money. Live better.

(330 ) 339 - 3991
MANAGER DIANA EARNEST
231 BLUEBELL DR SW
NEW PHILADELPHIA OH 44663
ST# 02115 OP# 009044 TE# 44 TRH 01301

PET TOY 004747571658 1.97%
FLOPPY PUPPY 004747514846 1.97%
SSSUPREME S 070060332153 4.97%
2.5 SQUEAK 084699803238 5.92 X
MUNCHY DMBEL 068113108796 3.77%
DOG TREAT 007119013654 2.92 %
PED PCH 1 002310011802 0.50 X
PED PCH 1 002310011802 0.50 X
COUPON 23100 052310037000 1.00-0
HNYMD SMORES 088491226837 F 3.98 0
FRENCH DRSNG 004132100655 F 1.98 0
3 ORANGES 001466835001 F 5.47 N
BABY CARROTS 003338366602 I 1.48 N
COLLARDS 000000004614KI 1.24 N
CALZONE 005208362080 F 2.50 0
MM RVW MNT 003399105848 19.77 X
STKOBRLPLABL 001558679414 1.97%
STKOBRLPLABL 001558679414 1.97%
STKO SUNFLWR 001558679410 0.97 X
STKO SUNFLWR 001558679410 0.97 X
STKO SUNFLWR 001558679410 0.97 X
STKO SUNFLWR 001558679410 0.97 X
BLING BEADS 076594060699 0.97 X
GREAT VALUE 007874203191 F 9.970
LIPTON 001200011224 F 4.48%
DRY DOG 002310011035 12.44 X


In [37]:
# Break the OCR text into individual receipt lines, trimming surrounding
# whitespace from the whole text first and then from every line
lines = list(map(str.strip, ocr_text.strip().split("\n")))
lines


['Walmart >,<.',
 '',
 'Save money. Live better.',
 '',
 '(330 ) 339 - 3991',
 'MANAGER DIANA EARNEST',
 '231 BLUEBELL DR SW',
 'NEW PHILADELPHIA OH 44663',
 'ST# 02115 OP# 009044 TE# 44 TRH 01301',
 '',
 'PET TOY 004747571658 1.97%',
 'FLOPPY PUPPY 004747514846 1.97%',
 'SSSUPREME S 070060332153 4.97%',
 '2.5 SQUEAK 084699803238 5.92 X',
 'MUNCHY DMBEL 068113108796 3.77%',
 'DOG TREAT 007119013654 2.92 %',
 'PED PCH 1 002310011802 0.50 X',
 'PED PCH 1 002310011802 0.50 X',
 'COUPON 23100 052310037000 1.00-0',
 'HNYMD SMORES 088491226837 F 3.98 0',
 'FRENCH DRSNG 004132100655 F 1.98 0',
 '3 ORANGES 001466835001 F 5.47 N',
 'BABY CARROTS 003338366602 I 1.48 N',
 'COLLARDS 000000004614KI 1.24 N',
 'CALZONE 005208362080 F 2.50 0',
 'MM RVW MNT 003399105848 19.77 X',
 'STKOBRLPLABL 001558679414 1.97%',
 'STKOBRLPLABL 001558679414 1.97%',
 'STKO SUNFLWR 001558679410 0.97 X',
 'STKO SUNFLWR 001558679410 0.97 X',
 'STKO SUNFLWR 001558679410 0.97 X',
 'STKO SUNFLWR 001558679410 0.97 X',
 'BLIN

In [35]:
# Break the OCR text into trimmed lines
lines = list(map(str.strip, ocr_text.strip().split("\n")))

# Exploratory pass: show the whitespace-separated tokens of each line to see
# how the description / product code / price columns come apart
data = []  # stays empty in this cell; nothing is appended here
for line in lines:
    parts = line.split()
    print(parts)

['Walmart', '>,<.']
[]
['Save', 'money.', 'Live', 'better.']
[]
['(330', ')', '339', '-', '3991']
['MANAGER', 'DIANA', 'EARNEST']
['231', 'BLUEBELL', 'DR', 'SW']
['NEW', 'PHILADELPHIA', 'OH', '44663']
['ST#', '02115', 'OP#', '009044', 'TE#', '44', 'TRH', '01301']
[]
['PET', 'TOY', '004747571658', '1.97%']
['FLOPPY', 'PUPPY', '004747514846', '1.97%']
['SSSUPREME', 'S', '070060332153', '4.97%']
['2.5', 'SQUEAK', '084699803238', '5.92', 'X']
['MUNCHY', 'DMBEL', '068113108796', '3.77%']
['DOG', 'TREAT', '007119013654', '2.92', '%']
['PED', 'PCH', '1', '002310011802', '0.50', 'X']
['PED', 'PCH', '1', '002310011802', '0.50', 'X']
['COUPON', '23100', '052310037000', '1.00-0']
['HNYMD', 'SMORES', '088491226837', 'F', '3.98', '0']
['FRENCH', 'DRSNG', '004132100655', 'F', '1.98', '0']
['3', 'ORANGES', '001466835001', 'F', '5.47', 'N']
['BABY', 'CARROTS', '003338366602', 'I', '1.48', 'N']
['COLLARDS', '000000004614KI', '1.24', 'N']
['CALZONE', '005208362080', 'F', '2.50', '0']
['MM', 'RVW', 'MNT'

In [None]:
# Shape of one item line in the OCR text, for example:
#   "PET TOY 004747571658 1.97%"
#   "HNYMD SMORES 088491226837 F 3.98 0"
#   "COLLARDS 000000004614KI 1.24 N"
#   "COUPON 23100 052310037000 1.00-0"
# Anything after the price (tax flag, stray OCR characters) is ignored.
ITEM_LINE_RE = re.compile(
    r"""
    ^(?P<description>.+?)\s+   # item name; may itself contain numbers
    (?P<code>\d{12})           # product code: 12 digits in the lines seen so far
    \s*[A-Z]{0,2}\s+           # optional short letter flag, sometimes glued to the code
    (?P<price>\d+\.\d{2})      # amount with two decimals
    \s*(?P<minus>-)?           # trailing minus sign, seen on the COUPON line
    """,
    re.VERBOSE,
)


def parse_receipt_line(line):
    """Parse one OCR line into ``[description, product_code, price]``.

    Args:
        line: A single line of OCR text.

    Returns:
        A three-element list: description (str), product code (str, so that
        leading zeros are kept) and price (float). Returns ``None`` when the
        line does not look like an item line (store header, blank line, ...).
    """
    match = ITEM_LINE_RE.match(line.strip())
    if match is None:
        return None

    price = float(match.group("price"))
    if match.group("minus"):
        # NOTE(review): a trailing "-" is read as a deduction (it appears on the
        # COUPON line) and stored as a negative amount -- confirm this is wanted.
        price = -price

    return [match.group("description"), match.group("code"), price]


# Split into lines
lines = [line.strip() for line in ocr_text.strip().split("\n")]

# Parse each line into structured data, dropping lines that are not items.
# (Indexing the whitespace tokens from the end, as before, raised IndexError on
# blank lines and picked the product code as the price for "1.97%"-style lines.)
data = [row for row in map(parse_receipt_line, lines) if row is not None]

# Create dataframe
df = pd.DataFrame(data, columns=["Description", "Product Code", "Price"])
df

['al,',
 'alMaft ¢,s.',
 'Save money. Live better.',
 '(330 ) 339 - 3991',
 'MANAGER DIANA EARNEST',
 '231 BLUEBELL DR SW',
 'NEW PHILADELPHIA OH 44663',
 'ST# 02115 OP# 009044 TE# 44 TRH 01301',
 'PET TOY 004747571658 1.97%',
 'FLOPPY PUPPY 004747514846 1.97%',
 'SSSUPREME S 070060332153 4.97%',
 '2.5 SQUEAK 084699803238 5.92 X',
 'MUNCHY DMBEL 068113108796 3.77%',
 'DOG TREAT 007119013654 2.92 %',
 'PED PCH 1 002310011802 0.50 X',
 'PED PCH 1 002310011802 0.50 X',
 'COUPON 23100 052310037000 1.00-0',
 'HNYMD SMORES 088491226837 F 3.98 0',
 'FRENCH DRSNG 004132100655 F 1.98 0',
 '3 ORANGES 001466835001 F 5.47 N',
 'BABY CARROTS 003338366602 I 1.48 N',
 'COLLARDS 000000004614KI 1.24 N',
 'CALZONE 005208362080 F 2.50 0',
 'MM RVW MNT 003399105848 19.77 X',
 'STKOBRLPLABL 001558679414 1.97%',
 'STKOBRLPLABL 001558679414 1.97%',
 'STKO SUNFLWR 001558679410 0.97 X',
 'STKO SUNFLWR 001558679410 0.97 X',
 'STKO SUNFLWR 001558679410 0.97 X',
 'STKO SUNFLWR 001558679410 0.97 X',
 'BLING BEADS 

In [None]:

# Save as CSV
# "/mnt/data" is an absolute path that does not exist on most machines: keep
# using it when it is present, otherwise write next to the notebook.
output_dir = Path("/mnt/data") if Path("/mnt/data").is_dir() else Path.cwd()
csv_path = str(output_dir / "receipt_items.csv")
df.to_csv(csv_path, index=False)

# caas_jupyter_tools is not among the packages this notebook installs, so treat
# it as optional instead of letting the cell die with ModuleNotFoundError.
try:
    import caas_jupyter_tools
except ImportError:
    caas_jupyter_tools = None

if caas_jupyter_tools is not None:
    caas_jupyter_tools.display_dataframe_to_user("Receipt Data", df)

# Show where the file was written
csv_path