In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import re
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, Conv2D, MaxPooling2D, Flatten
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from text_generator import TextGenerator
from config.wine import RED_VARIETALS, WHITE_VARIETALS, VARIETAL_ABBREVIATIONS, COUNTRIES
from robust_text_generator import RobustTextImageGenerator
from PIL import Image, ImageDraw, ImageFont
from tensorflow.keras.regularizers import l2


import re
from typing import List, Dict, Tuple

# Startup banner for the lab notebook (two lines on stdout).
print(
    "🍷 WINE OCR LEARNING LAB - STUDENT VERSION",
    "Your mission: Fill in the missing code to build a complete system!",
    sep="\n",
)

# =============================================================================
# 1. CONSTS
# =============================================================================

# Cap passed as `max_tokens` to the text vectorizer; adjust as needed.
VOCABULARY_MAX_SIZE = 100_000

# NOTE(review): not referenced in the visible cells; units and consumer are
# not shown here -- confirm before tuning.
MENU_GROUPING_MAX_DISTANCE = 200

# Character set, in index order:
#   "0123456789"                  years, prices
#   "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  wine names (caps)
#   "abcdefghijklmnopqrstuvwxyz"  wine names (lowercase)
#   ".,-*'&$%"                    punctuation
# NOTE(review): an earlier commented-out draft of this constant started the
# punctuation group with a space (" .,-*'&$%"); the live value contains no
# space. Confirm whether multi-word tokens need it.
MENU_CHARACTERS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,-*'&$%"


# Two-way lookup between a character and its position in MENU_CHARACTERS.
idx_to_char = dict(enumerate(MENU_CHARACTERS))
char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}





2025-07-15 15:11:04.167453: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-15 15:11:04.348329: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752556264.403729 1071256 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752556264.421636 1071256 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752556264.546211 1071256 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

🍷 WINE OCR LEARNING LAB - STUDENT VERSION
Your mission: Fill in the missing code to build a complete system!


In [2]:
### PNG -> Wine Entries

from menu_token_extractor import MenuTokenExtractor
from menu_token_enrichor import MenuTokenEnrichor
from menu_token_grouper import MenuTokenGrouper

# Text -> integer-id layer: vocabulary capped at VOCABULARY_MAX_SIZE and each
# input text reduced to a single token id (output_sequence_length=1).
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCABULARY_MAX_SIZE,
    output_mode='int',
    output_sequence_length=1,
)

# Token extractor used on the menu images below.
# NOTE(review): low_text / width_ths / height_ths look like text-detection
# thresholds; their exact meaning lives in menu_token_extractor -- confirm there.
extractor = MenuTokenExtractor(
    low_text=0.3,
    width_ths=0.4,
    height_ths=0.4,
)

# Enrichment step, configured with the wine reference lists from config.wine.
enrichor = MenuTokenEnrichor(
    red_varietals=RED_VARIETALS,
    white_varietals=WHITE_VARIETALS,
    varietal_abbreviations=VARIETAL_ABBREVIATIONS,
    countries=COUNTRIES,
)

# Grouper with debug=True (the layout/spacing/eps lines in the recorded cell
# outputs appear to come from this flag -- presumably; verify in the grouper).
grouper = MenuTokenGrouper(debug=True)



2025-07-15 15:11:09.703214: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [3]:
# Hand-labelled training layout: one inner list per wine row, top to bottom.
# Every token is a dict with keys 'text', 'x', 'y', 'h' (in that order), and
# all tokens of a row share the same 'y'.
# NOTE(review): x / y / h are presumably pixel position and text height --
# confirm against the token extractor's output format.
#
# Each entry below is (row_y, [(text, x, h), ...]); the comprehension expands
# it into the token dicts.
#
# Disabled "Group 0" (noise and headers), kept for reference:
#   y=85:  ('WINE LIST', 350, 32)
#   y=125: ('RED WINES', 125, 24)
#   y=380: ('WHITE WINES', 125, 24)
#   y=155: ('Vintage', 52, 14), ('Producer', 148, 14), ('Varietal', 285, 14),
#          ('Region', 425, 14), ('Price', 612, 14)
MENU_TRAIN_SIMPLE = [
    [{'text': text, 'x': x, 'y': row_y, 'h': h} for text, x, h in row_cells]
    for row_y, row_cells in (
        # Wine 1
        (182, [('2023', 52, 18), ('PENFOLDS', 148, 20), ('Shiraz', 285, 18),
               ('Barossa Valley', 425, 16), ('85', 612, 18)]),
        # Wine 2
        (218, [('2022', 52, 18), ('NAUTILUS', 148, 20), ('Sauvignon Blanc', 285, 18),
               ('Marlborough NZ', 425, 16), ('65', 612, 18)]),
        # Wine 3
        (254, [('2021', 52, 18), ('CLOUDY BAY', 148, 20), ('Chardonnay', 285, 18),
               ('Hawkes Bay NZ', 425, 16), ('72', 612, 18)]),
        # Wine 4
        (290, [('2020', 52, 18), ('WOLF BLASS', 148, 20), ('Cabernet Sauvignon', 285, 18),
               ('South Australia', 425, 16), ('95', 612, 18)]),
        # Wine 5
        (326, [('NV', 52, 18), ('YELLOWTAIL', 148, 20), ('Pinot Noir', 285, 18),
               ('Victoria', 425, 16), ('42', 612, 18)]),
        # Wine 6
        (362, [('2019', 45, 17), ('HARDYS', 145, 19), ('Merlot', 288, 17),
               ('McLaren Vale', 428, 15), ('78', 615, 17)]),
        # Wine 7
        (398, [('2024', 55, 18), ('JACOB\'S CREEK', 150, 20), ('Riesling', 282, 18),
               ('Eden Valley', 422, 16), ('55', 610, 18)]),
        # Wine 8
        (434, [('2018', 48, 19), ('WYNNS', 152, 21), ('Cabernet Sauvignon', 290, 19),
               ('Coonawarra', 430, 17), ('120', 608, 19)]),
        # Wine 9
        (470, [('2023', 50, 18), ('MCGUIGAN', 146, 20), ('Semillon', 286, 18),
               ('Hunter Valley', 426, 16), ('48', 614, 18)]),
        # Wine 10
        (506, [('2022', 53, 17), ('TYRRELL\'S', 149, 19), ('Gewürztraminer', 283, 17),
               ('Adelaide Hills', 423, 15), ('62', 612, 17)]),
        # Wine 11
        (542, [('2021', 47, 18), ('YALUMBA', 151, 20), ('Viognier', 287, 18),
               ('Eden Valley', 427, 16), ('88', 611, 18)]),
        # Wine 12
        (578, [('2020', 54, 19), ('CAPE MENTELLE', 147, 21), ('Cabernet Merlot', 285, 19),
               ('Margaret River', 425, 17), ('135', 609, 19)]),
        # Wine 13
        (614, [('NV', 51, 18), ('LEEUWIN', 153, 20), ('Moscato', 289, 18),
               ('Margaret River', 429, 16), ('45', 613, 18)]),
        # Wine 14
        (650, [('2019', 49, 17), ('VASSE FELIX', 148, 19), ('Pinot Grigio', 284, 17),
               ('Margaret River', 424, 15), ('75', 615, 17)]),
        # Wine 15
        (686, [('2023', 52, 18), ('CULLEN', 150, 20), ('Sauvignon Blanc', 282, 18),
               ('Margaret River', 422, 16), ('92', 610, 18)]),
        # Wine 16
        (722, [('2018', 46, 19), ('HENSCHKE', 152, 21), ('Shiraz', 288, 19),
               ('Eden Valley', 428, 17), ('180', 607, 19)]),
        # Wine 17
        (758, [('2022', 55, 18), ('TORBRECK', 149, 20), ('Grenache', 286, 18),
               ('Barossa Valley', 426, 16), ('110', 612, 18)]),
        # Wine 18
        (794, [('2021', 48, 17), ('CLARENDON HILLS', 145, 19), ('Syrah', 291, 17),
               ('McLaren Vale', 431, 15), ('165', 611, 17)]),
        # Wine 19
        (830, [('2020', 51, 18), ('KATNOOK', 147, 20), ('Cabernet Sauvignon', 284, 18),
               ('Coonawarra', 427, 16), ('98', 613, 18)]),
        # Wine 20
        (866, [('2019', 53, 19), ('PETALUMA', 151, 21), ('Chardonnay', 287, 19),
               ('Adelaide Hills', 429, 17), ('125', 609, 19)]),
        # Wine 21
        (902, [('NV', 47, 18), ('MOUNTADAM', 149, 20), ('Pinot Noir', 285, 18),
               ('Eden Valley', 425, 16), ('85', 615, 18)]),
        # Wine 22
        (938, [('2023', 50, 17), ('GROSSET', 153, 19), ('Riesling', 289, 17),
               ('Clare Valley', 423, 15), ('72', 611, 17)]),
        # Wine 23
        (974, [('2022', 54, 18), ('PIKE', 146, 20), ('Chenin Blanc', 283, 18),
               ('Clare Valley', 427, 16), ('58', 613, 18)]),
        # Wine 24
        (1010, [('2021', 48, 19), ('ELDERTON', 150, 21), ('Shiraz', 286, 19),
                ('Barossa Valley', 430, 17), ('145', 608, 19)]),
        # Wine 25
        (1046, [('2018', 52, 18), ('CHATEAU TANUNDA', 144, 20), ('Grenache', 288, 18),
                ('Barossa Valley', 426, 16), ('88', 612, 18)]),
    )
]

# Flat (ungrouped) token list for a quick grouping check: two wine rows plus
# one stray token. Each tuple is (text, x, y, h) and is expanded into a dict
# with keys 'text', 'x', 'y', 'h'.
MENU_TEST_SIMPLE = [
    {'text': text, 'x': x, 'y': y, 'h': h}
    for text, x, y, h in (
        # Wine 1
        ('2023', 52, 182, 18),
        ('PENFOLDS', 148, 182, 20),
        ('Shiraz', 285, 182, 18),
        ('Barossa Valley', 425, 182, 16),
        ('85', 612, 182, 18),

        # Stray token at y=2000, far from both rows
        # (presumably an intentional noise probe -- confirm).
        ('Gloubi', 612, 2000, 22),

        # Wine 2 - different row
        ('2022', 52, 800, 18),
        ('NAUTILUS', 148, 800, 20),
        ('Sauvignon Blanc', 285, 800, 18),
        ('Marlborough NZ', 425, 800, 16),
        ('65', 612, 800, 18),
    )
]




In [4]:
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd


# Training data - flatten the labelled groups into one token list.
# train_labels[i] is the index (0-based position in MENU_TRAIN_SIMPLE) of the
# group that tokens[i] came from.
tokens = []
train_labels = []
for group_idx, group in enumerate(MENU_TRAIN_SIMPLE):
    for token in group:
        tokens.append(token)
        train_labels.append(group_idx)

# NOTE(review): the recorded output of this cell shows 'Shiraz' printed as
# 'Syrah' and 'Barossa Valley' as 'barossa valley', so enrich_token rewrites
# token text. Whether it mutates the MENU_TRAIN_SIMPLE dicts in place is not
# visible here -- confirm before re-using MENU_TRAIN_SIMPLE after this cell.
tokens = [enrichor.enrich_token(token) for token in tokens]

print(f"Training data: {len(tokens)} tokens from {len(MENU_TRAIN_SIMPLE)} groups")

# groups is indexed by group id; noise holds the tokens left ungrouped.
groups, noise = grouper.group_tokens(tokens)
print(f"Results: {len(groups)} groups, {len(noise)} noise")


# NOTE(review): the header below says "REAL MENU" but this cell runs on the
# synthetic MENU_TRAIN_SIMPLE data; string left unchanged to match recorded output.
print("\n🍷 REAL MENU RESULTS:")
print(f"Groups: {len(groups)}, Noise: {len(noise)}")

# Show the three lowest-numbered groups.
# Sort first, then slice: the previous form sliced the key list before sorting,
# which shows the first three *inserted* groups and only matches the lowest ids
# when the grouper happens to insert keys in ascending order.
for group_id in sorted(groups)[:3]:
    group_tokens = groups[group_id]
    y_positions = [t['y'] for t in group_tokens]
    texts = [t['text'] for t in group_tokens]

    # A group with at most two distinct y values is summarised by its minimum y;
    # otherwise the full min-max range is shown.
    y_info = f"y={min(y_positions)}" if len(set(y_positions)) <= 2 else f"y={min(y_positions)}-{max(y_positions)}"
    # Only the first four token texts are printed; '...' marks truncation.
    print(f"  Group {group_id} ({len(group_tokens)} tokens, {y_info}): {' '.join(texts[:4])}{'...' if len(texts) > 4 else ''}")


Training data: 125 tokens from 25 groups
🍷 Grouping 125 tokens...
📐 Layout: 25 rows, 5.0 tokens/row
📐 Spacing: rows=36.0px, cols=130.0px
📏 Normalized: row_spacing=10.0, Y_span=240.0, X_span=1.5
📏 Normalized: row_spacing=10.0, Y_span=240.0, X_span=1.5
📊 Spatial distances: 6966 pairs
📊 Range: 0.24 to 240.00
📊 Percentiles: 25th=30.02, 50th=70.01
📊 Target eps: 8.00
📊 Found 25 groups, 0 noise tokens
📏 Used eps=8.00
Results: 25 groups, 0 noise

🍷 REAL MENU RESULTS:
Groups: 25, Noise: 0
  Group 0 (5 tokens, y=182): 2023 PENFOLDS Syrah barossa valley...
  Group 1 (5 tokens, y=218): 2022 NAUTILUS Sauvignon Blanc Marlborough NZ...
  Group 2 (5 tokens, y=254): 2021 CLOUDY BAY Chardonnay Hawkes Bay NZ...


In [5]:
# TEST ON FORMAL DATA (WINWORD MENU)
# Pipeline: extract tokens from the image -> enrich each token -> group.
tokens = extractor.extract_tokens('data/menus/menu_test.png')
print(f"📄 Extracted {len(tokens)} tokens")
print([tok['text'] for tok in tokens])

# Enrichment pass over every extracted token.
tokens = list(map(enrichor.enrich_token, tokens))
print("✅ Tokens enriched")

groups, noise = grouper.group_tokens(tokens)

print(f"\nResults: {len(groups)} groups, {len(noise)} noise")

# Report every group in ascending group-id order: one line per token, then
# the group's texts joined into a single line.
print("\n🍷 WINE GROUPS:")
for group_id, group_tokens in sorted(groups.items()):
    print(f"\nGroup {group_id} ({len(group_tokens)} tokens):")

    texts = []
    for token in group_tokens:
        print(f"  '{token['text']}' at y={token['y']}")
        texts.append(token['text'])

    print(f"  → {' '.join(texts)}")

# Tokens the grouper left out of every group (section skipped when empty).
if noise:
    print(f"\n❌ NOISE ({len(noise)} tokens):")
    for token in noise:
        print(f"  '{token['text']}' at y={token['y']}")

# Rough sanity check: the estimate divides the token count by 5 (tokens per
# wine row in the synthetic training layout).
print(f"\n🎯 Expected ~{len(tokens)//5} wine groups, got {len(groups)}")






📄 Extracted 52 tokens
['WHITE WINES', 'YEAR', 'WINE', 'REGION', 'GLASS', 'BOTTLE', '2023', 'NAUTILUS Sauvignon Blanc', 'Marlborough NZ', '16', '77', '2022', 'BALTER Chardonnay', 'Adelaide Hills', '18', '88', '2024', 'FREYCINET Riesling', 'Tasmania', '15', '75', '2023', 'CANTINE Pinot Grigio', 'Toscana Italy', '17', '82', 'RED WINES', 'YEAR', 'WINE', 'REGION', 'GLASS', 'BOTTLE', '2021', 'PENFOLDS Shiraz', 'Barossa Valley', '22', '110', '2020', 'ANTINORI Chianti', 'Toscana Italy', '19', '95', '2022', 'WYNNS Cabernet', 'Coonawarra', '24', '120', '2019', 'CLOUDY BAY Pinot Noir', 'Central Otago', '26', '130']
✅ Tokens enriched
🍷 Grouping 52 tokens...
📐 Layout: 12 rows, 4.3 tokens/row
📐 Spacing: rows=20.5px, cols=33.0px
📏 Normalized: row_spacing=10.0, Y_span=161.5, X_span=1.5
📏 Normalized: row_spacing=10.0, Y_span=161.5, X_span=1.5
📊 Spatial distances: 1112 pairs
📊 Range: 0.30 to 161.47
📊 Percentiles: 25th=19.51, 50th=47.82
📊 Target eps: 8.00
📊 Found 8 groups, 12 noise tokens
📏 Used eps=8.00

In [6]:
# TEST: BALBOA
# Same pipeline as the formal-menu cell, run on the Balboa menu image.
menu = 'data/menus/menu_balboa_1.png'
tokens = extractor.extract_tokens(menu)

print(f"📄 Extracted {len(tokens)} tokens for {menu}")
print([tok['text'] for tok in tokens])

# Enrich every token, then cluster the enriched tokens into groups.
tokens = list(map(enrichor.enrich_token, tokens))
groups, noise = grouper.group_tokens(tokens)

print(f"\nResults: {len(groups)} groups, {len(noise)} noise")

# Report every group in ascending group-id order: one line per token, then
# the group's texts joined into a single line.
print("\n🍷 WINE GROUPS:")
for group_id, group_tokens in sorted(groups.items()):
    print(f"\nGroup {group_id} ({len(group_tokens)} tokens):")

    texts = []
    for token in group_tokens:
        print(f"  '{token['text']}' at y={token['y']}")
        texts.append(token['text'])

    print(f"  → {' '.join(texts)}")

# Tokens the grouper left out of every group (section skipped when empty).
if noise:
    print(f"\n❌ NOISE ({len(noise)} tokens):")
    for token in noise:
        print(f"  '{token['text']}' at y={token['y']}")

# Rough sanity check: the estimate divides the token count by 5.
print(f"\n🎯 Expected ~{len(tokens)//5} wine groups, got {len(groups)}")




📄 Extracted 92 tokens for data/menus/menu_balboa_1.png
['VINO', 'LIGHT AND CRISP WHITES', 'GLS', 'BTL', '2023', 'NAUTILUS Sauvignon Blanc', 'Marlborough, NZ', '16', '77', '2023', 'BALTER Sauvignon Bianc', 'Trentino,', '18', '88', '2024', "FREYCINET 'WINEGLASS BAY' Sauvignon Bianc", 'Tasmania', '75', '2023', 'CANTINE DOLIANOVA Vermentino', 'Sardegna, Italy', '16', '77', '2022', 'TUA RITA PERLATO DEL BOSCO Vermentino', 'Toscana,', '110', '2023', 'DR LOOSEN DR L Piesling', 'Mosen,', 'Germany', '15', '73', '2024', "O'LEARY WALKER 'POLISH RIVER' Riesling", 'Clare', 'SA', '75', '2022', "TORNATORE 'ETNA BIANCO' Carricante", 'Sicily, Italy', '20', '98', 'vibrant Sicilian white with crisp citrus, green apple, and delicate herb notes Crafted from Carricante', "grapes grown on Mount Etna'5 volcanic slopes, it delivers", "' refreshing minerality and", 'lingering clean', 'finish', '2022', 'MOUNT ROZIER RESERVE THE BEEKEEPER', 'South Africa', '15', '73', 'Chenin Blanc', 'MEDIUM TEXTURED WHITES', '20

In [9]:
# TEST: BROOKLYN menu
# NOTE(review): this cell was originally headed "TEST: BARREL", but the image
# it loads is menu_brooklyn.png -- confirm which menu is intended.
menu = 'data/menus/menu_brooklyn.png'
tokens = extractor.extract_tokens(menu)

print(f"📄 Extracted {len(tokens)} tokens for {menu}")
print([tok['text'] for tok in tokens])

# Enrich every token, then cluster the enriched tokens into groups.
tokens = list(map(enrichor.enrich_token, tokens))
groups, noise = grouper.group_tokens(tokens)

print(f"\nResults: {len(groups)} groups, {len(noise)} noise")

# Report every group in ascending group-id order: one line per token, then
# the group's texts joined into a single line.
print("\n🍷 WINE GROUPS:")
for group_id, group_tokens in sorted(groups.items()):
    print(f"\nGroup {group_id} ({len(group_tokens)} tokens):")

    texts = []
    for token in group_tokens:
        print(f"  '{token['text']}' at y={token['y']}")
        texts.append(token['text'])

    print(f"  → {' '.join(texts)}")

# Tokens the grouper left out of every group (section skipped when empty).
if noise:
    print(f"\n❌ NOISE ({len(noise)} tokens):")
    for token in noise:
        print(f"  '{token['text']}' at y={token['y']}")

# Rough sanity check: the estimate divides the token count by 5.
print(f"\n🎯 Expected ~{len(tokens)//5} wine groups, got {len(groups)}")




📄 Extracted 177 tokens for data/menus/menu_brooklyn.png
['sparklíng', 'varietal', 'Year', 'Glass', 'Bottle', 'Provenance', 'Habitat', 'Brut Cuvee', 'Contra', 'Ranges', 'Laurent Breban', 'Blanc de Blanc NV', '16', 'France', 'Ruggeri', 'Prosecco NV', 'Italy', 'Moet 8 Chandon Imperial', 'Brut', 'Champagne', '129', 'France', 'Laurent Perrier La', 'Champagne NV', '149', 'France', 'Moscato 5 Rose', 'Ialinga Park', 'Moscato Frizzante', '2021', 'Willbriggie, NSW', 'Cester Camillo Rose', 'Rosc  Sparkling', '2020', 'Italy', 'Amisfield Pinot Noir', 'Rose', '2021', '18', 'PtagenCE, France', 'Chateau La Gordonne', 'Rose', '2020', 'Provence,', 'White Wine', 'Jones Road', 'Chardonnay', '2019', 'Mornington Peninsula', 'Voyager Estate Coastal', 'Chardonnay', '2020', 'Margaret River, WA', 'Fattori', 'Pinot Grigio', '2021', 'Alaclbdigeghtalz', 'The Pass', 'Pinot Gris', '2029', '1', '59', 'Haelborolegh sAZ', "Teusner ªEmpress'", 'Riesling', '2023', 'Adelaide Hills, SA', 'Howard Vineyard', 'Sauvignon Blanc