In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import re
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, Conv2D, MaxPooling2D, Flatten
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from text_generator import TextGenerator
from config.wine import RED_VARIETALS, WHITE_VARIETALS, VARIETAL_ABBREVIATIONS, COUNTRIES
from robust_text_generator import RobustTextImageGenerator
from PIL import Image, ImageDraw, ImageFont
from tensorflow.keras.regularizers import l2


import re
from typing import List, Dict, Tuple

print("🍷 WINE OCR LEARNING LAB - STUDENT VERSION")
print("Your mission: Fill in the missing code to build a complete system!")

# =============================================================================
# 1. CONSTS
# =============================================================================

VOCABULARY_MAX_SIZE = 100000  # Adjust as needed

MENU_CHARACTERS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,-*'&$%"

MENU_GROUPING_MAX_DISTANCE = 200
#(
    
#     "0123456789" +           # Years, prices
#     "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +  # Wine names (caps)
#     "abcdefghijklmnopqrstuvwxyz" +  # Wine names (lowercase)
#     " .,-*'&$%"                 # Punctuation
# )


char_to_idx = {char: idx for idx, char in enumerate(MENU_CHARACTERS)}
idx_to_char = {idx: char for idx, char in enumerate(MENU_CHARACTERS)}





2025-07-14 14:51:51.061262: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-14 14:51:51.210837: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752468711.266071  912927 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752468711.283230  912927 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752468711.422002  912927 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

🍷 WINE OCR LEARNING LAB - STUDENT VERSION
Your mission: Fill in the missing code to build a complete system!


In [None]:
### PNG -> Wine Entries

from menu_token_extractor import TokenExtractor
from menu_token_enrichor import TokenEnrichor

vectorizer = tf.keras.layers.TextVectorization(
  max_tokens=VOCABULARY_MAX_SIZE,
  output_sequence_length=1,  # Single token per text
  output_mode='int'
)

extractor = TokenExtractor(
    low_text=0.3,
    width_ths=0.4,
    height_ths=0.4)

enrichor = TokenEnrichor(red_varietals=RED_VARIETALS,
                        white_varietals=WHITE_VARIETALS,
                        varietal_abbreviations=VARIETAL_ABBREVIATIONS,
                        countries=COUNTRIES)

def create_pairwise_features(tokens, vectorizer, max_distance=MENU_GROUPING_MAX_DISTANCE, same_row_threshold=20, embedding_dim=64):
    """
    Create pairwise features from tokens for neural network training.
    
    Args:
        tokens: List of token dictionaries with keys: 'text', 'x', 'y', 'h', 
                'is_vintage', 'is_varietal_red', 'is_varietal_white', 'is_price'
    
    Returns:
        numpy array of shape (n_pairs, n_features) ready for NN training
    """
    

    features = []

    all_texts = [token['text'] for token in tokens]
    
    vectorizer.adapt(all_texts)

    token_ids = vectorizer(all_texts).numpy().flatten()
    embedding = tf.keras.layers.Embedding(
        input_dim=VOCABULARY_MAX_SIZE,
        output_dim=embedding_dim,
        mask_zero=True
    )

    text_embeddings = embedding(token_ids).numpy()

    # Create all pairs
    for i in range(len(tokens)):
      for j in range(i + 1, len(tokens)):
        t1, t2 = tokens[i], tokens[j]

        # Skip if both tokens are vintage or both are varietals
        if 'is_vintage' in t1 and t1['is_vintage'] and 'is_vintage' in t2 and t2['is_vintage']:
          continue

        # Relationship features (7 features)
        x_diff = abs(t1['x'] - t2['x'])
        y_diff = abs(t1['y'] - t2['y'])
        euclidean_dist = np.sqrt(x_diff**2 + y_diff**2)

        # if euclidean_dist > max_distance:
        #     continue  # Skip this pair

        # Token 1 features (7 features)
        t1_features = [
            
            t1['text'],

            float(t1.get('is_varietal_red', False)),
            float(t1.get('is_varietal_white', False)),
            float(t1.get('is_price', False)),
            float(t1.get('is_vintage', False)),
            float(t1.get('is_region', False)),
            float(t1.get('is_country', False)),

            t1['x'] / 1000.0,  # Normalize coordinates
            t1['y'] / 1000.0,
            t1['h'] / 100.0,   # Normalize font height
        ]
        
        # Token 2 features (7 features)
        t2_features = [
            
            t2['text'],

            float(t2.get('is_varietal_red', False)),
            float(t2.get('is_varietal_white', False)),
            float(t2.get('is_price', False)),
            float(t2.get('is_vintage', False)),
            float(t2.get('is_region', False)),
            float(t2.get('is_country', False)),

            t2['x'] / 1000.0,
            t2['y'] / 1000.0,
            t2['h'] / 100.0,
        ]
        
        relationship_features = [
            x_diff / 1000.0,                           # X distance (normalized)
            y_diff / 1000.0,                           # Y distance (normalized)
            euclidean_dist / 1000.0,                   # Euclidean distance (normalized)
            float(y_diff < same_row_threshold),        # Same row (boolean)
            1.0 if t1['x'] < t2['x'] else -1.0, # Left-right order
            abs(t1['h'] - t2['h']) / 100.0,    # Font size difference
            min(t1['h'], t2['h']) / max(t1['h'], t2['h'], 1), # Font size ratio
            float(t1['text'] == t2['text']) # same
        ]
        
        t1_embed = text_embeddings[i]  # Shape: (embedding_dim,)
        t2_embed = text_embeddings[j]  # Shape: (embedding_dim,)

        # Combine all features (21 total)

        pair_features = np.concatenate([
          #  t1_embed, 
           t1_features, 
          #  t2_embed, 
           t2_features,
           relationship_features])

        features.append(pair_features)
    
    return np.array(features)

# Usage

def pairwise_predictions_to_wine_entries(tokens, predictions, threshold=0.5):
   """Convert pairwise predictions to wine entries DataFrame"""
   import pandas as pd
   import numpy as np
   
   n_tokens = len(tokens)
   adjacency = np.zeros((n_tokens, n_tokens))
   
   # Fill adjacency matrix
   pair_idx = 0
   for i in range(n_tokens):
       for j in range(i + 1, n_tokens):
           t1, t2 = tokens[i], tokens[j]
           
           if t1.get('is_vintage', False) and t2.get('is_vintage', False):
               continue
           
           x_diff = abs(t1['x'] - t2['x'])
           y_diff = abs(t1['y'] - t2['y'])
           euclidean_dist = np.sqrt(x_diff**2 + y_diff**2)
           
           if euclidean_dist > 800:
               continue
           
           if pair_idx < len(predictions) and predictions[pair_idx] > threshold:
               adjacency[i][j] = 1
               adjacency[j][i] = 1
           
           pair_idx += 1
   
   # Find groups
   visited = [False] * n_tokens
   groups = []
   
   def dfs(node, current_group):
       visited[node] = True
       current_group.append(node)
       for neighbor in range(n_tokens):
           if adjacency[node][neighbor] == 1 and not visited[neighbor]:
               dfs(neighbor, current_group)
   
   for i in range(n_tokens):
       if not visited[i]:
           current_group = []
           dfs(i, current_group)
           groups.append(current_group)
   
   # Extract wine entries
   wine_entries = []
   for group_indices in groups:
       if len(group_indices) == 1:
           continue
           
       group_tokens = [tokens[i] for i in group_indices]
       group_tokens.sort(key=lambda x: x['x'])
       
       wine_entry = {
           'vintage': '', 'country': '', 'region': '', 'type': '', 'varietal': '',
           'glass_price': '', 'bottle_price': '', 'description': '', 'other_texts': []
       }
       
       prices = []
       for token in group_tokens:
           text = token['text']
           
           if token.get('is_vintage'):
               wine_entry['vintage'] = text
           elif token.get('is_price'):
               prices.append(int(text) if text.isdigit() else 0)
           elif token.get('is_varietal_red'):
               wine_entry['varietal'] = text
               wine_entry['type'] = 'red'
           elif token.get('is_varietal_white'):
               wine_entry['varietal'] = text
               wine_entry['type'] = 'white'
           else:
               wine_entry['other_texts'].append(text)
       
       if len(prices) >= 2:
           prices.sort()
           wine_entry['glass_price'] = str(prices[0])
           wine_entry['bottle_price'] = str(prices[-1])
       elif len(prices) == 1:
           wine_entry['bottle_price'] = str(prices[0])
       
       wine_entries.append(wine_entry)
   
   return pd.DataFrame(wine_entries)



In [15]:
MENU_TRAINING_1_COL_GROUPS = [
   # Group 0: Noise and headers
   [
       {'text': 'WINE LIST', 'x': 350, 'y': 85, 'h': 32},
       {'text': 'RED WINES', 'x': 125, 'y': 125, 'h': 24},
       {'text': 'WHITE WINES', 'x': 125, 'y': 380, 'h': 24},
       {'text': 'Vintage', 'x': 52, 'y': 155, 'h': 14},
       {'text': 'Producer', 'x': 148, 'y': 155, 'h': 14},
       {'text': 'Varietal', 'x': 285, 'y': 155, 'h': 14},
       {'text': 'Region', 'x': 425, 'y': 155, 'h': 14},
       {'text': 'Price', 'x': 612, 'y': 155, 'h': 14}
   ],
   
   # Group 1: Wine 1
   [
       {'text': '2023', 'x': 52, 'y': 182, 'h': 18},
       {'text': 'PENFOLDS', 'x': 148, 'y': 182, 'h': 20},
       {'text': 'Shiraz', 'x': 285, 'y': 182, 'h': 18},
       {'text': 'Barossa Valley', 'x': 425, 'y': 182, 'h': 16},
       {'text': '85', 'x': 612, 'y': 182, 'h': 18}
   ],
   
   # Group 2: Wine 2
   [
       {'text': '2022', 'x': 52, 'y': 218, 'h': 18},
       {'text': 'NAUTILUS', 'x': 148, 'y': 218, 'h': 20},
       {'text': 'Sauvignon Blanc', 'x': 285, 'y': 218, 'h': 18},
       {'text': 'Marlborough NZ', 'x': 425, 'y': 218, 'h': 16},
       {'text': '65', 'x': 612, 'y': 218, 'h': 18}
   ],
   
   # Group 3: Wine 3
   [
       {'text': '2021', 'x': 52, 'y': 254, 'h': 18},
       {'text': 'CLOUDY BAY', 'x': 148, 'y': 254, 'h': 20},
       {'text': 'Chardonnay', 'x': 285, 'y': 254, 'h': 18},
       {'text': 'Hawkes Bay NZ', 'x': 425, 'y': 254, 'h': 16},
       {'text': '72', 'x': 612, 'y': 254, 'h': 18}
   ],
   
   # Group 4: Wine 4
   [
       {'text': '2020', 'x': 52, 'y': 290, 'h': 18},
       {'text': 'WOLF BLASS', 'x': 148, 'y': 290, 'h': 20},
       {'text': 'Cabernet Sauvignon', 'x': 285, 'y': 290, 'h': 18},
       {'text': 'South Australia', 'x': 425, 'y': 290, 'h': 16},
       {'text': '95', 'x': 612, 'y': 290, 'h': 18}
   ],
   
   # Group 5: Wine 5
   [
       {'text': 'NV', 'x': 52, 'y': 326, 'h': 18},
       {'text': 'YELLOWTAIL', 'x': 148, 'y': 326, 'h': 20},
       {'text': 'Pinot Noir', 'x': 285, 'y': 326, 'h': 18},
       {'text': 'Victoria', 'x': 425, 'y': 326, 'h': 16},
       {'text': '42', 'x': 612, 'y': 326, 'h': 18}
   ],
   
   # Group 6: Wine 6
   [
       {'text': '2019', 'x': 45, 'y': 362, 'h': 17},
       {'text': 'HARDYS', 'x': 145, 'y': 362, 'h': 19},
       {'text': 'Merlot', 'x': 288, 'y': 362, 'h': 17},
       {'text': 'McLaren Vale', 'x': 428, 'y': 362, 'h': 15},
       {'text': '78', 'x': 615, 'y': 362, 'h': 17}
   ],
   
   # Group 7: Wine 7
   [
       {'text': '2024', 'x': 55, 'y': 398, 'h': 18},
       {'text': 'JACOB\'S CREEK', 'x': 150, 'y': 398, 'h': 20},
       {'text': 'Riesling', 'x': 282, 'y': 398, 'h': 18},
       {'text': 'Eden Valley', 'x': 422, 'y': 398, 'h': 16},
       {'text': '55', 'x': 610, 'y': 398, 'h': 18}
   ],
   
   # Group 8: Wine 8
   [
       {'text': '2018', 'x': 48, 'y': 434, 'h': 19},
       {'text': 'WYNNS', 'x': 152, 'y': 434, 'h': 21},
       {'text': 'Cabernet Sauvignon', 'x': 290, 'y': 434, 'h': 19},
       {'text': 'Coonawarra', 'x': 430, 'y': 434, 'h': 17},
       {'text': '120', 'x': 608, 'y': 434, 'h': 19}
   ],
   
   # Group 9: Wine 9
   [
       {'text': '2023', 'x': 50, 'y': 470, 'h': 18},
       {'text': 'MCGUIGAN', 'x': 146, 'y': 470, 'h': 20},
       {'text': 'Semillon', 'x': 286, 'y': 470, 'h': 18},
       {'text': 'Hunter Valley', 'x': 426, 'y': 470, 'h': 16},
       {'text': '48', 'x': 614, 'y': 470, 'h': 18}
   ],
   
   # Group 10: Wine 10
   [
       {'text': '2022', 'x': 53, 'y': 506, 'h': 17},
       {'text': 'TYRRELL\'S', 'x': 149, 'y': 506, 'h': 19},
       {'text': 'Gewürztraminer', 'x': 283, 'y': 506, 'h': 17},
       {'text': 'Adelaide Hills', 'x': 423, 'y': 506, 'h': 15},
       {'text': '62', 'x': 612, 'y': 506, 'h': 17}
   ],
   
   # Group 11: Wine 11
   [
       {'text': '2021', 'x': 47, 'y': 542, 'h': 18},
       {'text': 'YALUMBA', 'x': 151, 'y': 542, 'h': 20},
       {'text': 'Viognier', 'x': 287, 'y': 542, 'h': 18},
       {'text': 'Eden Valley', 'x': 427, 'y': 542, 'h': 16},
       {'text': '88', 'x': 611, 'y': 542, 'h': 18}
   ],
   
   # Group 12: Wine 12
   [
       {'text': '2020', 'x': 54, 'y': 578, 'h': 19},
       {'text': 'CAPE MENTELLE', 'x': 147, 'y': 578, 'h': 21},
       {'text': 'Cabernet Merlot', 'x': 285, 'y': 578, 'h': 19},
       {'text': 'Margaret River', 'x': 425, 'y': 578, 'h': 17},
       {'text': '135', 'x': 609, 'y': 578, 'h': 19}
   ],
   
   # Group 13: Wine 13
   [
       {'text': 'NV', 'x': 51, 'y': 614, 'h': 18},
       {'text': 'LEEUWIN', 'x': 153, 'y': 614, 'h': 20},
       {'text': 'Moscato', 'x': 289, 'y': 614, 'h': 18},
       {'text': 'Margaret River', 'x': 429, 'y': 614, 'h': 16},
       {'text': '45', 'x': 613, 'y': 614, 'h': 18}
   ],
   
   # Group 14: Wine 14
   [
       {'text': '2019', 'x': 49, 'y': 650, 'h': 17},
       {'text': 'VASSE FELIX', 'x': 148, 'y': 650, 'h': 19},
       {'text': 'Pinot Grigio', 'x': 284, 'y': 650, 'h': 17},
       {'text': 'Margaret River', 'x': 424, 'y': 650, 'h': 15},
       {'text': '75', 'x': 615, 'y': 650, 'h': 17}
   ],
   
   # Group 15: Wine 15
   [
       {'text': '2023', 'x': 52, 'y': 686, 'h': 18},
       {'text': 'CULLEN', 'x': 150, 'y': 686, 'h': 20},
       {'text': 'Sauvignon Blanc', 'x': 282, 'y': 686, 'h': 18},
       {'text': 'Margaret River', 'x': 422, 'y': 686, 'h': 16},
       {'text': '92', 'x': 610, 'y': 686, 'h': 18}
   ],
   
   # Group 16: Wine 16
   [
       {'text': '2018', 'x': 46, 'y': 722, 'h': 19},
       {'text': 'HENSCHKE', 'x': 152, 'y': 722, 'h': 21},
       {'text': 'Shiraz', 'x': 288, 'y': 722, 'h': 19},
       {'text': 'Eden Valley', 'x': 428, 'y': 722, 'h': 17},
       {'text': '180', 'x': 607, 'y': 722, 'h': 19}
   ],
   
   # Group 17: Wine 17
   [
       {'text': '2022', 'x': 55, 'y': 758, 'h': 18},
       {'text': 'TORBRECK', 'x': 149, 'y': 758, 'h': 20},
       {'text': 'Grenache', 'x': 286, 'y': 758, 'h': 18},
       {'text': 'Barossa Valley', 'x': 426, 'y': 758, 'h': 16},
       {'text': '110', 'x': 612, 'y': 758, 'h': 18}
   ],
   
   # Group 18: Wine 18
   [
       {'text': '2021', 'x': 48, 'y': 794, 'h': 17},
       {'text': 'CLARENDON HILLS', 'x': 145, 'y': 794, 'h': 19},
       {'text': 'Syrah', 'x': 291, 'y': 794, 'h': 17},
       {'text': 'McLaren Vale', 'x': 431, 'y': 794, 'h': 15},
       {'text': '165', 'x': 611, 'y': 794, 'h': 17}
   ],
   
   # Group 19: Wine 19
   [
       {'text': '2020', 'x': 51, 'y': 830, 'h': 18},
       {'text': 'KATNOOK', 'x': 147, 'y': 830, 'h': 20},
       {'text': 'Cabernet Sauvignon', 'x': 284, 'y': 830, 'h': 18},
       {'text': 'Coonawarra', 'x': 427, 'y': 830, 'h': 16},
       {'text': '98', 'x': 613, 'y': 830, 'h': 18}
   ],
   
   # Group 20: Wine 20
   [
       {'text': '2019', 'x': 53, 'y': 866, 'h': 19},
       {'text': 'PETALUMA', 'x': 151, 'y': 866, 'h': 21},
       {'text': 'Chardonnay', 'x': 287, 'y': 866, 'h': 19},
       {'text': 'Adelaide Hills', 'x': 429, 'y': 866, 'h': 17},
       {'text': '125', 'x': 609, 'y': 866, 'h': 19}
   ],
   
   # Group 21: Wine 21
   [
       {'text': 'NV', 'x': 47, 'y': 902, 'h': 18},
       {'text': 'MOUNTADAM', 'x': 149, 'y': 902, 'h': 20},
       {'text': 'Pinot Noir', 'x': 285, 'y': 902, 'h': 18},
       {'text': 'Eden Valley', 'x': 425, 'y': 902, 'h': 16},
       {'text': '85', 'x': 615, 'y': 902, 'h': 18}
   ],
   
   # Group 22: Wine 22
   [
       {'text': '2023', 'x': 50, 'y': 938, 'h': 17},
       {'text': 'GROSSET', 'x': 153, 'y': 938, 'h': 19},
       {'text': 'Riesling', 'x': 289, 'y': 938, 'h': 17},
       {'text': 'Clare Valley', 'x': 423, 'y': 938, 'h': 15},
       {'text': '72', 'x': 611, 'y': 938, 'h': 17}
   ],
   
   # Group 23: Wine 23
   [
       {'text': '2022', 'x': 54, 'y': 974, 'h': 18},
       {'text': 'PIKE', 'x': 146, 'y': 974, 'h': 20},
       {'text': 'Chenin Blanc', 'x': 283, 'y': 974, 'h': 18},
       {'text': 'Clare Valley', 'x': 427, 'y': 974, 'h': 16},
       {'text': '58', 'x': 613, 'y': 974, 'h': 18}
   ],
   
   # Group 24: Wine 24
   [
       {'text': '2021', 'x': 48, 'y': 1010, 'h': 19},
       {'text': 'ELDERTON', 'x': 150, 'y': 1010, 'h': 21},
       {'text': 'Shiraz', 'x': 286, 'y': 1010, 'h': 19},
       {'text': 'Barossa Valley', 'x': 430, 'y': 1010, 'h': 17},
       {'text': '145', 'x': 608, 'y': 1010, 'h': 19}
   ],
   
   # Group 25: Wine 25
   [
       {'text': '2018', 'x': 52, 'y': 1046, 'h': 18},
       {'text': 'CHATEAU TANUNDA', 'x': 144, 'y': 1046, 'h': 20},
       {'text': 'Grenache', 'x': 288, 'y': 1046, 'h': 18},
       {'text': 'Barossa Valley', 'x': 426, 'y': 1046, 'h': 16},
       {'text': '88', 'x': 612, 'y': 1046, 'h': 18}
   ]
]


In [None]:

# Flatten for processing and keep track of original groups
tokens = []
original_groups = []
for group_idx, group in enumerate(MENU_TRAINING_1_COL_GROUPS):
    for token in group:
        tokens.append(token)
        original_groups.append(group_idx)

print(f"🔄 Flattened to {len(tokens)} tokens")

# Enrich tokens
tokens = [enrichor.enrich_token(token) for token in tokens]
print(f"✅ Tokens enriched")

# Create features
X = create_pairwise_features(tokens, vectorizer=vectorizer)
print(f"🔗 Created {X.shape[0]} pairs with {X.shape[1]} features")

# Create labels with EXACT same filtering as create_pairwise_features
labels = []

for i in range(len(tokens)):
    for j in range(i + 1, len(tokens)):
        t1, t2 = tokens[i], tokens[j]
        
        # Apply SAME filters as create_pairwise_features
        
        # Skip if both tokens are vintage
        if t1.get('is_vintage', False) and t2.get('is_vintage', False):
            continue
            
        # Distance check
        x_diff = abs(t1['x'] - t2['x'])
        y_diff = abs(t1['y'] - t2['y'])
        # euclidean_dist = np.sqrt(x_diff**2 + y_diff**2)
        
        # if euclidean_dist > MENU_GROUPING_MAX_DISTANCE:  # Same max_distance as create_pairwise_features
        #     continue
            
        # This pair passed all filters, create label
        group_i = original_groups[i]
        group_j = original_groups[j]
        
        same_group = (group_i == group_j and group_i > 0 and group_j > 0)
        labels.append(int(same_group))

y = np.array(labels)

print(f"📊 Labels created: {len(y)}")
print(f"   Positive pairs (same wine): {np.sum(y)} ({np.mean(y):.2%})")
print(f"   Negative pairs: {len(y) - np.sum(y)}")

assert X.shape[0] == len(y), f"Perfect match! X: {X.shape[0]}, y: {len(y)}"
print(f"✅ X shape: {X.shape}, y shape: {y.shape}")

# Train model

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


print("🍷 TRAINING DATA - X_train & y_train:")
print("=" * 50)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"Positive samples: {np.sum(y_train == 1)} ({np.mean(y_train == 1):.1%})")
print(f"Negative samples: {np.sum(y_train == 0)} ({np.mean(y_train == 0):.1%})")

print(f"\nFirst 10 y_train labels:")
print(y_train[:10])

print(f"\nFirst 5 X_train feature vectors (first 10 features only):")
for i in range(min(5, len(X_train))):
   print(f"Sample {i}: {X_train[i][:10]}... (label: {y_train[i]})")

print(f"\nX_train feature stats:")
print(f"  Min: {X_train.min():.6f}")
print(f"  Max: {X_train.max():.6f}")
print(f"  Mean: {X_train.mean():.6f}")

print(f"y_train dtype: {y_train.dtype}")
print(f"y_train unique values: {np.unique(y_train)}")
print(f"y_train range: {y_train.min()} to {y_train.max()}")

# Check for data issues
print(f"X_train has NaN: {np.isnan(X_train).any()}")
print(f"X_train has inf: {np.isinf(X_train).any()}")
print(f"y_train has NaN: {np.isnan(y_train).any()}")

model = keras.Sequential([
    Input(shape=(X.shape[1],)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'), 
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),  # Explicit binary crossentropy
    metrics=['accuracy'],
)

# Train model
print("\nTraining model...")
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    verbose=1,
    callbacks=[
      tf.keras.callbacks.EarlyStopping(
      monitor='val_loss',          # Monitor validation loss
      patience=5,                  # Stop if no improvement for 5 epochs
      restore_best_weights=True,   # Restore best weights when stopped
      verbose=1 )     
    ]
)

test_accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"\n🎯 Test Accuracy: {test_accuracy:.4f}")
print(f"✅ Model ready for testing!")
full_test = [
    # Wine 1
    {'text': '2023', 'x': 52, 'y': 182, 'h': 18},
    {'text': 'PENFOLDS', 'x': 148, 'y': 182, 'h': 20}, 
    {'text': 'Shiraz', 'x': 285, 'y': 182, 'h': 18},
    {'text': 'Barossa Valley', 'x': 425, 'y': 182, 'h': 16},
    {'text': '85', 'x': 612, 'y': 182, 'h': 18},
    
    {'text': 'Gloubi', 'x': 612, 'y': 2000, 'h': 22},


    # Wine 2 - different row
    {'text': '2022', 'x': 52, 'y': 800, 'h': 18},
    {'text': 'NAUTILUS', 'x': 148, 'y': 800, 'h': 20},
    {'text': 'Sauvignon Blanc', 'x': 285, 'y': 800, 'h': 18},
    {'text': 'Marlborough NZ', 'x': 425, 'y': 800, 'h': 16},
    {'text': '65', 'x': 612, 'y': 800, 'h': 18}
]

full_tokens = [enrichor.enrich_token(token) for token in full_test]
X_full = create_pairwise_features(full_tokens, vectorizer=vectorizer)

print("Pairs", X_full)




# predictions_full = model.predict(X_full, verbose=0).flatten()


# wine_df_full = pairwise_predictions_to_wine_entries(full_tokens, predictions_full, threshold=0.8)

# print("🍷 TWO WINE TEST:")
# print(f"\nFound {len(wine_df_full)} wines - Expected: 2")

# wine_df_full

# on# Test different thresholds
# print("🔍 TESTING DIFFERENT THRESHOLDS:")

# for threshold in [0.3, 0.5, 0.7, 0.8, 0.9]:
#     wine_df_test = pairwise_predictions_to_wine_entries(full_tokens, predictions_full, threshold=threshold)
#     print(f"Threshold {threshold}: Found {len(wine_df_test)} wines")

# # Check actual predictions for cross-wine pairs
# print(f"\n📊 CROSS-WINE PREDICTIONS (should be low):")
# wine1_indices = [0, 1, 2, 3, 4]  # First 5 tokens (wine 1)
# wine2_indices = [5, 6, 7, 8, 9]  # Next 5 tokens (wine 2)

# pair_idx = 0
# cross_wine_preds = []
# for i in range(len(full_tokens)):
#     for j in range(i + 1, len(full_tokens)):
#         # Check if this is a cross-wine pair
#         wine1_token = i in wine1_indices
#         wine2_token = j in wine2_indices
#         is_cross_wine = (wine1_token and j in wine2_indices) or (wine2_token and i in wine1_indices)
        
#         if is_cross_wine and pair_idx < len(predictions_full):
#             pred = predictions_full[pair_idx]
#             cross_wine_preds.append(pred)
#             print(f"  {pred:.3f} | '{full_tokens[i]['text']}' (wine1) ↔ '{full_tokens[j]['text']}' (wine2)")
        
#         pair_idx += 1

# print(f"\nCross-wine pr

🔄 Flattened to 133 tokens
✅ Tokens enriched
🔗 Created 8547 pairs with 28 features
📊 Labels created: 8547
   Positive pairs (same wine): 250 (2.93%)
   Negative pairs: 8297
✅ X shape: (8547, 28), y shape: (8547,)
🍷 TRAINING DATA - X_train & y_train:
X_train shape: (6837, 28)
y_train shape: (6837,)
Positive samples: 205 (3.0%)
Negative samples: 6632 (97.0%)

First 10 y_train labels:
[0 0 0 0 0 0 0 0 0 0]

First 5 X_train feature vectors (first 10 features only):
Sample 0: ['Cabernet Sauvignon' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.285' '0.29'
 '0.18']... (label: 0)
Sample 1: ['Pinot Grigio' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.284' '0.65' '0.17']... (label: 0)
Sample 2: ['Sauvignon Blanc' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.282' '0.686'
 '0.18']... (label: 0)
Sample 3: ['YELLOWTAIL' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.148' '0.326' '0.2']... (label: 0)
Sample 4: ['eden valley' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0' '0.427' '0.542' '0.16']... (label: 0)

X_train feature stats:


UFuncTypeError: ufunc 'minimum' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> None

In [None]:
ediction stats:")
# print(f"  Min: {min(cross_wine_preds):.3f}")
# print(f"  Max: {max(cross_wine_preds):.3f}")
# print(f"  Mean: {np.mean(cross_wine_preds):.3f}")
# print(f"  Above 0.5: {sum(1 for p in cross_wine_preds if p > 0.5)}")



In [6]:
# # Make predictions

tokens = extractor.extract_tokens('data/menus/menu_test.png')
print(f"📄 Extracted {len(tokens)} tokens")
print(tokens)
# Enrich tokens
tokens = [enrichor.enrich_token(token) for token in tokens]
print(f"✅ Tokens enriched")

# Create features
X = create_pairwise_features(tokens, vectorizer=vectorizer)
print(f"🔗 Created {X.shape[0]} pairs")

# Make predictions
predictions = model.predict(X, verbose=0).flatten()

# # # Show prediction stats
same_group = np.sum(predictions > 0.5)
different_group = np.sum(predictions <= 0.5)
print(f"✅ {same_group} pairs predicted as SAME GROUP")
print(f"❌ {different_group} pairs predicted as DIFFERENT GROUP")

# Create adjacency matrix for grouping
n_tokens = len(tokens)
adjacency = np.zeros((n_tokens, n_tokens))

# # Fill adjacency matrix from predictions
pair_idx = 0
for i in range(n_tokens):
   for j in range(i + 1, n_tokens):
       # Skip if pair was filtered out during feature creation
       if pair_idx < len(predictions) and predictions[pair_idx] > 0.5:
           adjacency[i][j] = 1
           adjacency[j][i] = 1
       if pair_idx < len(predictions):
           pair_idx += 1

# # Find connected components (groups)
visited = [False] * n_tokens
groups = []

def dfs(node, current_group):
   visited[node] = True
   current_group.append(node)
   for neighbor in range(n_tokens):
       if adjacency[node][neighbor] == 1 and not visited[neighbor]:
           dfs(neighbor, current_group)

for i in range(n_tokens):
   if not visited[i]:
       current_group = []
       dfs(i, current_group)
       groups.append(current_group)

# Display results
print(f"\n🔍 DETECTED {len([g for g in groups if len(g) > 1])} WINE GROUPS")
print("=" * 50)

for group_idx, group in enumerate(groups):
   if len(group) == 1:
       continue  # Skip single tokens
       
   print(f"\n🍷 WINE {group_idx + 1}:")
   
   # Sort tokens by x position (left to right)
   group_tokens = [tokens[i] for i in group]
   group_tokens.sort(key=lambda x: x['x'])
   
   wine_text = []
   for token in group_tokens:
       text = token['text']
       if token.get('is_vintage'):
           text += " 🗓️"
       elif token.get('is_varietal_red'):
           text += " 🍷"  
       elif token.get('is_varietal_white'):
           text += " 🥂"
       elif token.get('is_price'):
           text += " 💰"
       wine_text.append(text)
   
   print(f"   {' | '.join(wine_text)}")

# Show top confident predictions
print(f"\n🔍 TOP CONFIDENT PREDICTIONS:")
print("-" * 30)

# Get pair info for top predictions
pair_info = []
pair_idx = 0
for i in range(n_tokens):
   for j in range(i + 1, n_tokens):
       if pair_idx < len(predictions):
           pair_info.append((i, j, predictions[pair_idx]))
           pair_idx += 1

# Sort by confidence
pair_info.sort(key=lambda x: x[2], reverse=True)

print("✅ SAME GROUP (top 5):")
count = 0
for i, j, conf in pair_info:
   if conf > 0.5 and count < 5:
       print(f"   {conf:.3f} | '{tokens[i]['text']}' ↔ '{tokens[j]['text']}'")
       count += 1

print("\n❌ DIFFERENT GROUP (top 5):")
count = 0
for i, j, conf in reversed(pair_info[-10:]):
   if conf < 0.5 and count < 5:
       print(f"   {conf:.3f} | '{tokens[i]['text']}' ✗ '{tokens[j]['text']}'")
       count += 1

print(f"\n🎉 DONE! Found {len([g for g in groups if len(g) > 1])} wines")







📄 Extracted 52 tokens
[{'text': 'WHITE WINES', 'x': 78, 'y': 25, 'h': 20, 'confidence': 0.9992150187175544}, {'text': 'YEAR', 'x': 50, 'y': 63, 'h': 20, 'confidence': 0.9999305009841919}, {'text': 'WINE', 'x': 105, 'y': 64, 'h': 18, 'confidence': 0.9986971020698547}, {'text': 'REGION', 'x': 347, 'y': 64, 'h': 24, 'confidence': 0.999713926243581}, {'text': 'GLASS', 'x': 441, 'y': 63, 'h': 20, 'confidence': 0.9917610159672245}, {'text': 'BOTTLE', 'x': 507, 'y': 63, 'h': 20, 'confidence': 0.9995592499186086}, {'text': '2023', 'x': 46, 'y': 103, 'h': 20, 'confidence': 0.7124162596457525}, {'text': 'NAUTILUS Sauvignon Blanc', 'x': 171, 'y': 103, 'h': 20, 'confidence': 0.9833118674577895}, {'text': 'Marlborough NZ', 'x': 369, 'y': 102, 'h': 21, 'confidence': 0.9930145629335138}, {'text': '16', 'x': 469, 'y': 103, 'h': 20, 'confidence': 0.9999787580544308}, {'text': '77', 'x': 510, 'y': 103, 'h': 20, 'confidence': 0.9999828884223854}, {'text': '2022', 'x': 46, 'y': 123, 'h': 20, 'confidence':

In [7]:
# Debug 3: Train a minimal spatial-only model (FIXED)
import random


print(f"\n🔄 TRAINING MINIMAL SPATIAL MODEL:")

def create_minimal_spatial_features(tokens):
    """Only Y difference as feature"""
    features = []
    for i in range(len(tokens)):
        for j in range(i + 1, len(tokens)):
            y_diff = abs(tokens[i]['y'] - tokens[j]['y'])
            features.append([float(y_diff / 1000.0)])  # Ensure float type
    return np.array(features, dtype=np.float32)

# Create test data
debug_tokens = [
    {'text': 'A', 'x': 100, 'y': 100, 'h': 18},  # Same row
    {'text': 'B', 'x': 200, 'y': 100, 'h': 18},  # Same row  
    {'text': 'C', 'x': 100, 'y': 200, 'h': 18},  # Different row
]

X_minimal = create_minimal_spatial_features(debug_tokens)
y_minimal = np.array([1.0, 0.0], dtype=np.float32)  # A↔B same row=1, A↔C diff row=0

print(f"X_minimal shape: {X_minimal.shape}")
print(f"X_minimal data: {X_minimal}")
print(f"y_minimal: {y_minimal}")

minimal_model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, input_shape=(1,), activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

minimal_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Create more training data for the minimal model
X_train_minimal = []
y_train_minimal = []

for same_row in [True, False]:
    for _ in range(50):  # 50 examples each
        if same_row:
            y1, y2 = 100, 100 + random.randint(-5, 5)  # Same row with small variation
            label = 1.0
        else:
            y1, y2 = 100, 100 + random.randint(50, 500)  # Different rows
            label = 0.0
            
        y_diff = abs(y1 - y2) / 1000.0
        X_train_minimal.append([y_diff])
        y_train_minimal.append(label)

X_train_minimal = np.array(X_train_minimal, dtype=np.float32)
y_train_minimal = np.array(y_train_minimal, dtype=np.float32)

print(f"Training minimal model on {len(X_train_minimal)} examples...")
history_minimal = minimal_model.fit(
    X_train_minimal, y_train_minimal, 
    epochs=50, 
    validation_split=0.2,
    verbose=0
)

# Test minimal model
test_preds = minimal_model.predict(X_minimal, verbose=0)
print(f"\nMinimal model results:")
print(f"  Same row (y_diff≈0): {test_preds[0][0]:.3f} (should be >0.5)")
print(f"  Diff row (y_diff=0.1): {test_preds[1][0]:.3f} (should be <0.5)")
print(f"  Final accuracy: {history_minimal.history['accuracy'][-1]:.3f}")

if test_preds[0][0] > 0.5 and test_preds[1][0] < 0.5:
    print("✅ Minimal model learned spatial pattern correctly!")
    print("❌ Problem is in your complex model - too many features confusing it")
else:
    print("❌ Even minimal model failed - fundamental issue with approach")


🔄 TRAINING MINIMAL SPATIAL MODEL:
X_minimal shape: (3, 1)
X_minimal data: [[0. ]
 [0.1]
 [0.1]]
y_minimal: [1. 0.]
Training minimal model on 100 examples...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Minimal model results:
  Same row (y_diff≈0): 0.544 (should be >0.5)
  Diff row (y_diff=0.1): 0.541 (should be <0.5)
  Final accuracy: 0.625
❌ Even minimal model failed - fundamental issue with approach
