In [9]:
import pandas as pd

# Load your main report
df = pd.read_csv("20250514-report.csv")
df = df[df['end_location']== " Off Location "]

In [11]:
# Filter labels (sku != '-')
labels_df = df[df['sku'] != '-'].copy()

# Filter ends (sku == '-')
ends_df = df[df['sku'] == '-'].copy()

# Optional: Select the first end per visit (or use heuristics later)
ends_unique = ends_df.sort_values('date_captured').groupby('visit_id').first().reset_index()

# Check structure
print(labels_df.head())
print(ends_unique.head())

                id  visit_id visit_created    date_captured  \
252  lblID-1195330    352670     14/5/2025  14/5/2025 14:38   
253  lblID-1195392    352670     14/5/2025  14/5/2025 14:35   
254  lblID-1195388    352670     14/5/2025  14/5/2025 14:42   
255  lblID-1195386    352670     14/5/2025  14/5/2025 14:35   
256  lblID-1195385    352670     14/5/2025  14/5/2025 14:40   

                                         end_image_url  \
252  https://dtexg3-images.s3.ap-southeast-2.amazon...   
253  https://dtexg3-images.s3.ap-southeast-2.amazon...   
254  https://dtexg3-images.s3.ap-southeast-2.amazon...   
255  https://dtexg3-images.s3.ap-southeast-2.amazon...   
256  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                                       label_image_url store_type  \
252  https://dtexg3-images.s3.ap-southeast-2.amazon...      Coles   
253  https://dtexg3-images.s3.ap-southeast-2.amazon...      Coles   
254  https://dtexg3-images.s3.ap-southeast-2.amazon...      Coles

In [12]:
# Merge labels with ends on visit_id
final_df = labels_df.merge(
    ends_unique[['visit_id', 'end_image_url']],  # Only bring end_image_url
    on='visit_id',
    how='left',
    suffixes=('', '_correct_end')
)

# Final table has: label_image_url, end_image_url (label's own), end_image_url_correct_end (true full display)
final_df = final_df.rename(columns={'end_image_url': 'label_end_image_url', 'end_image_url_correct_end': 'correct_end_image_url'})

print(final_df.head())

              id  visit_id visit_created    date_captured  \
0  lblID-1195330    352670     14/5/2025  14/5/2025 14:38   
1  lblID-1195392    352670     14/5/2025  14/5/2025 14:35   
2  lblID-1195388    352670     14/5/2025  14/5/2025 14:42   
3  lblID-1195386    352670     14/5/2025  14/5/2025 14:35   
4  lblID-1195385    352670     14/5/2025  14/5/2025 14:40   

                                 label_end_image_url  \
0  https://dtexg3-images.s3.ap-southeast-2.amazon...   
1  https://dtexg3-images.s3.ap-southeast-2.amazon...   
2  https://dtexg3-images.s3.ap-southeast-2.amazon...   
3  https://dtexg3-images.s3.ap-southeast-2.amazon...   
4  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                                     label_image_url store_type  \
0  https://dtexg3-images.s3.ap-southeast-2.amazon...      Coles   
1  https://dtexg3-images.s3.ap-southeast-2.amazon...      Coles   
2  https://dtexg3-images.s3.ap-southeast-2.amazon...      Coles   
3  https://dtexg3-images.s3.

In [13]:
len(final_df)

606

In [14]:
final_df.to_csv('final.csv',index=False)

In [17]:
import pandas as pd

# Load your report CSV
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])

# Split Labels and Ends
labels_df = df[df['sku'] != '-'].copy()
ends_df = df[df['sku'] == '-'].copy()

# Function: Find the best matching end for a label
def find_best_end(label_row, ends_df, time_window='5min'):
    visit_id = label_row['visit_id']
    label_time = label_row['date_captured']
    
    # Filter ends by visit_id
    candidate_ends = ends_df[ends_df['visit_id'] == visit_id].copy()
    
    # Filter ends within time window
    candidate_ends = candidate_ends[
        (candidate_ends['date_captured'] >= label_time - pd.Timedelta(time_window)) &
        (candidate_ends['date_captured'] <= label_time + pd.Timedelta(time_window))
    ]
    
    if not candidate_ends.empty:
        # Return the earliest end in time window
        return candidate_ends.sort_values('date_captured').iloc[0]['end_image_url']
    else:
        # Fallback: Return earliest end in entire visit
        fallback_end = ends_df[ends_df['visit_id'] == visit_id].sort_values('date_captured')
        if not fallback_end.empty:
            return fallback_end.iloc[0]['end_image_url']
        else:
            return None  # No end found at all

# Apply matching logic to all labels
labels_df['matched_end_image_url'] = labels_df.apply(
    lambda row: find_best_end(row, ends_df),
    axis=1
)

# Final Result
final_mapping = labels_df[['visit_id', 'label_image_url', 'matched_end_image_url', 'date_captured']]
print(final_mapping.head())

# Save to CSV
final_mapping.to_csv("label_to_correct_end_mapping.csv", index=False)
print("✅ Final mapping saved as 'label_to_correct_end_mapping.csv'.")

  df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])


   visit_id                                    label_image_url  \
1    352718  https://dtexg3-images.s3.ap-southeast-2.amazon...   
2    352718  https://dtexg3-images.s3.ap-southeast-2.amazon...   
3    352677  https://dtexg3-images.s3.ap-southeast-2.amazon...   
4    352677  https://dtexg3-images.s3.ap-southeast-2.amazon...   
5    352677  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                               matched_end_image_url       date_captured  
1  https://dtexg3-images.s3.ap-southeast-2.amazon... 2025-05-14 14:28:00  
2  https://dtexg3-images.s3.ap-southeast-2.amazon... 2025-05-14 14:29:00  
3  https://dtexg3-images.s3.ap-southeast-2.amazon... 2025-05-14 14:30:00  
4  https://dtexg3-images.s3.ap-southeast-2.amazon... 2025-05-14 14:29:00  
5  https://dtexg3-images.s3.ap-southeast-2.amazon... 2025-05-14 14:30:00  
✅ Final mapping saved as 'label_to_correct_end_mapping.csv'.


In [None]:
merged_df = pd.read_csv('merged_dashboard_data.csv')

In [19]:
import cv2
import numpy as np
import requests
from io import BytesIO

def load_image_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE)
        return img
    except Exception as e:
        print(f"❌ Failed to load image from URL: {url} | Error: {e}")
        return None

def compute_template_match_score(label_url, end_url):
    label = load_image_from_url(label_url)
    end = load_image_from_url(end_url)
    
    if label is None or end is None:
        return np.nan

    if label.shape[0] > end.shape[0] or label.shape[1] > end.shape[1]:
        return np.nan

    res = cv2.matchTemplate(end, label, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, _ = cv2.minMaxLoc(res)
    return max_val

In [20]:
scores = []
for idx, row in merged_df.iterrows():
    label_url = row['label_image_url']
    end_url = row['matched_end_image_url']
    score = compute_template_match_score(label_url, end_url)
    scores.append(score)

merged_df['opencv_template_score'] = scores

KeyboardInterrupt: 

In [None]:
merged_df.to_csv("mapped_with_opencv_score.csv", index=False)

In [21]:
import cv2
import numpy as np
import requests
from io import BytesIO
import pandas as pd

# Load data
merged_df = pd.read_csv("merged_dashboard_data.csv")

# Function to load image from URL
def load_image_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE)
        return img
    except Exception as e:
        print(f"❌ Failed to load image from URL: {url} | Error: {e}")
        return None

# Function to compute template match score
def compute_template_match_score(label_url, end_url):
    label = load_image_from_url(label_url)
    end = load_image_from_url(end_url)
    
    if label is None or end is None:
        return np.nan

    if label.shape[0] > end.shape[0] or label.shape[1] > end.shape[1]:
        return np.nan

    res = cv2.matchTemplate(end, label, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, _ = cv2.minMaxLoc(res)
    return max_val

# Limit to first 100 rows for testing
test_df = merged_df.head(100)

# Compute scores
scores = []
for idx, row in test_df.iterrows():
    label_url = row['label_image_url']
    end_url = row['matched_end_image_url']
    score = compute_template_match_score(label_url, end_url)
    scores.append(score)

test_df['opencv_template_score'] = scores

# Save results
test_df.to_csv("mapped_with_opencv_score_test100.csv", index=False)

print("✅ OpenCV template matching completed for 100 rows!")
print(test_df[['label_image_url', 'matched_end_image_url', 'opencv_template_score']].head())

✅ OpenCV template matching completed for 100 rows!
                                     label_image_url  \
0  https://dtexg3-images.s3.ap-southeast-2.amazon...   
1  https://dtexg3-images.s3.ap-southeast-2.amazon...   
2  https://dtexg3-images.s3.ap-southeast-2.amazon...   
3  https://dtexg3-images.s3.ap-southeast-2.amazon...   
4  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                               matched_end_image_url  opencv_template_score  
0  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.216641  
1  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.246030  
2  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.186976  
3  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.150709  
4  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.185925  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['opencv_template_score'] = scores


In [None]:
import cv2
import numpy as np
import requests
from io import BytesIO
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Load your mapped data (limit to 100 rows for testing)
merged_df = pd.read_csv("merged_dashboard_data.csv").head(100)

# Load image from URL
def load_image_from_url(url, color=cv2.IMREAD_GRAYSCALE):
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(img_array, color)
        return img
    except:
        return None

# Compute template match score
def compute_template_match(row, max_template_ratio=0.5):
    label_url = row['label_image_url']
    end_url = row['matched_end_image_url']
    label = load_image_from_url(label_url)
    end = load_image_from_url(end_url)
    
    if label is None or end is None:
        return np.nan

    # Resize label if it's too large compared to end
    h_ratio = label.shape[0] / end.shape[0]
    w_ratio = label.shape[1] / end.shape[1]
    if max(h_ratio, w_ratio) > max_template_ratio:
        scale = max_template_ratio / max(h_ratio, w_ratio)
        label = cv2.resize(label, (int(label.shape[1]*scale), int(label.shape[0]*scale)))

    try:
        res = cv2.matchTemplate(end, label, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, _ = cv2.minMaxLoc(res)
        return max_val
    except:
        return np.nan

# merged_df =merged_df.head(50)
# Run in parallel (up to 8 threads, adjust for your system)
with ThreadPoolExecutor(max_workers=8) as executor:
    scores = list(executor.map(compute_template_match, [row for _, row in merged_df.iterrows()]))

merged_df['opencv_template_score'] = scores

# Save results
merged_df.to_csv("mapped_with_opencv_score_100.csv", index=False)
print("✅ Optimized template matching done for 100 rows!")
print(merged_df[['label_image_url', 'matched_end_image_url', 'opencv_template_score']].head())

✅ Optimized template matching done for 100 rows!
                                     label_image_url  \
0  https://dtexg3-images.s3.ap-southeast-2.amazon...   
1  https://dtexg3-images.s3.ap-southeast-2.amazon...   
2  https://dtexg3-images.s3.ap-southeast-2.amazon...   
3  https://dtexg3-images.s3.ap-southeast-2.amazon...   
4  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                               matched_end_image_url  opencv_template_score  
0  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.230662  
1  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.278055  
2  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.212339  
3  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.220275  
4  https://dtexg3-images.s3.ap-southeast-2.amazon...               0.168847  


In [23]:
import pandas as pd
import numpy as np
import cv2
import requests
from io import BytesIO
from tqdm import tqdm

# ================================
# 1️⃣ Load and Prepare Data
# ================================
print("🔹 Loading report data...")
report_path = "20250514-report.csv"
df = pd.read_csv(report_path, parse_dates=['date_captured'])
df =df[df['end_location'] == " Off Location "]
df.columns = df.columns.str.strip().str.replace('"', '')

print(f"✅ Report loaded with {len(df)} rows.")

# ================================
# 2️⃣ Split Labels and Ends
# ================================
print("🔹 Splitting labels and ends...")
labels_df = df[df['sku'] != '-'].copy()
ends_df = df[df['sku'] == '-'].copy()

print(f"✅ Found {len(labels_df)} label images and {len(ends_df)} end images.")

# ================================
# 3️⃣ Match Labels to Ends by Visit + Timestamp
# ================================
print("🔹 Matching labels to ends by visit and time proximity...")

def find_best_end(label_row, ends_df, time_window='5min'):
    visit_id = label_row['visit_id']
    label_time = label_row['date_captured']
    candidates = ends_df[ends_df['visit_id'] == visit_id].copy()
    candidates = candidates[
        (candidates['date_captured'] >= label_time - pd.Timedelta(time_window)) &
        (candidates['date_captured'] <= label_time + pd.Timedelta(time_window))
    ]
    if candidates.empty:
        fallback = ends_df[ends_df['visit_id'] == visit_id].sort_values('date_captured').head(1)
        if not fallback.empty:
            return fallback['end_image_url'].values[0], abs((label_time - fallback['date_captured'].values[0]).astype('timedelta64[m]').astype(int))
        else:
            return np.nan, np.nan
    else:
        candidates['time_diff'] = abs(candidates['date_captured'] - label_time).dt.total_seconds() / 60
        best_match = candidates.sort_values('time_diff').iloc[0]
        return best_match['end_image_url'], best_match['time_diff']

tqdm.pandas(desc="Matching")
labels_df[['matched_end_image_url', 'timestamp_diff_min']] = labels_df.progress_apply(
    lambda row: pd.Series(find_best_end(row, ends_df)), axis=1
)

print("✅ Matching complete.")

# ================================
# 4️⃣ OpenCV Template Matching Validation
# ================================
print("🔹 Running OpenCV template matching for validation...")

def load_image_from_url(url, color=cv2.IMREAD_GRAYSCALE):
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        return cv2.imdecode(img_array, color)
    except:
        return None

def compute_template_match(row, max_template_ratio=0.5):
    label_url = row['label_image_url']
    end_url = row['matched_end_image_url']
    label = load_image_from_url(label_url)
    end = load_image_from_url(end_url)
    
    if label is None or end is None:
        return np.nan

    h_ratio = label.shape[0] / end.shape[0]
    w_ratio = label.shape[1] / end.shape[1]
    if max(h_ratio, w_ratio) > max_template_ratio:
        scale = max_template_ratio / max(h_ratio, w_ratio)
        label = cv2.resize(label, (int(label.shape[1]*scale), int(label.shape[0]*scale)))

    try:
        res = cv2.matchTemplate(end, label, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, _ = cv2.minMaxLoc(res)
        return max_val
    except:
        return np.nan

labels_df['opencv_template_score'] = labels_df.progress_apply(compute_template_match, axis=1)

print("✅ OpenCV template matching complete.")

# ================================
# 5️⃣ Compute Confidence Scores
# ================================
print("🔹 Computing final confidence scores...")

def brand_score(label_brand, product_name):
    if pd.isna(label_brand) or pd.isna(product_name):
        return 0.0
    return 1.0 if label_brand.lower() in product_name.lower() else 0.5

labels_df['brand_confidence'] = labels_df.apply(
    lambda row: brand_score(row['brand'], row['product_name']), axis=1
)

alpha, beta, gamma = 0.5, 0.3, 0.2
labels_df['final_confidence'] = (
    alpha * labels_df['opencv_template_score'].fillna(0) +
    beta * labels_df['timestamp_diff_min'].apply(lambda x: 1.0 if x <= 2 else 0.8 if x <=5 else 0.5).fillna(0) +
    gamma * labels_df['brand_confidence']
)

labels_df['confidence_flag'] = labels_df['final_confidence'].apply(
    lambda x: '✅ High' if x >= 0.85 else ('⚠️ Review' if x >= 0.6 else '❌ Low')
)

print("✅ Confidence scoring complete.")

# ================================
# 6️⃣ Save Final Output
# ================================
final_output_path = "final_label_end_mapping_with_confidence.csv"
labels_df.to_csv(final_output_path, index=False)
print(f"✅ Final mapping with confidence saved to: {final_output_path}")

# ================================
# 7️⃣ Sample Output
# ================================
print("\n🔍 Sample Results:")
print(labels_df[['label_image_url', 'matched_end_image_url', 'final_confidence', 'confidence_flag']].head())

  df = pd.read_csv(report_path, parse_dates=['date_captured'])


🔹 Loading report data...
✅ Report loaded with 1112 rows.
🔹 Splitting labels and ends...
✅ Found 606 label images and 506 end images.
🔹 Matching labels to ends by visit and time proximity...


Matching: 100%|██████████| 606/606 [00:00<00:00, 1443.04it/s]


✅ Matching complete.
🔹 Running OpenCV template matching for validation...


Matching:  40%|███▉      | 241/606 [05:14<10:55,  1.80s/it]Invalid SOS parameters for sequential JPEG
Matching:  40%|███▉      | 242/606 [05:15<09:41,  1.60s/it]Invalid SOS parameters for sequential JPEG
Matching:  40%|████      | 243/606 [05:16<08:22,  1.38s/it]Invalid SOS parameters for sequential JPEG
Matching:  40%|████      | 244/606 [05:17<07:44,  1.28s/it]Invalid SOS parameters for sequential JPEG
Matching:  40%|████      | 245/606 [05:18<07:24,  1.23s/it]Invalid SOS parameters for sequential JPEG
Matching:  79%|███████▊  | 477/606 [11:29<02:55,  1.36s/it]Invalid SOS parameters for sequential JPEG
Matching:  79%|███████▉  | 478/606 [11:31<02:55,  1.37s/it]Invalid SOS parameters for sequential JPEG
Matching:  79%|███████▉  | 479/606 [11:32<02:59,  1.41s/it]Invalid SOS parameters for sequential JPEG
Matching:  79%|███████▉  | 480/606 [11:34<02:50,  1.35s/it]Invalid SOS parameters for sequential JPEG
Matching:  79%|███████▉  | 481/606 [11:35<02:57,  1.42s/it]Invalid SOS parameters 

✅ OpenCV template matching complete.
🔹 Computing final confidence scores...
✅ Confidence scoring complete.
✅ Final mapping with confidence saved to: final_label_end_mapping_with_confidence.csv

🔍 Sample Results:
                                       label_image_url  \
252  https://dtexg3-images.s3.ap-southeast-2.amazon...   
253  https://dtexg3-images.s3.ap-southeast-2.amazon...   
254  https://dtexg3-images.s3.ap-southeast-2.amazon...   
255  https://dtexg3-images.s3.ap-southeast-2.amazon...   
256  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                                 matched_end_image_url  final_confidence  \
252  https://dtexg3-images.s3.ap-southeast-2.amazon...          0.579565   
253  https://dtexg3-images.s3.ap-southeast-2.amazon...          0.602917   
254  https://dtexg3-images.s3.ap-southeast-2.amazon...          0.672557   
255  https://dtexg3-images.s3.ap-southeast-2.amazon...          0.509396   
256  https://dtexg3-images.s3.ap-southeast-2.amazon...     




In [24]:
df =pd.read_csv('20250514-report.csv')

In [26]:
df['date_captured'] = pd.to_datetime(df['date_captured']).dt.strftime('%Y-%m-%d %H:%M:%S.%f').str[:-3]

  df['date_captured'] = pd.to_datetime(df['date_captured']).dt.strftime('%Y-%m-%d %H:%M:%S.%f').str[:-3]


In [27]:
df['date_captured']

0       2025-05-14 14:28:00.000
1       2025-05-14 14:28:00.000
2       2025-05-14 14:29:00.000
3       2025-05-14 14:30:00.000
4       2025-05-14 14:29:00.000
                 ...           
4396    2025-05-16 09:44:00.000
4397    2025-05-16 09:44:00.000
4398    2025-05-16 09:43:00.000
4399    2025-05-16 09:44:00.000
4400    2025-05-16 09:44:00.000
Name: date_captured, Length: 4401, dtype: object

In [2]:
import pandas as pd
import numpy as np
import cv2
import requests
from io import BytesIO
from tqdm import tqdm
from skimage.metrics import structural_similarity as ssim

# Load Data
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter for Off Location
df = df[df['end_location'].str.lower().str.contains("off", na=False)].copy()

df =df[(df['store_type']== 'Coles') & (df['store_suburb'].str.lower().str.contains("croydon",na=False)) ]


# Split into Ends (sku == '-') and Labels (sku != '-')
ends_df = df[df['sku'] == '-'].copy()
labels_df = df[df['sku'] != '-'].copy()



# # print(f"✅ {len(labels_df)} labels | {len(ends_df)} ends filtered for Off Location.")

# # Image Loader
# def load_image_from_url(url, color=cv2.IMREAD_GRAYSCALE):
#     try:
#         response = requests.get(url, timeout=10)
#         response.raise_for_status()
#         img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
#         return cv2.imdecode(img_array, color)
#     except:
#         return None

# # Visual Matching Function
# def match_label_to_filtered_ends(label_row, ends_df):
#     label_time = pd.to_datetime(label_row['date_captured'], errors='coerce')
#     label_end_url = label_row['end_image_url']
#     label_img = load_image_from_url(label_end_url)
#     if label_img is None:
#         return np.nan, np.nan, '❌ Label Image Missing'

#     # Filter Ends by same visit_id, store_type, end_location_number, and same date
#     candidates = ends_df[
#         (ends_df['visit_id'] == label_row['visit_id']) &
#         (ends_df['store_type'] == label_row['store_type']) &
#         (ends_df['end_location_number'] == label_row['end_location_number']) &
#         (ends_df['date_captured'].dt.date == label_time.date())
#     ]

#     if candidates.empty:
#         return np.nan, np.nan, '❌ No Ends Found (Filtered)'

#     best_score = -1
#     best_end_url = None

#     for _, end_row in candidates.iterrows():
#         end_img = load_image_from_url(end_row['end_image_url'])
#         if end_img is None:
#             continue
#         try:
#             label_resized = cv2.resize(label_img, (500, 500))
#             end_resized = cv2.resize(end_img, (500, 500))
#             score, _ = ssim(label_resized, end_resized, full=True)
#         except:
#             score = np.nan

#         if score > best_score:
#             best_score = score
#             best_end_url = end_row['end_image_url']

#     if best_end_url:
#         return best_end_url, best_score, '✅ Best SSIM Match'
#     else:
#         return np.nan, np.nan, '❌ No Visual Match'
# # Apply Matching
# tqdm.pandas(desc="Matching Labels to Ends (SSIM)")
# labels_df[['matched_end_image_url', 'ssim_score', 'match_type']] = labels_df.progress_apply(
#     lambda row: pd.Series(match_label_to_filtered_ends(row, ends_df)), axis=1
# )

# # Save Results
# labels_df.to_csv("off_location_visual_match_ssim.csv", index=False)
# print("✅ Visual matching (SSIM) complete. Output saved as 'off_location_visual_match_ssim.csv'.")

  df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])


In [10]:
labels_df.head()

Unnamed: 0,id,visit_id,visit_created,date_captured,end_image_url,label_image_url,store_type,store_type_id,store_suburb,store_postcode,...,label_type,brand,product_name,pack_size,current_price,was_price,unit_price,savings,multibuy_unit,multibuy_price
252,lblID-1195330,352670,14/5/2025,2025-05-14 14:38:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Croydon,3136,...,30%_off,OREO,OREO PASCALL MARSHMALLOWS SLUG,131G,2.0,3,0.0,1.0,0,0.0
253,lblID-1195392,352670,14/5/2025,2025-05-14 14:35:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Croydon,3136,...,30%_off,VICKS,VICKS VAPOUR SHOWER 5PK,5PACK,10.5,15,0.0,4.5,0,0.0
254,lblID-1195388,352670,14/5/2025,2025-05-14 14:42:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Croydon,3136,...,half_price,DARRELL LEA,DARRELL LEA MILK CHOCOLATE LOVE HEART,100G,4.5,9,4.5,4.5,0,0.0
255,lblID-1195386,352670,14/5/2025,2025-05-14 14:35:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Croydon,3136,...,half_price,BLACKMORES,LYP-SINE COLD SORE RELIEF TABLETS,30 PACK,7.0,14,23.33,7.0,0,0.0
256,lblID-1195385,352670,14/5/2025,2025-05-14 14:40:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Croydon,3136,...,special_multibuy,RED BULL,SUGAR FREE ENERGY DRINK SINGLE CAN,250ML,3.25,0,3.25,1.0,2,5.5


In [12]:
candidat_df = ends_df[((labels_df['date_captured'].dt.date == ends_df['date_captured'].dt.date))]

ValueError: Can only compare identically-labeled Series objects

In [5]:
labels_df.columns

Index(['id', 'visit_id', 'visit_created', 'date_captured', 'end_image_url',
       'label_image_url', 'store_type', 'store_type_id', 'store_suburb',
       'store_postcode', 'store_state', 'location_tag', 'end_location',
       'end_location_number', 'sku', 'label_type', 'brand', 'product_name',
       'pack_size', 'current_price', 'was_price', 'unit_price', 'savings',
       'multibuy_unit', 'multibuy_price'],
      dtype='object')

In [6]:
ends_df.columns

Index(['id', 'visit_id', 'visit_created', 'date_captured', 'end_image_url',
       'label_image_url', 'store_type', 'store_type_id', 'store_suburb',
       'store_postcode', 'store_state', 'location_tag', 'end_location',
       'end_location_number', 'sku', 'label_type', 'brand', 'product_name',
       'pack_size', 'current_price', 'was_price', 'unit_price', 'savings',
       'multibuy_unit', 'multibuy_price'],
      dtype='object')

In [8]:
candidates = labels_df[
        (labels_df['visit_id'] == ends_df['visit_id']) &
        (labels_df['store_type'] == ends_df['store_type']) &
        (labels_df['end_location_number'] == ends_df['end_location_number']) &
        (labels_df['date_captured'].dt.date == ends_df.date())
    ]

ValueError: Can only compare identically-labeled Series objects

In [None]:
import pandas as pd

# Load Data
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# 1️⃣ Filter by specific datetime (replace with your target)
target_time = pd.Timestamp("2025-05-14 16:44:00")


# Filter: Keep rows within a small window (+/- 5 seconds)
time_window = '5s'
filtered_df = df[
    (df['date_captured'] >= target_time - pd.Timedelta(time_window)) &
    (df['date_captured'] <= target_time + pd.Timedelta(time_window))
].copy()

print(f"✅ Filtered {len(filtered_df)} rows around {target_time}")

# 2️⃣ Split into Ends and Labels
ends_df = filtered_df[filtered_df['sku'] == '-'].copy()
labels_df = filtered_df[filtered_df['sku'] != '-'].copy()

print(f"✅ {len(ends_df)} Ends and {len(labels_df)} Labels in filtered data.")

# 3️⃣ Save for SSIM matching later (optional)
# ends_df.to_csv("filtered_ends.csv", index=False)
# labels_df.to_csv("filtered_labels.csv", index=False)

✅ Filtered 8 rows around 2025-05-14 16:44:00
✅ 4 Ends and 4 Labels in filtered data.


  df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])


In [16]:
ends_df

Unnamed: 0,id,visit_id,visit_created,date_captured,end_image_url,label_image_url,store_type,store_type_id,store_suburb,store_postcode,...,label_type,brand,product_name,pack_size,current_price,was_price,unit_price,savings,multibuy_unit,multibuy_price
710,incID-813156,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Penrith,2750,...,-,-,-,-,-,-,-,-,-,-
712,incID-813160,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Penrith,2750,...,-,-,-,-,-,-,-,-,-,-
713,incID-813158,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Penrith,2750,...,-,-,-,-,-,-,-,-,-,-
716,incID-813154,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,-,Coles,15,Penrith,2750,...,-,-,-,-,-,-,-,-,-,-


In [15]:
labels_df

Unnamed: 0,id,visit_id,visit_created,date_captured,end_image_url,label_image_url,store_type,store_type_id,store_suburb,store_postcode,...,label_type,brand,product_name,pack_size,current_price,was_price,unit_price,savings,multibuy_unit,multibuy_price
701,lblID-1196053,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Penrith,2750,...,special,HEINZ,BEANZ BAKED BEANS IN TOMATO SAUCE,555G,3,3.8,0.54,0.8,0,0
706,lblID-1196054,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Penrith,2750,...,special,HEINZ,SPAGHETTI IN TOMATO SAUCE PASTA,535G,3,3.8,0.56,0.8,0,0
707,lblID-1196056,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Penrith,2750,...,special_multibuy,CHEEZELS,6 PACK OR DORITOS CORN CHIPS,150G-170G,4,0.0,0.0,1.0,2,7
708,lblID-1196050,352779,14/5/2025,2025-05-14 16:44:00,https://dtexg3-images.s3.ap-southeast-2.amazon...,https://dtexg3-images.s3.ap-southeast-2.amazon...,Coles,15,Penrith,2750,...,special_multibuy,ARNOTTS,SHORTBREAD CREAM BISCUITS,250G,4,0.0,0.0,2.0,2,6


In [7]:
import pandas as pd
import numpy as np
import cv2
import requests
from skimage.metrics import structural_similarity as ssim
from tqdm import tqdm

# 1️⃣ Load your report data
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# 2️⃣ Filter for a specific date (e.g., 14th May 2025)
target_time = pd.Timestamp("2025-05-14 16:44:00")

df = df[df['date_captured'] == target_time]
print(f"✅ Filtered {len(df)} rows for date {target_time}")

# 3️⃣ Split into Ends (sku == '-') and Labels (sku != '-')
ends_df = df[df['sku'] == '-'].copy()
labels_df = df[df['sku'] != '-'].copy()

ends_df.to_csv("filtered_ends.csv", index=False)
labels_df.to_csv("filtered_labels.csv", index=False)

print(f"✅ Saved {len(ends_df)} ends and {len(labels_df)} labels.")

# 4️⃣ Define function to load image from URL
def load_image_from_url(url, color=cv2.IMREAD_GRAYSCALE):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        return cv2.imdecode(img_array, color)
    except:
        return None

# 5️⃣ Match Labels to Ends using SSIM
results = []
for idx, label_row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Matching Labels to Ends"):
    label_url = label_row['end_image_url']
    label_img = load_image_from_url(label_url)
    if label_img is None:
        continue

    best_score = -1
    best_end_url = None

    for _, end_row in ends_df.iterrows():
        end_url = end_row['end_image_url']
        end_img = load_image_from_url(end_url)
        if end_img is None:
            continue

    try:
        # label_resized = cv2.resize(label_img, (500, 500))
        # end_resized = cv2.resize(end_img, (500, 500))
        score, _ = ssim(end_img, label_img, full=True)
        print(f"Label {label_url} vs End {end_url}: SSIM = {score:.4f}")
    except:
        score = np.nan

        # try:
        #     # label_resized = cv2.resize(label_img, (500, 500))
        #     # end_resized = cv2.resize(end_img, (500, 500))
        #     score, _ = ssim(end_img, label_img, full=True)
        # except:
        #     score = np.nan

        if score > best_score:
            best_score = score
            best_end_url = end_url

    results.append({
        'label_end_image_url': label_url,
        'best_end_image_url': best_end_url,
        'ssim_score': best_score
    })

# 6️⃣ Save Results
ssim_results_df = pd.DataFrame(results)
ssim_results_df.to_csv("ssim_results_filtered.csv", index=False)

print("✅ SSIM matching complete. Results saved to 'ssim_results_filtered.csv'.")
print(ssim_results_df.head())

  df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])


✅ Filtered 8 rows for date 2025-05-14 16:44:00
✅ Saved 4 ends and 4 labels.


Matching Labels to Ends:  25%|██▌       | 1/4 [00:03<00:10,  3.48s/it]

Label https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164456-18WBXa_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg vs End https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: SSIM = 0.3676


Matching Labels to Ends:  50%|█████     | 2/4 [00:06<00:06,  3.35s/it]

Label https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164453-QcJwkL_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg vs End https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: SSIM = 0.3782


Matching Labels to Ends:  75%|███████▌  | 3/4 [00:10<00:03,  3.41s/it]

Label https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164417-bUZfVV_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg vs End https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: SSIM = 0.3814


Matching Labels to Ends: 100%|██████████| 4/4 [00:13<00:00,  3.42s/it]

Label https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164404-NFkyfx_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg vs End https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: SSIM = 0.4071
✅ SSIM matching complete. Results saved to 'ssim_results_filtered.csv'.
                                 label_end_image_url best_end_image_url  \
0  https://dtexg3-images.s3.ap-southeast-2.amazon...               None   
1  https://dtexg3-images.s3.ap-southeast-2.amazon...               None   
2  https://dtexg3-images.s3.ap-southeast-2.amazon...               None   
3  https://dtexg3-images.s3.ap-southeast-2.amazon...               None   

   ssim_score  
0          -1  
1          -1  
2          -1  
3          -1  





In [2]:
df['date_captured']

Series([], Name: date_captured, dtype: datetime64[ns])

In [8]:
import pandas as pd
import numpy as np
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm
import requests
from io import BytesIO
from sklearn.metrics.pairwise import cosine_similarity

# Load CLIP Model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function to get CLIP embeddings
def get_clip_embedding(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            embedding = model.get_image_features(**inputs)
        return embedding.cpu().numpy().flatten()
    except:
        return None

# Load report data
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter for specific date (no time window)
target_datetime = pd.Timestamp("2025-05-14 16:44:00")

filtered_df = df[df['date_captured'] == target_datetime].copy()
print(f"✅ Filtered {len(filtered_df)} rows for exact datetime {target_datetime}")

# Split into ends and labels
ends_df = filtered_df[filtered_df['sku'] == '-'].copy()
labels_df = filtered_df[filtered_df['sku'] != '-'].copy()

print(f"✅ {len(ends_df)} ends and {len(labels_df)} labels found.")

# Cosine Similarity Matching
results = []
for idx, label_row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Matching"):
    label_emb = get_clip_embedding(label_row['end_image_url'])
    if label_emb is None:
        continue

    best_score = -1
    best_end_url = None

    for _, end_row in ends_df.iterrows():
        end_emb = get_clip_embedding(end_row['end_image_url'])
        if end_emb is None:
            continue

        score = cosine_similarity([label_emb], [end_emb])[0][0]
        if score > best_score:
            best_score = score
            best_end_url = end_row['end_image_url']

    results.append({
        'label_end_image_url': label_row['end_image_url'],
        'best_end_image_url': best_end_url,
        'cosine_similarity': best_score
    })

# Save results
cosine_results_df = pd.DataFrame(results)
cosine_results_df.to_csv("cosine_similarity_results.csv", index=False)

print("✅ Cosine similarity matching complete!")
print(cosine_results_df.head())

  from .autonotebook import tqdm as notebook_tqdm
  df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])


✅ Filtered 8 rows for exact datetime 2025-05-14 16:44:00
✅ 4 ends and 4 labels found.


Matching: 100%|██████████| 4/4 [00:13<00:00,  3.45s/it]

✅ Cosine similarity matching complete!
                                 label_end_image_url  \
0  https://dtexg3-images.s3.ap-southeast-2.amazon...   
1  https://dtexg3-images.s3.ap-southeast-2.amazon...   
2  https://dtexg3-images.s3.ap-southeast-2.amazon...   
3  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                                  best_end_image_url  cosine_similarity  
0  https://dtexg3-images.s3.ap-southeast-2.amazon...           0.787084  
1  https://dtexg3-images.s3.ap-southeast-2.amazon...           0.770264  
2  https://dtexg3-images.s3.ap-southeast-2.amazon...           0.830588  
3  https://dtexg3-images.s3.ap-southeast-2.amazon...           0.816342  





In [None]:
import torch
import requests
import numpy as np
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import tempfile
import os
import time

# Add GroundingDINO path
import sys
sys.path.append("/Users/vrajnena/Desktop/Ml/GroundingDINO")
from groundingdino.util.inference import load_model, predict, load_image
from transformers import CLIPProcessor, CLIPModel

# ===================
# 🔹 Load Models
# ===================
# 

# GroundingDINO
dino_config = "/Users/vrajnena/Desktop/Ml/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
dino_weights = "/Users/vrajnena/Desktop/Ml/GroundingDINO/weights/groundingdino_swint_ogc.pth"
dino_model = load_model(dino_config, dino_weights)

# CLIP
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# ===================
# 🔹 Functions
# ===================
def download_image_to_temp(url, max_retries=3, timeout=30):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
                f.write(response.content)
                return f.name
        except Exception as e:
            print(f"⚠️ Attempt {attempt+1} failed for {url}: {e}")
            time.sleep(1)
    print(f"❌ Failed to download image after {max_retries} attempts: {url}")
    return None

def get_clip_embedding(img_pil):
    try:
        inputs = clip_processor(images=img_pil, return_tensors="pt")
        with torch.no_grad():
            emb = clip_model.get_image_features(**inputs)
        return emb.cpu().numpy().flatten()
    except Exception as e:
        print(f"⚠️ CLIP embedding failed: {e}")
        return None

def detect_regions(end_url, prompt):
    try:
        temp_path = download_image_to_temp(end_url)
        if temp_path is None:
            return []

        img_pil, image_tensor = load_image(temp_path)
        image_tensor = image_tensor.unsqueeze(0)
        # print(image_tensor)

        boxes, logits, phrases = predict(
            model=dino_model,
            image=image_tensor,
            caption=prompt,
            box_threshold=0.3,
            text_threshold=0.25
        )

        W, H = img_pil.size
        crops = []
        for box in boxes:
            x0, y0, x1, y1 = box
            x0, y0, x1, y1 = int(x0 * W), int(y0 * H), int(x1 * W), int(y1 * H)
            crop = img_pil.crop((x0, y0, x1, y1))
            crops.append(crop)
        print(crops)
        os.remove(temp_path)  # Clean up
        return crops

    except Exception as e:
        print(f"❌ Error in detect_regions for {end_url}: {e}")
        return []

# ===================
# 🔹 Load Data
# ===================
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter for specific datetime
target_time = pd.Timestamp("2025-05-14 16:44:00")
filtered = df[df['date_captured'] == target_time]
ends_df = filtered[filtered['sku'] == '-']
labels_df = filtered[filtered['sku'] != '-']

print(f"✅ Filtered: {len(ends_df)} ends and {len(labels_df)} labels for {target_time}")

# ===================
# 🔹 Matching Process
# ===================
results = []
for idx, label_row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Matching"):
    label_url = label_row['end_image_url']
    temp_label_path = download_image_to_temp(label_url)
    if temp_label_path is None:
        continue

    label_img = Image.open(temp_label_path).convert("RGB")
    label_emb = get_clip_embedding(label_img)
    os.remove(temp_label_path)

    if label_emb is None:
        continue

    best_score, best_end_url = -1, None

    for _, end_row in ends_df.iterrows():
        end_url = end_row['end_image_url']
        regions = detect_regions(end_url, prompt="price tag, product label, discount tag")

        for region in regions:
            region_emb = get_clip_embedding(region)
            if region_emb is None:
                continue
            score = cosine_similarity([label_emb], [region_emb])[0][0]

            if score > best_score:
                best_score = score
                best_end_url = end_url

    results.append({
        'label_end_image_url': label_url,
        'best_end_image_url': best_end_url,
        'cosine_similarity': best_score,
        'label_product': label_row.get('product_name', ''),
        'label_brand': label_row.get('brand', ''),
    })

# ===================
# 🔹 Save Results
# ===================
pd.DataFrame(results).to_csv("groundingdino_clip_results.csv", index=False)
print("✅ Matching complete! Results saved to 'groundingdino_clip_results.csv'.")

final text_encoder_type: bert-base-uncased




✅ Filtered: 4 ends and 4 labels for 2025-05-14 16:44:00




❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164414-szsjlq_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164449-9q8J4Y_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164427-sVfMHi_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported


Matching:  25%|██▌       | 1/4 [00:02<00:08,  2.84s/it]

❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: not supported




❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164414-szsjlq_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164449-9q8J4Y_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164427-sVfMHi_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported


Matching:  50%|█████     | 2/4 [00:05<00:05,  2.78s/it]

❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: not supported




❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164414-szsjlq_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164449-9q8J4Y_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164427-sVfMHi_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported


Matching:  75%|███████▌  | 3/4 [00:08<00:02,  2.80s/it]

❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: not supported




❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164414-szsjlq_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164449-9q8J4Y_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported
❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164427-sVfMHi_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_false.jpg: not supported


Matching: 100%|██████████| 4/4 [00:11<00:00,  2.80s/it]

❌ Error in detect_regions for https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-164401-lgnixh_cc9c5d06-72be-49f1-a3ce-a005b450bf88_3_eoa_12568_-33.7508916_150.693301_android_34_11p03-01_b0_s0_true.jpg: not supported
✅ Matching complete! Results saved to 'groundingdino_clip_results.csv'.





In [28]:
# Final Pipeline Script - Part 1

import os
import sys
import torch
import requests
import numpy as np
import pandas as pd
import tempfile
import time
from io import BytesIO
from PIL import Image
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from transformers import CLIPProcessor, CLIPModel

# Import GroundingDINO
import sys
sys.path.append("/Users/vrajnena/Desktop/Ml/GroundingDINO")
from groundingdino.util.inference import load_model, predict, load_image

# ================================
# 🔹 Model Setup
# ================================
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load GroundingDINO
dino_config = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
dino_weights = "GroundingDINO/weights/groundingdino_swint_ogc.pth"
dino_model = load_model(dino_config, dino_weights).to(device)

# Load CLIP
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# ================================
# 🔹 Helper Functions
# ================================
def download_image(url, retries=3, timeout=30):
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert("RGB")
        except Exception as e:
            print(f"⚠️ Attempt {attempt+1} failed for {url}: {e}")
            time.sleep(1)
    print(f"❌ Failed to download image: {url}")
    return None

def get_clip_embedding(pil_image):
    try:
        inputs = clip_processor(images=pil_image, return_tensors="pt").to(device)
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
        return features.cpu().numpy().flatten()
    except Exception as e:
        print(f"⚠️ CLIP embedding error: {e}")
        return None

def detect_dino_regions(pil_image, prompt="price tag, product label, discount tag"):
    try:
        # Preprocess for GroundingDINO
        image_tensor = clip_processor(images=pil_image, return_tensors="pt")['pixel_values'].to(device)
        image_tensor = image_tensor.squeeze(0)

        W, H = pil_image.size
        boxes, logits, phrases = predict(
            model=dino_model,
            image=image_tensor,
            caption=prompt,
            box_threshold=0.3,
            text_threshold=0.25
        )

        crops = []
        for box in boxes:
            x0, y0, x1, y1 = box
            # Convert to pixel coordinates
            x0, x1 = sorted([int(x0 * W), int(x1 * W)])
            y0, y1 = sorted([int(y0 * H), int(y1 * H)])
            # Ensure valid box
            x0 = max(0, x0)
            y0 = max(0, y0)
            x1 = min(W, x1)
            y1 = min(H, y1)
            if x1 - x0 <= 0 or y1 - y0 <= 0:
                continue  # Skip invalid boxes
            crops.append(pil_image.crop((x0, y0, x1, y1)))

        return crops

    except Exception as e:
        print(f"❌ DINO detection error: {e}")
        return []
# ================================
# 🔹 Load Data
# ================================
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

target_time = pd.Timestamp("2025-05-14 16:44:00")
filtered = df[df['date_captured'] == target_time]
ends_df = filtered[filtered['sku'] == '-']
labels_df = filtered[filtered['sku'] != '-']

print(f"✅ Filtered {len(ends_df)} ends and {len(labels_df)} labels for {target_time}")

# ================================
# 🔹 Visual-First Matching
# ================================
results = []
for idx, label_row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Matching Labels"):
    label_url = label_row['end_image_url']
    label_img = download_image(label_url)
    if label_img is None:
        continue

    label_emb = get_clip_embedding(label_img)
    if label_emb is None:
        continue

    best_score, best_end_url = -1, None

    for _, end_row in ends_df.iterrows():
        end_url = end_row['end_image_url']
        end_img = download_image(end_url)
        if end_img is None:
            continue

        regions = detect_dino_regions(end_img)
        for region in regions:
            region_emb = get_clip_embedding(region)
            if region_emb is None:
                continue
            score = cosine_similarity([label_emb], [region_emb])[0][0]
            if score > best_score:
                best_score = score
                best_end_url = end_url

    results.append({
        'label_end_image_url': label_url,
        'best_end_image_url': best_end_url,
        'cosine_similarity': best_score,
        'label_product': label_row.get('product_name', ''),
        'label_brand': label_row.get('brand', ''),
    })

pd.DataFrame(results).to_csv("groundingdino_clip_results.csv", index=False)
print("✅ Matching complete! Results saved to 'groundingdino_clip_results.csv'.")

final text_encoder_type: bert-base-uncased




✅ Filtered 4 ends and 4 labels for 2025-05-14 16:44:00


Matching Labels: 100%|██████████| 4/4 [00:19<00:00,  4.95s/it]

✅ Matching complete! Results saved to 'groundingdino_clip_results.csv'.





In [31]:
import os
import pandas as pd
import numpy as np
from google.cloud import vision
from google.oauth2 import service_account
from fuzzywuzzy import fuzz
from tqdm import tqdm

# ====================================
# 1️⃣ Google Cloud Client Setup
# ====================================
SERVICE_ACCOUNT_JSON = "/Users/vrajnena/Desktop/Ml/upbeat-airfoil-439209-q3-ab31c54e6301.json"  # <-- Update
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_JSON)
client = vision.ImageAnnotatorClient(credentials=credentials)

# ====================================
# 2️⃣ OCR Function for GCS URL
# ====================================
def extract_text_from_gcs(gcs_uri):
    image = vision.Image()
    image.source.image_uri = gcs_uri

    response = client.document_text_detection(image=image)
    if response.error.message:
        print(f"❌ Error in Vision API: {response.error.message}")
        return ""

    text = response.full_text_annotation.text
    return text

# ====================================
# 3️⃣ Matching Logic
# ====================================
def match_labels_to_ends(labels_df, ends_df):
    results = []
    for idx, label_row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Matching Labels to Ends"):
        label_text = f"{label_row['brand']} {label_row['product_name']}"

        best_score = -1
        best_end_url = None
        for _, end_row in ends_df.iterrows():
            end_text = end_row['ocr_text']
            score = fuzz.token_set_ratio(label_text, end_text)
            if score > best_score:
                best_score = score
                best_end_url = end_row['end_image_url']

        results.append({
            'label_image_url': label_row['label_image_url'],
            'best_end_image_url': best_end_url,
            'fuzzy_score': best_score,
            'label_text': label_text
        })

    return pd.DataFrame(results)

# ====================================
# 4️⃣ Load Data (from your report)
# ====================================
df = pd.read_csv("20250514-report.csv")
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter for specific datetime
target_time = pd.Timestamp("2025-05-14 16:44:00")
filtered = df[df['date_captured'] == target_time]

# Split into Ends (sku == '-') and Labels (sku != '-')
ends_df = filtered[filtered['sku'] == '-'].copy()
labels_df = filtered[filtered['sku'] != '-'].copy()

# ====================================
# 5️⃣ Extract OCR for Ends
# ====================================
print("🔍 Running OCR for End Images...")
ocr_texts = []
for url in tqdm(ends_df['end_image_url']):
    text = extract_text_from_gcs(url)
    ocr_texts.append(text)
ends_df['ocr_text'] = ocr_texts

# Save OCR results
ends_df.to_csv("ends_with_ocr.csv", index=False)
print("✅ OCR complete. Saved to 'ends_with_ocr.csv'.")

# ====================================
# 6️⃣ Match Labels to Ends
# ====================================
matched_df = match_labels_to_ends(labels_df, ends_df)
matched_df.to_csv("gcv_label_to_end_mapping.csv", index=False)
print("✅ Final mapping saved to 'gcv_label_to_end_mapping.csv'.")

🔍 Running OCR for End Images...


0it [00:00, ?it/s]


✅ OCR complete. Saved to 'ends_with_ocr.csv'.


Matching Labels to Ends: 0it [00:00, ?it/s]

✅ Final mapping saved to 'gcv_label_to_end_mapping.csv'.





In [32]:
df =pd.read_csv("20250514-report.csv")

In [36]:
len(df['brand'].unique())

424

In [None]:
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter out NaN and deduplicate
unique_products = df[['brand', 'product_name', 'pack_size','end_image_url']].dropna().drop_duplicates()

In [38]:
unique_products



Unnamed: 0,brand,product_name,pack_size
0,-,-,-
1,RED TRACTOR,AUSTRALIAN ROLLED OATS,1KG
2,Gold Sunset,Canola Oil,2L
3,SISTEMA,ULTRA CONTAINER 4L,1 EACH
4,DECOR,QUAD BANDS TRITAN BOTTLE 750ML,1 EACH
...,...,...,...
4375,NONE,MATT MORANSTOCK CHICKEN,500ML
4378,CAMPBELLS,CAMPBELLS COUNTRY LADLE SOUP MINESTRONE,495G
4380,CAPTAINS TABLE,CAPTAINS TABLE WATER CRACKER,125G
4387,HEINZ,HEINZ CLASSIC PEA & HAM SOUP CANNED SOUP READY...,535G


In [39]:
df = df[~df['end_location'].str.lower().str.contains('off', na=False)].copy()

# Sidebar Filters
# st.sidebar.title("🛒 Product Filter Dashboard")

store_types = df['store_type'].dropna().unique()
# store_type = st.sidebar.selectbox("Select Store Type", store_types)

store_suburbs = df[df['store_type'] == 'Coles']['store_suburb'].dropna().unique()
# store_suburb = st.sidebar.selectbox("Select Store Suburb", store_suburbs)

unique_products = df[(df['store_type'] == 'Coles')]['product_name'].dropna().unique()
# selected_product = st.sidebar.selectbox("Select Product", unique_products)

# Filtered Data
filtered_df = df[
    (df['store_type'] == store_types) &
    (df['store_suburb'] == store_suburbs) &
    (df['product_name'] == unique_products)
]


ValueError: ('Lengths must match to compare', (3289,), (2,))

In [4]:
import pandas as pd

# 1️⃣ Load the report CSV
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')
df =df[df['end_location'].str.contains("Off Location", case=False, na=False)].copy()

# 2️⃣ Split into Ends (sku == '-') and Labels (sku != '-')
ends_df = df[df['sku'] == '-'].copy()
labels_df = df[df['sku'] != '-'].copy()

print(f"✅ Loaded {len(ends_df)} ends and {len(labels_df)} labels.")

# 3️⃣ Function to generate a combined key for matching
def generate_key(row):
    return f"{row['visit_id']}|{row['store_type']}|{row['store_suburb']}|{row['end_location']}|{row['date_captured']}"

# 4️⃣ Create keys for matching
ends_df['match_key'] = ends_df.apply(generate_key, axis=1)
labels_df['match_key'] = labels_df.apply(generate_key, axis=1)

# 5️⃣ Match labels to ends based on match_key
match_results = []

for idx, label_row in labels_df.iterrows():
    label_key = label_row['match_key']
    candidates = ends_df[ends_df['match_key'] == label_key]

    if not candidates.empty:
        # Take the first match (can be adjusted if you prefer logic like closest timestamp etc.)
        matched_end = candidates.iloc[0]
        match_results.append({
            'label_id': label_row['id'],
            'label_image_url': label_row['label_image_url'],
            'label_product': f"{label_row['brand']} {label_row['product_name']}",
            'end_image_url': matched_end['end_image_url'],
            'matched_on': label_key
        })
    else:
        match_results.append({
            'label_id': label_row['id'],
            'label_image_url': label_row['label_image_url'],
            'label_product': f"{label_row['brand']} {label_row['product_name']}",
            'end_image_url': None,
            'matched_on': 'No Match'
        })

# 6️⃣ Convert to DataFrame and save
final_matches_df = pd.DataFrame(match_results)
final_matches_df.to_csv("label_to_end_matches.csv", index=False)

print(f"✅ Matching complete! Results saved to 'label_to_end_matches.csv'.")
print(final_matches_df.head())

✅ Loaded 506 ends and 606 labels.
✅ Matching complete! Results saved to 'label_to_end_matches.csv'.
        label_id                                    label_image_url  \
0  lblID-1195330  https://dtexg3-images.s3.ap-southeast-2.amazon...   
1  lblID-1195392  https://dtexg3-images.s3.ap-southeast-2.amazon...   
2  lblID-1195388  https://dtexg3-images.s3.ap-southeast-2.amazon...   
3  lblID-1195386  https://dtexg3-images.s3.ap-southeast-2.amazon...   
4  lblID-1195385  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                                       label_product  \
0                OREO OREO PASCALL MARSHMALLOWS SLUG   
1                      VICKS VICKS VAPOUR SHOWER 5PK   
2  DARRELL LEA DARRELL LEA MILK CHOCOLATE LOVE HEART   
3       BLACKMORES LYP-SINE COLD SORE RELIEF TABLETS   
4        RED BULL SUGAR FREE ENERGY DRINK SINGLE CAN   

                                       end_image_url  \
0  https://dtexg3-images.s3.ap-southeast-2.amazon...   
1  https://dtexg3-images

In [None]:
import pandas as pd

# 1️⃣ Load the report CSV
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')
df =df[df['end_location'].str.contains("Off Location", case=False, na=False)].copy()

# 2️⃣ Split into Ends (sku == '-') and Labels (sku != '-')
ends_df = df[df['sku'] == '-'].copy()
labels_df = df[df['sku'] != '-'].copy()

print(f"✅ Loaded {len(ends_df)} ends and {len(labels_df)} labels.")

# 3️⃣ Function to generate a combined key for matching
def generate_key(row):
    return f"{row['visit_id']}|{row['store_type']}|{row['store_suburb']}|{row['end_location']}|{row['date_captured']}"

# 4️⃣ Create keys for matching
ends_df['match_key'] = ends_df.apply(generate_key, axis=1)
labels_df['match_key'] = labels_df.apply(generate_key, axis=1)

# 5️⃣ Pre-compute counts of ends per key
ends_counts = ends_df['match_key'].value_counts().to_dict()

# 6️⃣ Match labels to ends based on match_key (only if exactly 1 end)
match_results = []

for idx, label_row in labels_df.iterrows():
    label_key = label_row['match_key']
    match_count = ends_counts.get(label_key, 0)

    if match_count == 1:
        matched_end = ends_df[ends_df['match_key'] == label_key].iloc[0]
        match_results.append({
            'label_id': label_row['id'],
            'label_image_url': label_row['label_image_url'],
            'label_product': f"{label_row['brand']} {label_row['product_name']}",
            'end_image_url': matched_end['end_image_url'],
            'brand':matched_end['brand']
            'matched_on': label_key
        })
    else:
        match_results.append({
            'label_id': label_row['id'],
            'label_image_url': label_row['label_image_url'],
            'label_product': f"{label_row['brand']} {label_row['product_name']}",
            'end_image_url': None,
            'matched_on': 'No Match (multiple ends or none)'
        })

# 7️⃣ Convert to DataFrame and save
final_matches_df = pd.DataFrame(match_results)
final_matches_df.to_csv("label_to_end_matches_filtered.csv", index=False)

print(f"✅ Matching complete! Results saved to 'label_to_end_matches_filtered.csv'.")
print(final_matches_df.head())

✅ Loaded 506 ends and 606 labels.
✅ Matching complete! Results saved to 'label_to_end_matches_filtered.csv'.
        label_id                                    label_image_url  \
0  lblID-1195330  https://dtexg3-images.s3.ap-southeast-2.amazon...   
1  lblID-1195392  https://dtexg3-images.s3.ap-southeast-2.amazon...   
2  lblID-1195388  https://dtexg3-images.s3.ap-southeast-2.amazon...   
3  lblID-1195386  https://dtexg3-images.s3.ap-southeast-2.amazon...   
4  lblID-1195385  https://dtexg3-images.s3.ap-southeast-2.amazon...   

                                       label_product  \
0                OREO OREO PASCALL MARSHMALLOWS SLUG   
1                      VICKS VICKS VAPOUR SHOWER 5PK   
2  DARRELL LEA DARRELL LEA MILK CHOCOLATE LOVE HEART   
3       BLACKMORES LYP-SINE COLD SORE RELIEF TABLETS   
4        RED BULL SUGAR FREE ENERGY DRINK SINGLE CAN   

                                       end_image_url  \
0                                               None   
1  https://dtex