In [1]:
import torch
from transformers import AutoFeatureExtractor, AutoModel
from PIL import Image
import requests
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "google/vit-base-patch16-224-in21k"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()



ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [3]:
def get_image_embedding(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt")
    
    # Extract features
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)  # Normalize
    return embeddings

In [4]:
import pandas as pd

# 1️⃣ Load Report
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# 2️⃣ Filter for Off Location only
df_off = df[df['end_location'].str.contains("Off Location", case=False, na=False)].copy()

# 3️⃣ Split into Ends (sku == '-') and Labels (sku != '-')
ends_df = df_off[df_off['sku'] == '-'].copy()
labels_df = df_off[df_off['sku'] != '-'].copy()

print(f"✅ Ends: {len(ends_df)}, Labels: {len(labels_df)}")

# 4️⃣ Create lists
end_image_urls = ends_df['end_image_url'].dropna().unique().tolist()
label_image_urls = labels_df['label_image_url'].dropna().unique().tolist()

# 5️⃣ Save to CSV
pd.DataFrame({'end_image_url': end_image_urls}).to_csv('end_image_urls.csv', index=False)
pd.DataFrame({'label_image_url': label_image_urls}).to_csv('label_image_urls.csv', index=False)

print(f"✅ End image paths and label image paths saved.")
print(f"Sample End Image URLs:\n{end_image_urls[:5]}")
print(f"Sample Label Image URLs:\n{label_image_urls[:5]}")

✅ Ends: 506, Labels: 606
✅ End image paths and label image paths saved.
Sample End Image URLs:
['https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-144302-neJ3bY_c28d7da5-5483-463c-a62a-7100ccd2cd28_3_eoa_13015_-37.79389719008628_145.27940489474503_ios_18.4.1_110_b0_s0_false.jpg', 'https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-144014-duwz67_c28d7da5-5483-463c-a62a-7100ccd2cd28_3_eoa_13015_-37.79389719008628_145.27940489474503_ios_18.4.1_110_b0_s0_true.jpg', 'https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-144010-dCZV5x_c28d7da5-5483-463c-a62a-7100ccd2cd28_3_eoa_13015_-37.79389719008628_145.27940489474503_ios_18.4.1_110_b0_s0_false.jpg', 'https://dtexg3-images.s3.ap-southeast-2.amazonaws.com/mobile_uploads/20250514-143940-jQd6cd_c28d7da5-5483-463c-a62a-7100ccd2cd28_3_eoa_13015_-37.79389719008628_145.27940489474503_ios_18.4.1_110_b0_s0_undefined.jpg', 'https://dtexg3-images.s3.ap-southeast-2.amazonaw

In [5]:
# Extract embedding for the query image
query_embedding = get_image_embedding(label_image_urls)

# Extract embeddings for the dataset images
dataset_embeddings = []
for path in end_image_urls:
    embedding = get_image_embedding(path)
    dataset_embeddings.append(embedding)

# Stack embeddings into a single tensor
dataset_embeddings = torch.vstack(dataset_embeddings)

# Compute cosine similarities
similarities = cosine_similarity(query_embedding.numpy(), dataset_embeddings.numpy())

# Get top-k similar images
top_k = 5
top_k_indices = similarities[0].argsort()[-top_k:][::-1]
top_k_paths = [image_paths[i] for i in top_k_indices]

AttributeError: 'list' object has no attribute 'read'

In [7]:
import os
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity

# Load report CSV
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter off-location
off_location_df = df[df['end_location'].str.contains("Off Location", case=False, na=False)].copy()

# Split into ends and labels
ends_df = off_location_df[off_location_df['sku'] == '-'].copy()
labels_df = off_location_df[off_location_df['sku'] != '-'].copy()

print(f"✅ Found {len(ends_df)} ends and {len(labels_df)} labels in Off-Location.")

# Load CLIP model
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function to download and load image from URL
def load_image_from_url(url):
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert("RGB")
        return img
    except Exception as e:
        print(f"❌ Error loading image {url}: {e}")
        return None

# Get embedding
def get_clip_embedding(image):
    try:
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            emb = clip_model.get_image_features(**inputs)
        return emb.squeeze().numpy()
    except Exception as e:
        print(f"⚠️ Error in CLIP embedding: {e}")
        return None

# Prepare results
results = []

for idx, label_row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Matching"):
    label_img = load_image_from_url(label_row['label_image_url'])
    if label_img is None:
        continue

    label_emb = get_clip_embedding(label_img)
    if label_emb is None:
        continue

    best_score = -1
    best_end_url = None

    # Filter candidate ends by visit_id and store details
    candidates = ends_df[
        (ends_df['visit_id'] == label_row['visit_id']) &
        (ends_df['store_type'] == label_row['store_type']) &
        (ends_df['store_suburb'] == label_row['store_suburb']) 
    ]

    for _, end_row in candidates.iterrows():
        end_img = load_image_from_url(end_row['end_image_url'])
        if end_img is None:
            continue

        end_emb = get_clip_embedding(end_img)
        if end_emb is None:
            continue

        score = cosine_similarity([label_emb], [end_emb])[0][0]

        if score > best_score:
            best_score = score
            best_end_url = end_row['end_image_url']

    results.append({
        'label_image_url': label_row['label_image_url'],
        'label_product': f"{label_row['brand']} {label_row['product_name']}",
        'best_end_image_url': best_end_url,
        'cosine_similarity': best_score
    })

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("cosine_similarity_matches.csv", index=False)
print("✅ Cosine similarity matching complete. Results saved to 'cosine_similarity_matches.csv'.")

✅ Found 506 ends and 606 labels in Off-Location.


Matching:   3%|▎         | 17/606 [14:40<8:28:21, 51.79s/it]


KeyboardInterrupt: 

In [8]:
import os
import pandas as pd

# Load CSV
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter for off-location and target stores
target_stores = ['Croydon', 'Keilor Downs', 'Doncaster']
off_location_df = df[
    (df['end_location'].str.contains("Off Location", case=False, na=False)) &
    (df['store_suburb'].isin(target_stores))
].copy()

# Split
ends_df = off_location_df[off_location_df['sku'] == '-'].copy()
labels_df = off_location_df[off_location_df['sku'] != '-'].copy()

print(f"✅ Filtered {len(ends_df)} ends and {len(labels_df)} labels.")

✅ Filtered 109 ends and 90 labels.


In [None]:
import aiohttp
import aiofiles
import asyncio

async def download_image(session, url, save_path):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    try:
        async with session.get(url, timeout=60) as resp:
            if resp.status == 200:
                f = await aiofiles.open(save_path, mode='wb')
                await f.write(await resp.read())
                await f.close()
                print(f"✅ Saved: {save_path}")
            else:
                print(f"❌ Failed: {url}")
    except Exception as e:
        print(f"⚠️ Error {url}: {e}")

async def download_all(df, url_col, folder):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for _, row in df.iterrows():
            url = row[url_col]
            filename = os.path.join(folder, os.path.basename(url))
            tasks.append(download_image(session, url, filenamme))
        await asyncio.gather(*tasks)

# Download ends and labels
async def main():
    await download_all(ends_df, 'end_image_url', 'images/ends')
    await download_all(labels_df, 'label_image_url', 'images/labels')

if __name__ == "__main__":
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
import os
import pandas as pd
import requests
from time import sleep

# Load your report CSV
df = pd.read_csv("20250514-report.csv", parse_dates=['date_captured'])
df.columns = df.columns.str.strip().str.replace('"', '')

# Filter for off-location + target stores
target_stores = ['Croydon', 'Keilor Downs', 'Doncaster']
off_location_df = df[
    (df['end_location'].str.contains("Off Location", case=False, na=False)) &
    (df['store_suburb'].isin(target_stores))
].copy()

# Split into ends and labels
ends_df = off_location_df[off_location_df['sku'] == '-'].copy()
labels_df = off_location_df[off_location_df['sku'] != '-'].copy()

print(f"✅ Found {len(ends_df)} ends and {len(labels_df)} labels.")

# Download function
def download_image(row, url_col, base_folder):
    url = row[url_col]
    store_type = str(row['store_type']).replace('/', '_').replace('\\', '_')
    store_suburb = str(row['store_suburb']).replace('/', '_').replace('\\', '_')
    
    folder = os.path.join(base_folder, store_type, store_suburb)
    os.makedirs(folder, exist_ok=True)
    
    filename = os.path.join(folder, os.path.basename(url))
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"✅ Downloaded: {filename}")
    except Exception as e:
        print(f"❌ Failed: {url} | Error: {e}")

# Download one by one
print("\n⬇️ Downloading Ends...")
for idx, row in ends_df.iterrows():
    download_image(row, 'end_image_url', 'images/ends')
    sleep(0.5)  # Optional: small pause between requests

print("\n⬇️ Downloading Labels...")
for idx, row in labels_df.iterrows():
    download_image(row, 'label_image_url', 'images/labels')
    sleep(0.5)  # Optional: small pause between requests

print("\n✅ All downloads complete!")