In [1]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torchvision import models, transforms
from PIL import Image
import joblib

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
test_csv_path = r"C:\Users\Ekaansh\OneDrive\Desktop\AB\vs code\JS\projects\hackathon\amazon\data\test_processed.csv"
test_img_folder = r"D:\amazon dataset\AMAZON_ML_IMAGES_TEST"

In [4]:
weights = {"CatBoost": 0.2, "LightGBM": 0.4, "XGBoost": 0.4}

In [5]:
cat_model_path = r"C:\Users\Ekaansh\OneDrive\Desktop\AB\vs code\JS\projects\hackathon\amazon\models\with image\CatBoost_model_gpu.pkl"
lgb_model_path = r"C:\Users\Ekaansh\OneDrive\Desktop\AB\vs code\JS\projects\hackathon\amazon\models\with image\LightGBM_model_gpu.pkl"
xgb_model_path = r"C:\Users\Ekaansh\OneDrive\Desktop\AB\vs code\JS\projects\hackathon\amazon\models\with image\XGBoost_model_gpu.pkl"

In [6]:
batch_size = 32  

In [7]:
test_df = pd.read_csv(test_csv_path)
img_ids = test_df["sample_id"].astype(str).tolist()
print(f"Total test images: {len(img_ids)}")

Total test images: 75000


In [8]:
exclude_cols = ["sample_id", "image_link"]
X_tab = test_df.drop(columns=exclude_cols).values
print(f"Tabular features shape: {X_tab.shape}")

Tabular features shape: (75000, 9)


In [9]:
cnn_model = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.IMAGENET1K_V1)
cnn_model.classifier = torch.nn.Identity()  # remove classification head
cnn_model = cnn_model.to(device)
cnn_model.eval()

transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

In [10]:
features = []
num_batches = int(np.ceil(len(img_ids) / 1))

print("ðŸš€ Generating CNN embeddings for test images...")

with torch.no_grad():
    for i in tqdm(range(0, len(img_ids), 1), total=num_batches, desc="Batches"):
        batch_ids = img_ids[i:i+1]
        batch_imgs = []

        for img_id in batch_ids:
            img_path = os.path.join(test_img_folder, f"{img_id}.jpg")
            if not os.path.exists(img_path):
                batch_imgs.append(torch.zeros(3, 300, 300))
                continue
            try:
                img = Image.open(img_path).convert("RGB")
                batch_imgs.append(transform(img))
            except:
                batch_imgs.append(torch.zeros(3, 300, 300))

        batch_tensor = torch.stack(batch_imgs).to(device)
        batch_features = cnn_model(batch_tensor).cpu().numpy()
        features.append(batch_features)

        torch.cuda.empty_cache()


ðŸš€ Generating CNN embeddings for test images...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 75000/75000 [1:43:33<00:00, 12.07it/s]


In [12]:
# Make sure each feature vector is the same length
features = [f if f.shape[0] == 1536 else np.zeros(1536) for f in features]

# Convert list of arrays to 2D numpy array
features = np.vstack(features).astype(np.float32)
print("âœ… Image embeddings shape:", features.shape)

# Now save
np.save("test_cnn_features_efficientnetb3.npy", features)
print("âœ… Test CNN embeddings saved successfully")


âœ… Image embeddings shape: (75000, 1536)
âœ… Test CNN embeddings saved successfully
