In [2]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

In [3]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [4]:
# Location of the processed training table. Set the TRAIN_CSV_PATH environment
# variable to read a copy stored elsewhere; without it, the original
# machine-specific location below is used unchanged.
csv_path = os.environ.get(
    "TRAIN_CSV_PATH",
    r"C:\Users\Ekaansh\OneDrive\Desktop\AB\vs code\JS\projects\hackathon\amazon\data\train_processed.csv",
)
df = pd.read_csv(csv_path)
print("Dataset loaded. Total samples:", len(df))

Dataset loaded. Total samples: 75000


In [5]:
IMG_SIZE = 224  # height and width, in pixels, every image is resized to

# Preprocessing applied to each image: resize to a fixed square, convert to a
# tensor, then normalise each of the three channels with the given mean/std.
_resize_step = transforms.Resize((IMG_SIZE, IMG_SIZE))
_normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
transform = transforms.Compose([_resize_step, transforms.ToTensor(), _normalize_step])

In [6]:
# Folder holding the training images. Set TRAIN_IMAGE_DIR to use a different
# folder; without it, the original location is used unchanged.
image_dir = os.environ.get("TRAIN_IMAGE_DIR", r"E:\dataset\TRAIN _IMAGES")

In [7]:
class ProductImageDataset(Dataset):
    """Dataset that yields ``(image, price)`` pairs for the rows of a DataFrame.

    The image for a row is read from ``<img_dir>/<sample_id>.jpg`` and converted
    to RGB; the target is the row's ``price`` as a float32 tensor.

    Args:
        df: DataFrame with at least the columns ``sample_id`` and ``price``.
            Rows are addressed by position, so the index does not need to be
            reset after a split.
        img_dir: Directory containing one ``<sample_id>.jpg`` file per row.
        transform: Optional callable applied to the RGB image before it is
            returned.
    """

    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image and price for the row at position ``idx``.

        Returns:
            A tuple ``(image, price)``; ``image`` is the (optionally
            transformed) RGB image and ``price`` a float32 scalar tensor.

        Raises:
            Whatever ``Image.open`` raises when the file is missing or
            unreadable (for example ``FileNotFoundError``).
        """
        # Select the column first: ``df.iloc[idx]`` would build a whole row
        # Series, which upcasts an integer id to float in an all-numeric frame
        # (giving "9.0.jpg") and copies every column for each lookup.
        sample_id = str(self.df['sample_id'].iloc[idx])
        img_path = os.path.join(self.img_dir, f"{sample_id}.jpg")  # adjust extension if needed

        # The context manager closes the file as soon as the pixels have been
        # converted, instead of leaving the handle open until garbage collection.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        price = torch.tensor(self.df['price'].iloc[idx], dtype=torch.float32)
        return image, price

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = ProductImageDataset(train_df, img_dir=image_dir, transform=transform)
val_dataset = ProductImageDataset(val_df, img_dir=image_dir, transform=transform)

# Use multiple workers and pin_memory for faster GPU transfer.
# On Windows, DataLoader workers are started with "spawn"; spawned workers
# typically cannot load a Dataset class that was defined in a notebook cell,
# which makes loading fail or hang. Loading therefore stays in the main
# process there. Pinned memory only helps when batches are copied to CUDA.
num_workers = 0 if os.name == "nt" else 4
pin_memory = device.type == "cuda"
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

# ---------------------------
# Step 6: Load pretrained ResNet50 feature extractor
# ---------------------------
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
# Keep every child module except the last one (the fully connected layer).
feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])  # remove FC layer
feature_extractor = feature_extractor.to(device)
# Evaluation mode: the network is only used for inference here.
feature_extractor.eval()

Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [8]:
def extract_features_safe(dataloader, model, device, dataset_name="train"):
    """Run ``model`` over every batch of ``dataloader`` and collect the outputs.

    Each batch is moved to ``device``, passed through ``model`` under
    ``torch.no_grad()``, flattened to one row per image and gathered as NumPy
    arrays together with the batch targets. A batch that raises during this
    step is reported, counted as skipped, and the run continues.

    Note: only errors raised inside the loop body are caught. An exception
    raised while the dataloader itself produces a batch surfaces in the
    ``for`` statement and still propagates to the caller.

    Args:
        dataloader: Iterable of ``(imgs, prices)`` batches that supports
            ``len()`` and exposes a sized ``dataset`` attribute (used for the
            progress messages).
        model: Callable mapping an image batch to an output tensor.
        device: Device the image batch is moved to before the forward pass.
        dataset_name: Label used in the printed progress messages.

    Returns:
        A tuple ``(features, targets)``: ``features`` is a 2-D array with one
        row per successfully processed image and ``targets`` a 1-D array of
        the matching targets. When no batch was processed, empty arrays of
        shape ``(0, 0)`` and ``(0,)`` are returned.
    """
    feature_chunks = []
    target_chunks = []
    skipped = 0
    total_processed = 0

    print(f"\nStarting extraction for {dataset_name} set...")

    with torch.no_grad():
        for i, (imgs, prices) in enumerate(dataloader):
            try:
                batch = imgs.to(device, non_blocking=True)
                output = model(batch)
                # reshape (rather than view) also accepts non-contiguous outputs.
                output = output.reshape(output.size(0), -1)

                # Convert both arrays before storing either one, so a failure
                # cannot leave features and targets with different lengths.
                batch_features = output.cpu().numpy()
                batch_targets = prices.numpy()
            except Exception as e:
                skipped += imgs.size(0)
                print(f"⚠️ Batch {i+1} skipped due to error: {e}")
                continue

            feature_chunks.append(batch_features)
            target_chunks.append(batch_targets)
            total_processed += imgs.size(0)

            # progress update every 10 batches
            if (i + 1) % 10 == 0 or (i + 1) == len(dataloader):
                print(f"{dataset_name} → Processed {total_processed}/{len(dataloader.dataset)} images")

    if feature_chunks:
        features = np.vstack(feature_chunks)
        targets = np.hstack(target_chunks)
    else:
        # np.vstack / np.hstack raise on an empty list; return empty arrays instead.
        features = np.empty((0, 0), dtype=np.float32)
        targets = np.empty((0,), dtype=np.float32)

    print(f"\n✅ {dataset_name} extraction complete.")
    print(f"Total images processed successfully: {total_processed}")
    print(f"Total images skipped: {skipped}")

    return features, targets


In [None]:
# extract_features_safe returns exactly two values: (features, targets).
train_features, train_targets = extract_features_safe(train_loader, feature_extractor, device, "Train")
val_features, val_targets = extract_features_safe(val_loader, feature_extractor, device, "Validation")

# One feature row is produced per processed image, so the number of skipped
# images is the dataset size minus the number of rows extracted.
train_skipped = len(train_loader.dataset) - len(train_features)
val_skipped = len(val_loader.dataset) - len(val_features)

print("Train skipped files:", train_skipped)
print("Validation skipped files:", val_skipped)



Starting extraction for Train set...


In [None]:
# Write each extracted array to its own .npy file in the working directory so
# later runs can load them instead of repeating the extraction.
_arrays_to_save = (
    ("train_img_features.npy", train_features),
    ("train_targets.npy", train_targets),
    ("val_img_features.npy", val_features),
    ("val_targets.npy", val_targets),
)
for _filename, _array in _arrays_to_save:
    np.save(_filename, _array)

print("Features saved to disk. You can reuse them without re-extracting.")