# Image Feature Extraction

This notebook downloads the product images referenced in the training data, extracts a feature vector for each image using a pretrained ResNet50 CNN, and saves the features to disk.


In [None]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../src')

from utils import download_images, preprocess_image, extract_image_features
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import joblib
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the training table and report its dimensions as a sanity check.
train_df = pd.read_csv('../dataset/train.csv')
print(f"Training data shape: {train_df.shape}")

# Hand the table to the project's download helper (slow step).
# NOTE(review): download_images lives in ../src/utils; the meaning of
# max_retries/delay is assumed from the keyword names - confirm there.
print("Downloading images...")
download_results = download_images(
    train_df,
    image_dir='../images',
    max_retries=3,
    delay=1,
)

# Summarise how many rows ended up in each download status.
print("Download results:")
status_counts = download_results['status'].value_counts()
print(status_counts)


In [None]:
# Choose the compute device: CUDA when available, CPU otherwise.
print("Loading pretrained ResNet model...")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Build the feature extractor: every child module of a pretrained ResNet50
# except the last one (the classification layer), wrapped in a Sequential.
resnet = torch.nn.Sequential(
    *list(models.resnet50(pretrained=True).children())[:-1]
)
# Inference mode for layers such as batch-norm, then move to the device.
resnet = resnet.eval().to(device)

# Preprocessing applied to each PIL image before it reaches the network:
# fixed 224x224 resize, tensor conversion, per-channel normalisation.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

print("Model loaded successfully!")


In [None]:
# Extract image features
def extract_resnet_features(image_path, model, transform, device):
    """Extract a feature array for a single image file.

    The image is opened, converted to RGB, passed through ``transform``,
    given a leading batch dimension, moved to ``device`` and fed to
    ``model`` with gradient tracking disabled.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        model: Callable applied to the batched input tensor. It is used as
            given; this function does not switch it to eval mode.
        transform: Callable that maps a PIL image to a tensor.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        The model output as a NumPy array with all size-1 dimensions
        squeezed out, or ``None`` if any step failed (the error is printed).
    """
    try:
        # Open inside a context manager so the underlying file handle is
        # closed deterministically, even if decoding/conversion fails.
        # convert() returns a fully loaded copy that outlives the handle.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # unsqueeze(0) adds the batch dimension the model is called with.
        image_tensor = transform(image).unsqueeze(0).to(device)

        # No gradients are needed for feature extraction.
        with torch.no_grad():
            features = model(image_tensor)

        return features.squeeze().cpu().numpy()
    except Exception as e:
        # Deliberate best-effort: one unreadable image must not abort a
        # long extraction run, so report it and let the caller skip it.
        print(f"Error processing {image_path}: {e}")
        return None

# Run the extractor over every image whose download status is 'success'.
# Features and ids are appended together, so an image that fails to process
# is left out of both lists and the two stay aligned.
print("Extracting image features...")
image_features, sample_ids = [], []

successful_downloads = download_results[download_results['status'] == 'success']

progress = tqdm(
    successful_downloads.iterrows(),
    total=len(successful_downloads),
    desc="Extracting features",
)
for _, row in progress:
    sample_id, image_path = row['sample_id'], row['filepath']

    features = extract_resnet_features(image_path, resnet, transform, device)
    if features is None:
        # The extractor already printed the error; skip this image.
        continue

    image_features.append(features)
    sample_ids.append(sample_id)

print(f"Successfully extracted features for {len(image_features)} images")


In [None]:
# Assemble the extracted features into a table (one row per image, indexed
# by sample_id) and persist it as CSV plus two NumPy files.
if len(image_features) == 0:
    print("No image features extracted!")
else:
    image_features_df = pd.DataFrame(
        image_features,
        index=pd.Index(sample_ids, name='sample_id'),
    )

    print(f"Image features shape: {image_features_df.shape}")

    # CSV copy, with sample_id written as the index column.
    image_features_df.to_csv('../dataset/image_features.csv')
    print("Image features saved to ../dataset/image_features.csv")

    # NumPy copies: the feature matrix and the matching ids, in row order.
    np.save('../dataset/image_features.npy', image_features_df.to_numpy())
    np.save('../dataset/image_sample_ids.npy', image_features_df.index.to_numpy())
    print("Image features also saved as numpy arrays")
