# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import numpy as np
import requests
import re
from PIL import Image
from io import BytesIO
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm

# Retry policy for image downloads: up to 3 retries, with backoff between
# attempts, when the server answers with a rate-limit (429) or a transient
# 5xx status. Only HEAD/GET/OPTIONS requests are retried.
retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)

# One shared session so every download reuses the same retry policy and headers.
session = requests.Session()
_retry_adapter = HTTPAdapter(max_retries=retry_strategy)
# Mount the retrying adapter on BOTH schemes. A Session ships with separate
# default adapters for 'http://' and 'https://'; mounting only 'https://'
# would leave plain-http thumbnail URLs without any retry behaviour.
session.mount('https://', _retry_adapter)
session.mount('http://', _retry_adapter)
# Browser-like request headers (presumably so the image host serves the
# thumbnails as it would to a regular browser -- confirm against the host).
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'Referer': 'https://www.stockx.com/'
})

# Build the ResNet50 feature extractor once at import time so it is shared by
# every extract_features() call: ImageNet weights, no classification head,
# global average pooling on the final feature map.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def preprocess_data(df):
    """Derive price, release-date and one-hot features from raw listing data.

    The input frame is copied first, so the caller's DataFrame is never
    modified (previously it gained some of the derived columns as a side
    effect while the returned frame was a different object).

    Args:
        df: DataFrame with at least the columns ``description``,
            ``release_date``, ``brand`` and ``product_category``.

    Returns:
        A new DataFrame in which:

        * ``retail_price`` (float) holds the first dollar amount found in
          ``description`` after the word "retail", "price" or "at"
          (case-insensitive), including cents when present; NaN if none.
        * ``release_date`` is parsed to datetime (unparseable values -> NaT).
        * ``release_year``, ``release_month`` and ``release_dayofweek`` are
          ints, with -1 where the date is missing.
        * ``brand`` and ``product_category`` are replaced by one-hot columns
          prefixed ``brand_`` / ``category_``, each with a NaN indicator.

    Raises:
        KeyError: if a required column is missing.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Extract retail price.
    #  * ``\b`` keeps the keywords from matching inside other words
    #    (e.g. the "at" in "format 100").
    #  * The optional cents are inside the capture group so "$129.99" is
    #    read as 129.99 rather than 129.
    #  * ``(?!\d)`` rejects numbers longer than the 2-3 digits this pattern
    #    targets instead of silently truncating them ("$1000" -> 100).
    price_pattern = r'\b(?:retail|price|at)\s*\$?(\d{2,3}(?:\.\d{2})?)(?!\d)'
    extracted = df['description'].str.extract(price_pattern, flags=re.IGNORECASE)
    df['retail_price'] = extracted[0].astype(float)

    # Handle release dates; -1 marks rows whose date could not be parsed.
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
    df['release_year'] = df['release_date'].dt.year.fillna(-1).astype(int)
    df['release_month'] = df['release_date'].dt.month.fillna(-1).astype(int)
    df['release_dayofweek'] = df['release_date'].dt.dayofweek.fillna(-1).astype(int)

    # Encode categorical features (dummy_na=True adds an explicit NaN column).
    df = pd.get_dummies(df, columns=['brand', 'product_category'],
                        prefix=['brand', 'category'], dummy_na=True)

    return df

def process_image(url):
    """Download the image at ``url`` and return it as a 224x224 RGB PIL image.

    The request goes through the shared module-level ``session`` with a
    10 second timeout. Any failure (request error, non-success status,
    undecodable image data, ...) is reported on stdout and results in
    ``None`` instead of an exception.
    """
    try:
        reply = session.get(url, timeout=10)
        reply.raise_for_status()

        picture = Image.open(BytesIO(reply.content))
        # Normalise palette / grayscale / alpha images to three channels.
        rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
        return rgb_picture.resize((224, 224))

    except Exception as e:
        # Best-effort: one bad URL must not abort the whole run.
        print(f"Image download failed: {url} - {str(e)}")
        return None

def extract_features(url):
    """Return the CNN feature vector for the image located at ``url``.

    The image is fetched with ``process_image``, converted to an array,
    run through ``preprocess_input`` and fed to the module-level ``model``
    as a batch of one; the prediction is returned flattened.

    If the image cannot be obtained, or anything fails during extraction
    (the error is printed), a zero vector of length 2048 is returned.
    """
    picture = process_image(url)

    if picture is not None:
        try:
            # PIL image -> preprocessed array -> batch of one.
            pixels = preprocess_input(np.array(picture))
            batch = np.expand_dims(pixels, axis=0)
            return model.predict(batch, verbose=0).flatten()
        except Exception as e:
            print(f"Feature extraction failed: {url} - {str(e)}")

    # Missing image or failed extraction: fall back to an all-zero vector.
    return np.zeros(2048)

def main():
    """Run the pipeline: load the CSV, preprocess, extract image features, save.

    Reads the StockX CSV from the working directory, applies
    ``preprocess_data``, computes a feature vector for every ``thumb_url``
    (with a tqdm progress bar), stores each vector as a ';'-separated string
    in the ``cnn_features`` column and writes the result to
    ``stockx_with_cnn_features.csv``.
    """

    def _to_text(vector):
        # Anything that is not an ndarray is stored as a zero vector.
        if not isinstance(vector, np.ndarray):
            vector = np.zeros(2048)
        return ';'.join(map(str, vector))

    # Load and preprocess data
    print("Loading data...")
    frame = preprocess_data(
        pd.read_csv('third_version_of_stockx_data_with_kinda_many_brands_without_dropping_nan_description.csv')
    )

    # Extract features with progress
    print("Extracting image features...")
    tqdm.pandas(desc="Processing images")
    vectors = frame['thumb_url'].progress_apply(extract_features)

    # Convert features to storage-friendly format
    frame['cnn_features'] = vectors.apply(_to_text)

    # Save results
    print("Saving results...")
    frame.to_csv('stockx_with_cnn_features.csv', index=False)
    print("Processing complete! Saved to stockx_with_cnn_features.csv")

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()