# In [None]:
# ram_safe_multimodal.py
import pandas as pd
import numpy as np
from PIL import Image
import requests
from io import BytesIO
import torch
import torch.nn as nn
from torchvision import models, transforms
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import gc
import os
from tqdm import tqdm

# ============================================================
# RAM-SAFE CONFIG
# ============================================================
class SafeConfig:
    """Tunable constants for the low-memory feature-extraction pipeline."""

    # --- text ---
    # Checkpoint name handed to SentenceTransformer.
    TEXT_MODEL: str = 'all-MiniLM-L6-v2'

    # --- images ---
    # Images are resized to IMG_SIZE x IMG_SIZE before the image encoder.
    IMG_SIZE: int = 128
    # One sample per forward pass (not referenced elsewhere in the visible code).
    BATCH_SIZE: int = 1
    # Passed as `timeout=` to requests.get when downloading an image.
    TIMEOUT: int = 3

    # --- chunking ---
    # Number of samples written to each on-disk chunk file.
    CHUNK_SIZE: int = 1000
    # Intended checkpoint interval in samples (not referenced elsewhere in
    # the visible code).
    SAVE_EVERY: int = 500

# ============================================================
# 1. LOAD MODELS ONCE
# ============================================================
def load_models_once():
    """Build the text encoder, the image encoder and the image preprocessing.

    Returns:
        tuple: ``(text_model, img_model, img_transform)`` where

        * ``text_model`` is the SentenceTransformer named by
          ``SafeConfig.TEXT_MODEL``;
        * ``img_model`` is an ImageNet-pretrained ResNet18 with its last
          child module (the classifier) removed, switched to eval mode;
        * ``img_transform`` resizes to a ``SafeConfig.IMG_SIZE`` square,
          converts to a tensor and normalises each channel.
    """
    print("🚀 Loading models...")

    # Text model (lightweight)
    text_model = SentenceTransformer(SafeConfig.TEXT_MODEL)

    # `pretrained=True` is deprecated in recent torchvision in favour of the
    # `weights=` enum; IMAGENET1K_V1 is the checkpoint the legacy flag loaded.
    # Older torchvision releases have no enum, so keep the legacy call as a
    # fallback instead of requiring an upgrade.
    weights_enum = getattr(models, "ResNet18_Weights", None)
    if weights_enum is not None:
        backbone = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        backbone = models.resnet18(pretrained=True)

    # Drop the final child (the fully-connected classifier) so the network
    # outputs pooled features instead of class logits.
    img_model = nn.Sequential(*list(backbone.children())[:-1])
    img_model.eval()

    # Preprocessing matching SafeConfig.IMG_SIZE; the mean/std values are the
    # usual ImageNet channel statistics.
    img_transform = transforms.Compose([
        transforms.Resize((SafeConfig.IMG_SIZE, SafeConfig.IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    return text_model, img_model, img_transform

# ============================================================
# 2. PROCESS SINGLE SAMPLE (MEMORY EFFICIENT)
# ============================================================
# Widths of the two halves of the combined feature vector: the text
# embedding comes first (384 values), the image embedding second (512 values).
_TEXT_EMB_DIM = 384
_IMG_EMB_DIM = 512


def _embed_image(image_url, img_model, img_transform):
    """Download ``image_url`` and return its float32 embedding; raises on any failure."""
    response = requests.get(image_url, timeout=SafeConfig.TIMEOUT)
    # Fail fast on 4xx/5xx instead of handing an error page to PIL.
    response.raise_for_status()

    # The context manager releases PIL's handle on the buffer promptly.
    with Image.open(BytesIO(response.content)) as raw_image:
        image_tensor = img_transform(raw_image.convert('RGB')).unsqueeze(0)

    with torch.no_grad():
        embedding = img_model(image_tensor).reshape(-1).numpy()
    return embedding.astype(np.float32, copy=False)


def process_single_sample(text, image_url, text_model, img_model, img_transform):
    """Return the concatenated text + image feature vector for one sample.

    The function is deliberately best-effort: it never raises. An image that
    cannot be fetched or decoded contributes a zero image embedding, and any
    other failure yields an all-zero vector.

    Args:
        text: Sample text; it is passed through ``str()`` before encoding.
        image_url: Image location. Only non-null values starting with
            ``'http'`` are downloaded.
        text_model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``.
        img_model: Callable mapping a ``(1, C, H, W)`` tensor to image features.
        img_transform: Callable turning a PIL image into a tensor.

    Returns:
        numpy.ndarray: 1-D float32 vector, text embedding followed by image
        embedding. The result is always float32 so that zero fallbacks do not
        silently upcast whole feature matrices to float64.
    """
    try:
        # TEXT: Get embedding
        text_emb = np.asarray(
            text_model.encode([str(text)], convert_to_numpy=True)[0],
            dtype=np.float32,
        )

        # IMAGE: Download and process if URL exists
        img_emb = None
        if pd.notna(image_url) and str(image_url).startswith('http'):
            try:
                img_emb = _embed_image(image_url, img_model, img_transform)
            except Exception:
                # Network, HTTP, decoding or model errors: fall back to zeros.
                img_emb = None
        if img_emb is None:
            img_emb = np.zeros(_IMG_EMB_DIM, dtype=np.float32)

        # Combine features
        return np.concatenate([text_emb, img_emb])

    except Exception:
        # Return zero features if anything else fails (e.g. text encoding).
        return np.zeros(_TEXT_EMB_DIM + _IMG_EMB_DIM, dtype=np.float32)

# ============================================================
# 3. PROCESS CHUNK AND SAVE TO DISK
# ============================================================
def process_chunk_to_disk(chunk_data, text_model, img_model, img_transform, chunk_id, output_dir):
    """Featurise every sample of one chunk and write the result to a .npy file.

    Args:
        chunk_data: ``(texts, urls, indices)`` triple of sequences. The
            indices take part in the iteration but are not otherwise used.
        text_model, img_model, img_transform: Forwarded unchanged to
            ``process_single_sample``.
        chunk_id: Identifier embedded in the output file name.
        output_dir: Existing directory that receives
            ``features_chunk_<chunk_id>.npy``.

    Returns:
        int: Number of texts in the chunk.
    """
    texts, urls, sample_ids = chunk_data
    n_samples = len(texts)

    print(f"🔨 Processing chunk {chunk_id} ({n_samples} samples)...")

    rows = []
    progress = tqdm(zip(texts, urls, sample_ids), total=n_samples)
    for position, (text, url, _sample_id) in enumerate(progress):
        rows.append(
            process_single_sample(text, url, text_model, img_model, img_transform)
        )

        # Housekeeping only on every 50th sample (including the first one).
        if position % 50:
            continue
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Stack the per-sample vectors and persist them straight away.
    stacked = np.array(rows)
    target_path = os.path.join(output_dir, f'features_chunk_{chunk_id}.npy')
    np.save(target_path, stacked)

    print(f"💾 Saved chunk {chunk_id} to disk")

    # Drop the in-memory copies before the next chunk starts.
    del rows, stacked
    gc.collect()

    return n_samples

# ============================================================
# 4. MAIN RAM-SAFE EXTRACTION
# ============================================================
def ram_safe_extraction(train_df, test_df):
    """Extract text + image features for train and test in disk-backed chunks.

    Samples are processed ``SafeConfig.CHUNK_SIZE`` at a time; each chunk is
    written to ``temp_features/`` and the chunks are afterwards copied one by
    one into a single preallocated matrix, so only one chunk is held in
    memory alongside the final result.

    Args:
        train_df: DataFrame with ``catalog_content`` and ``image_link`` columns.
        test_df: DataFrame with the same two columns.

    Returns:
        tuple: ``(train_features, test_features)`` — the first
        ``len(train_df)`` rows and the following ``len(test_df)`` rows of the
        combined feature matrix.

    Raises:
        ValueError: If both frames are empty.
    """
    print("⚡ RAM-SAFE MULTIMODAL EXTRACTION")

    train_size = len(train_df)
    total_samples = train_size + len(test_df)
    if total_samples == 0:
        # Fail before loading the models; there would be nothing to stack.
        raise ValueError("ram_safe_extraction needs at least one sample")

    # Create output directory
    output_dir = "temp_features"
    os.makedirs(output_dir, exist_ok=True)

    # Combine all data: train rows first, test rows after them.
    all_texts = list(train_df['catalog_content']) + list(test_df['catalog_content'])
    all_urls = list(train_df['image_link']) + list(test_df['image_link'])
    all_indices = list(range(total_samples))

    print(f"📊 Processing {total_samples} samples in chunks...")

    # Load models ONCE
    text_model, img_model, img_transform = load_models_once()

    # Process in small chunks
    processed_count = 0
    chunk_files = []

    for chunk_start in range(0, total_samples, SafeConfig.CHUNK_SIZE):
        chunk_end = min(chunk_start + SafeConfig.CHUNK_SIZE, total_samples)
        chunk_id = chunk_start // SafeConfig.CHUNK_SIZE

        chunk_data = (
            all_texts[chunk_start:chunk_end],
            all_urls[chunk_start:chunk_end],
            all_indices[chunk_start:chunk_end],
        )

        # Process chunk and save to disk
        processed_count += process_chunk_to_disk(
            chunk_data, text_model, img_model, img_transform, chunk_id, output_dir
        )
        chunk_files.append(f'features_chunk_{chunk_id}.npy')

        print(f"📍 Progress: {processed_count}/{total_samples} samples")

    # Copy chunks into one preallocated matrix instead of collecting them in
    # a list and stacking, which would briefly hold every row twice.
    print("🔗 Combining chunks from disk...")

    all_features = None
    next_row = 0
    for chunk_file in chunk_files:
        chunk_path = os.path.join(output_dir, chunk_file)
        part = np.load(chunk_path)

        if all_features is None:
            all_features = np.empty((total_samples, part.shape[1]), dtype=part.dtype)
        else:
            # Chunks may differ in dtype; widen the buffer when needed so the
            # result has the same promoted dtype stacking would produce.
            promoted = np.result_type(all_features.dtype, part.dtype)
            if promoted != all_features.dtype:
                all_features = all_features.astype(promoted)

        all_features[next_row:next_row + len(part)] = part
        next_row += len(part)
        del part

        # Delete file after loading to save space
        os.remove(chunk_path)

    # Never expose uninitialised rows should fewer rows than expected arrive.
    all_features = all_features[:next_row]

    # Clean up temp directory. Leftovers (e.g. stale chunks from an earlier
    # interrupted run) must not discard the finished extraction.
    try:
        os.rmdir(output_dir)
    except OSError as err:
        print(f"⚠️ Could not remove {output_dir}: {err}")

    # Split back to train/test
    train_features = all_features[:train_size]
    test_features = all_features[train_size:train_size + len(test_df)]

    print("✅ EXTRACTION COMPLETE!")
    print(f"📊 Train features: {train_features.shape}")
    print(f"📊 Test features: {test_features.shape}")

    return train_features, test_features

# ============================================================
# 5. HYBRID APPROACH (TEXT + IMAGE METADATA)
# ============================================================
def _url_metadata_row(url):
    """Return ``[has_image, is_amazon, url_length, has_jpg]`` from the URL text alone."""
    # A missing URL is treated as the empty string so that it does not leak
    # the literal text 'nan' / 'None' into the length feature.
    url_text = str(url) if pd.notna(url) else ''
    lowered = url_text.lower()
    return [
        int(url_text.startswith('http')),
        int('amazon' in lowered),
        len(url_text),
        int('.jpg' in lowered),
    ]


def hybrid_fast_extraction(train_df, test_df):
    """Build features from text embeddings plus URL-derived image metadata.

    No image is downloaded: the four image columns are computed from the URL
    string only. (The original author estimated 20-30 minutes for roughly
    150K samples; that figure is not verified here.)

    Args:
        train_df: DataFrame with ``catalog_content`` and ``image_link`` columns.
        test_df: DataFrame with the same two columns.

    Returns:
        tuple: ``(train_features, test_features)``; each row is the text
        embedding followed by ``has_image, is_amazon, url_length, has_jpg``.
        The metadata is stored as float32 so that float32 embeddings are not
        upcast to float64 when the two parts are joined.
    """
    print("🎯 ULTRA-FAST HYBRID EXTRACTION")

    # TEXT: Fast embeddings (same checkpoint as the multimodal path).
    text_model = SentenceTransformer(SafeConfig.TEXT_MODEL)

    # str() mirrors the multimodal path and keeps NaN / numeric cells from
    # reaching the encoder as non-string values.
    all_texts = [
        str(text)
        for text in list(train_df['catalog_content']) + list(test_df['catalog_content'])
    ]
    print("📝 Encoding texts...")
    text_embeddings = text_model.encode(all_texts, batch_size=512, show_progress_bar=True)

    # IMAGE: Metadata only (no download)
    print("🖼️ Extracting image metadata...")
    all_urls = list(train_df['image_link']) + list(test_df['image_link'])
    image_metadata = np.array(
        [_url_metadata_row(url) for url in all_urls], dtype=np.float32
    ).reshape(-1, 4)  # keeps a 2-D shape even when there are no rows

    # Combine
    all_features = np.hstack([text_embeddings, image_metadata])

    # Split back
    train_size = len(train_df)
    train_features = all_features[:train_size]
    test_features = all_features[train_size:train_size + len(test_df)]

    print(f"✅ HYBRID features: {train_features.shape}")
    return train_features, test_features

# ============================================================
# 6. FAST SUBMISSION
# ============================================================
def create_fast_submission(train_features, test_features, train_df, test_df):
    """Fit an ExtraTrees regressor on the train features and write a submission CSV.

    Args:
        train_features: Feature matrix aligned with the rows of ``train_df``.
        test_features: Feature matrix aligned with the rows of ``test_df``.
        train_df: DataFrame providing the ``price`` target column.
        test_df: DataFrame providing the ``sample_id`` column.

    Returns:
        pandas.DataFrame: Columns ``sample_id`` and ``price``, with prices
        floored at 0.1. The same frame is written to
        ``ram_safe_submission.csv`` without the index.
    """
    from sklearn.ensemble import ExtraTreesRegressor

    print("🎯 Creating submission...")

    regressor = ExtraTreesRegressor(
        n_estimators=100, max_depth=20, random_state=42, n_jobs=-1
    )
    regressor.fit(train_features, train_df['price'].values)
    raw_prices = regressor.predict(test_features)

    # Floor the predictions at 0.1 while assembling the output frame.
    submission = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': np.clip(raw_prices, 0.1, None),
    })
    submission.to_csv('ram_safe_submission.csv', index=False)

    print("✅ SUBMISSION CREATED!")
    return submission

# ============================================================
# 7. MAIN EXECUTION
# ============================================================
if __name__ == "__main__":
    # Script entry point: read the cleaned CSVs, extract features with the
    # strategy the user picks, then train the model and write the submission.
    # Requires: pip install sentence-transformers torchvision
    import time

    start_time = time.time()

    # Load data
    train_df, test_df = (
        pd.read_csv(csv_name) for csv_name in ("train_cleaned.csv", "test_cleaned.csv")
    )

    print(f"📥 Loaded: Train {train_df.shape}, Test {test_df.shape}")

    # Menu: anything other than "1" selects the hybrid path.
    print(
        "Choose extraction method:",
        "1. RAM-SAFE Multimodal (1-1.5 hours)",
        "2. HYBRID Fast (20-30 minutes)",
        sep="\n",
    )
    choice = input("Enter choice (1 or 2): ").strip()

    if choice != "1":
        print("🎯 HYBRID FAST EXTRACTION...")
        train_features, test_features = hybrid_fast_extraction(train_df, test_df)
        mode = "hybrid"
    else:
        try:
            print("🚀 RAM-SAFE MULTIMODAL EXTRACTION...")
            train_features, test_features = ram_safe_extraction(train_df, test_df)
        except Exception as err:
            # Any failure of the multimodal path degrades to the hybrid one.
            print(f"❌ Multimodal failed: {err}")
            print("🔄 Falling back to hybrid...")
            train_features, test_features = hybrid_fast_extraction(train_df, test_df)
            mode = "hybrid_fallback"
        else:
            mode = "multimodal"

    # Create submission
    submission = create_fast_submission(train_features, test_features, train_df, test_df)

    total_time = (time.time() - start_time) / 60
    print(f"⏱️  TOTAL TIME: {total_time:.1f} minutes")
    print(f"🎯 MODE: {mode}")
    print("🎉 SUCCESS! No RAM crashes!")