In [None]:
# Install required libraries
pip install transformers ftfy regex tqdm scikit-learn pandas numpy opencv-python xgboost -q

from google.colab import drive
import os
import json
import numpy as np

# 1. Mount Google Drive
# Exposes the user's Drive under /content/drive; every path below lives there.
drive.mount('/content/drive')

# 2. Define Base Directory
# Root folder of the dataset. The checkpoint folder below is created inside it,
# and later cells build their CSV paths from this constant as well.
BASE_DIR = '/content/drive/MyDrive/product_dataset'

# 3. Create Checkpoint Folder
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
CHECKPOINT_DIR = os.path.join(BASE_DIR, 'checkpoints_gemini')
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

print(f"‚úÖ Setup Complete.")
print(f"üìÇ Working Directory: {BASE_DIR}")
print(f"üíæ Checkpoints will be saved to: {CHECKPOINT_DIR}")

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.8/44.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
‚úÖ Setup Complete.
üìÇ Working Directory: /content/drive/MyDrive/product_dataset
üíæ Checkpoints will be saved to: /content/drive/MyDrive/product_dataset/checkpoints_gemini


In [None]:
import pandas as pd
import torch
from PIL import Image, ImageFile
from tqdm.auto import tqdm
from transformers import CLIPProcessor, CLIPModel

# Allow PIL to handle truncated/corrupt images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# --- CONFIGURATION ---
# Number of images sent through CLIP per forward pass (and per checkpoint write).
BATCH_SIZE = 32
# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- PATHS ---
# Both CSVs are expected directly inside BASE_DIR.
TRAIN_CSV = os.path.join(BASE_DIR, 'train_with_images.csv')
TEST_CSV = os.path.join(BASE_DIR, 'test_with_images.csv')

# --- CHECKPOINT MANAGER CLASS ---
class CheckpointManager:
    """Resumable on-disk store for batch-wise embedding extraction.

    Three files are kept in ``checkpoint_dir``:

    * ``{name}_progress.json``  - ``last_index`` (rows consumed so far) and
      ``corrupted_ids`` (ids whose image could not be read).
    * ``{name}_embeddings.npy`` - feature batches appended back-to-back with
      ``np.save``.
    * ``{name}_index.npy``      - the ids of those feature rows, appended the
      same way.

    If the progress file exists the run resumes from it; otherwise any stale
    binary files are removed and the run starts at index 0.
    """

    def __init__(self, name, checkpoint_dir):
        """Load existing progress for ``name`` or initialise a fresh run.

        Args:
            name: Prefix for the three checkpoint files (e.g. ``"train"``).
            checkpoint_dir: Existing directory that holds the checkpoint files.
        """
        self.name = name
        self.dir = checkpoint_dir

        # File Paths
        self.progress_file = os.path.join(checkpoint_dir, f"{name}_progress.json")
        self.embed_file = os.path.join(checkpoint_dir, f"{name}_embeddings.npy")
        self.index_file = os.path.join(checkpoint_dir, f"{name}_index.npy")

        # Initialize or Load State
        if os.path.exists(self.progress_file):
            with open(self.progress_file, 'r') as f:
                self.state = json.load(f)
            # Tolerate progress files that lack one of the keys so that
            # update() cannot fail with a KeyError later on.
            self.state.setdefault('last_index', 0)
            self.state.setdefault('corrupted_ids', [])
            print(f"üîÑ Resuming {name} from Index {self.state['last_index']}")
        else:
            self.state = {'last_index': 0, 'corrupted_ids': []}
            # Clear old binary files if starting fresh: without a progress file
            # their contents cannot be matched to a row position.
            for stale_path in (self.embed_file, self.index_file):
                if os.path.exists(stale_path):
                    os.remove(stale_path)
            print(f"üÜï Starting {name} from scratch.")

    def update(self, features, ids, corrupt_ids):
        """Persist one processed batch and advance the resume position.

        Args:
            features: Array with one row per id in ``ids`` (may be empty).
            ids: Ids of the rows in ``features``.
            corrupt_ids: Ids from the same batch that could not be processed.

        Raises:
            ValueError: If ``features`` and ``ids`` differ in length, which
                would permanently misalign the two checkpoint files.
        """
        if len(features) != len(ids):
            raise ValueError(
                f"features has {len(features)} rows but {len(ids)} ids were given"
            )

        # 1. Append Features to npy (Binary Append)
        if len(features) > 0:
            with open(self.embed_file, 'ab') as f:
                np.save(f, features)

        # 2. Append IDs to npy
        if len(ids) > 0:
            with open(self.index_file, 'ab') as f:
                np.save(f, np.array(ids))

        # 3. Update State (JSON)
        # We advance the index by the TOTAL processed (valid + corrupt)
        self.state['last_index'] += len(ids) + len(corrupt_ids)
        self.state['corrupted_ids'].extend(corrupt_ids)

        # Write to a sibling temp file and swap it in, so an interruption
        # mid-write can never leave a half-written progress file behind.
        tmp_path = self.progress_file + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(self.state, f, indent=4)
        os.replace(tmp_path, self.progress_file)

    def get_start_index(self):
        """Return the row position at which extraction should resume."""
        return self.state['last_index']

print(f"‚úÖ Configuration Loaded. Using Device: {DEVICE}")

‚úÖ Configuration Loaded. Using Device: cuda


In [None]:
class CLIPFeatureExtractor(torch.nn.Module):
    """Thin ``nn.Module`` wrapper that exposes only CLIP's image-feature head."""

    def __init__(self):
        super().__init__()
        pretrained_clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.model = pretrained_clip

    def forward(self, pixel_values):
        """Return the wrapped model's image features for ``pixel_values``."""
        image_features = self.model.get_image_features(pixel_values=pixel_values)
        return image_features

def extract_clip_embeddings(df, manager):
    """Extract CLIP image embeddings for ``df`` in batches, resuming via ``manager``.

    Rows are consumed in order starting at ``manager.get_start_index()``.
    After every batch the features of readable images, their ids, and the ids
    of unreadable images are handed to ``manager.update`` so an interrupted
    run can continue where it stopped.

    Args:
        df: Frame with a ``local_image_path`` and a ``sample_id`` column.
        manager: Object providing ``name``, ``get_start_index()`` and
            ``update(features, ids, corrupt_ids)``.

    Returns:
        None. Results are persisted through ``manager``.
    """
    start_idx = manager.get_start_index()
    total_len = len(df)

    # Decide whether there is anything to do BEFORE touching CLIP: loading the
    # model and processor is the expensive part and is pointless on a finished run.
    if start_idx >= total_len:
        print(f"‚úÖ {manager.name} already completed!")
        return

    # Initialize Model & Processor
    model = CLIPFeatureExtractor().to(DEVICE)
    model.eval()
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # Construct Full Image Paths
    image_paths = df['local_image_path'].apply(lambda x: os.path.join(BASE_DIR, x)).tolist()
    sample_ids = df['sample_id'].tolist()

    print(f"üöÄ Processing {manager.name} from index {start_idx}/{total_len}...")

    # Process in Batches
    for i in tqdm(range(start_idx, total_len, BATCH_SIZE)):
        end_idx = min(i + BATCH_SIZE, total_len)

        batch_paths = image_paths[i:end_idx]
        batch_ids = sample_ids[i:end_idx]

        valid_imgs = []
        valid_ids = []
        corrupt_ids_batch = []

        for path, sid in zip(batch_paths, batch_ids):
            try:
                # The context manager closes the underlying file as soon as the
                # pixels have been copied into the converted image; otherwise
                # one handle per image stays open until garbage collection.
                with Image.open(path) as raw_img:
                    img = raw_img.convert('RGB')
                valid_imgs.append(img)
                valid_ids.append(sid)
            except Exception:
                # Deliberately best-effort: any unreadable image is recorded as
                # corrupt so its row can be dropped from the text data later.
                corrupt_ids_batch.append(sid)

        # Handle case where all images in a batch are corrupt
        if not valid_imgs:
            manager.update(np.array([]), [], corrupt_ids_batch)
            continue

        # Inference
        inputs = processor(images=valid_imgs, return_tensors="pt", padding=True)
        with torch.no_grad():
            features = model(inputs['pixel_values'].to(DEVICE)).cpu().numpy()

        # Save Progress
        manager.update(features, valid_ids, corrupt_ids_batch)

# --- EXECUTE EXTRACTION ---
# Load both CSVs up front; later cells reuse train_df / test_df.
print("Reading CSVs...")
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Create Managers
# Constructing a manager already reads (or initialises) its progress state and
# prints whether the run resumes or starts from scratch.
train_manager = CheckpointManager("train", CHECKPOINT_DIR)
test_manager = CheckpointManager("test", CHECKPOINT_DIR)

# Run Extraction
# Only the train split is processed here; test_manager is created for use by a later cell.
extract_clip_embeddings(train_df, train_manager)


Reading CSVs...
üîÑ Resuming train from Index 75000
üîÑ Resuming test from Index 75000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

‚úÖ train already completed!


In [None]:
extract_clip_embeddings(test_df, test_manager)

‚úÖ test already completed!


In [None]:
import os
import json
import numpy as np
import pandas as pd

# --- HELPER: Load Monolithic NPY ---
def load_monolithic_npy(filename):
    """Load every array appended back-to-back to ``filename`` and concatenate them.

    The file is expected to contain one or more ``np.save`` records written in
    append mode. Records are read until the end of the file is reached.

    Args:
        filename: Path of the appended ``.npy`` file.

    Returns:
        ``np.concatenate`` of all records, in the order they were written.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        ValueError: If the file holds no records, or a record cannot be read
            (for example a truncated write). Failing loudly here prevents a
            silently shortened result that no longer matches its id file.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(f"{filename} not found. Did the extraction cell finish?")

    total_bytes = os.path.getsize(filename)
    data_list = []

    with open(filename, 'rb') as f:
        # Use the file size, not an exception, to detect the end of the data.
        # Any error raised while bytes remain therefore means real corruption.
        while f.tell() < total_bytes:
            record_offset = f.tell()
            try:
                # NOTE(security): allow_pickle=True is kept so existing object-dtype
                # records stay loadable; only use this on files you wrote yourself.
                data_list.append(np.load(f, allow_pickle=True))
            except Exception as exc:
                raise ValueError(
                    f"File {filename} has an unreadable record at byte offset "
                    f"{record_offset}: {exc}"
                ) from exc

    if not data_list:
        raise ValueError(f"File {filename} is empty or could not be read!")

    return np.concatenate(data_list)

# --- MAIN FUNCTION ---
def get_clean_data(manager_name, original_df):
    """Return the rows of ``original_df`` that have an embedding, with embeddings aligned.

    Args:
        manager_name: Checkpoint prefix (e.g. ``"train"``) used to locate
            ``{name}_progress.json``, ``{name}_embeddings.npy`` and
            ``{name}_index.npy`` inside ``CHECKPOINT_DIR``.
        original_df: Frame with a ``sample_id`` column.

    Returns:
        Tuple ``(clean_df, aligned_embeddings, corrupted_set)`` where row ``k``
        of ``aligned_embeddings`` belongs to row ``k`` of ``clean_df`` and
        ``corrupted_set`` holds the ids recorded as unreadable.

    Raises:
        ValueError: If the embeddings file and the id file hold a different
            number of rows, in which case no reliable alignment exists.
    """
    print(f"üìÇ Loading Data for {manager_name}...")

    # 1. Load Progress to identify Corrupted IDs
    json_path = os.path.join(CHECKPOINT_DIR, f"{manager_name}_progress.json")
    if not os.path.exists(json_path):
        print("   ‚ö†Ô∏è JSON not found, assuming no corruption.")
        corrupted_set = set()
    else:
        with open(json_path, 'r') as f:
            progress = json.load(f)
        corrupted_set = set(progress.get('corrupted_ids', []))

    print(f"   Found {len(corrupted_set)} corrupted images to drop.")

    # 2. Load Embeddings & Valid IDs
    embeddings = load_monolithic_npy(os.path.join(CHECKPOINT_DIR, f"{manager_name}_embeddings.npy"))
    saved_ids = load_monolithic_npy(os.path.join(CHECKPOINT_DIR, f"{manager_name}_index.npy"))

    print(f"   Loaded {len(saved_ids)} embedding rows.")

    # The two files are written independently; if their lengths differ, the
    # position-based mapping below would pair ids with the wrong vectors.
    if len(embeddings) != len(saved_ids):
        raise ValueError(
            f"{manager_name}: {len(embeddings)} embedding rows but "
            f"{len(saved_ids)} ids - checkpoint files are out of sync."
        )

    # 3. Filter DataFrame (Remove Corrupted)
    # Remove any row whose ID is in the corrupted set
    clean_df = original_df[~original_df['sample_id'].isin(corrupted_set)].copy()

    # 4. Strict Alignment
    # Map IDs to their index in the embedding matrix
    id_to_idx = {sid: i for i, sid in enumerate(saved_ids)}

    # Only keep rows that exist in the embeddings (sanity check).
    # .copy() makes the result an independent frame, so callers can add columns
    # to it without pandas treating it as a view of the filtered slice.
    clean_df = clean_df[clean_df['sample_id'].isin(id_to_idx.keys())].copy()

    # Re-order embeddings to match the DataFrame order exactly
    valid_indices = [id_to_idx[sid] for sid in clean_df['sample_id']]
    aligned_embeddings = embeddings[valid_indices]

    # Return the clean DF, the aligned images, AND the corrupted IDs
    return clean_df, aligned_embeddings, corrupted_set

# --- EXECUTE & SYNC ---

# 1. Load Clean Image Data & Get Corrupted IDs
# Each call returns (filtered frame, embeddings aligned to that frame, set of corrupted ids).
print("--- Processing TRAIN ---")
train_df_clean, img_train_emb, train_corrupted_ids = get_clean_data("train", train_df)

print("\n--- Processing TEST ---")
test_df_clean, img_test_emb, test_corrupted_ids = get_clean_data("test", test_df)

print(f"\n‚úÖ IMAGES READY.")
print(f"   Train Images: {img_train_emb.shape}")
print(f"   Test Images:  {img_test_emb.shape}")

# ---------------------------------------------------------
# 2. SYNC TEXT EMBEDDINGS (Critical Fix for Dimension Error)
# ---------------------------------------------------------
# Only the banner is printed here; sync_text is defined below and applied in a later cell.
print("\nüîó Syncing Text Embeddings with Cleaned Data...")

# Helper to filter text arrays
def sync_text(text_emb, original_df, corrupted_ids, name):
    """Drop the rows of ``text_emb`` that belong to corrupted sample ids.

    Args:
        text_emb: Array of text embeddings, or ``None``.
        original_df: Unfiltered frame with a ``sample_id`` column.
        corrupted_ids: Collection of sample ids to remove.
        name: Label used in the progress messages.

    Returns:
        ``None`` when ``text_emb`` is ``None``; the filtered array when
        ``text_emb`` has one row per row of ``original_df``; otherwise
        ``text_emb`` unchanged (a warning is printed if its length is also not
        the original length minus the number of corrupted ids).
    """
    if text_emb is None:
        return None

    n_rows = len(text_emb)
    n_original = len(original_df)

    # One embedding per original row: filter with a keep-mask.
    if n_rows == n_original:
        print(f"   Filtering {name} Text Embeddings...")
        is_corrupt = original_df['sample_id'].isin(corrupted_ids)
        keep_mask = ~is_corrupt
        return text_emb[keep_mask]

    # Anything else is returned untouched; only the message differs.
    if n_rows == (n_original - len(corrupted_ids)):
        print(f"   {name} Text Embeddings already aligned.")
    else:
        print(f"   ‚ö†Ô∏è WARNING: {name} Text shape {text_emb.shape} mismatch. Re-generation recommended.")
    return text_emb

# Apply Sync


--- Processing TRAIN ---
üìÇ Loading Data for train...
   Found 1 corrupted images to drop.
   Loaded 74999 embedding rows.

--- Processing TEST ---
üìÇ Loading Data for test...
   Found 1 corrupted images to drop.
   Loaded 74999 embedding rows.

‚úÖ IMAGES READY.
   Train Images: (74999, 512)
   Test Images:  (74999, 512)

üîó Syncing Text Embeddings with Cleaned Data...
   Train Text Embeddings already aligned.
   Filtering Test Text Embeddings...

‚úÖ FINAL SYNC COMPLETE.
   Train Text: (74999, 384)


In [None]:
from google.colab import drive
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
import gc
import os

# --- CONFIG ---
TEXT_COLUMN = 'catalog_content'
DRIVE_FOLDER = '/content/drive/MyDrive/product_dataset/checkpoints_gemini'

# File names
TRAIN_FILE = 'train_text_embeddings.npy'
TEST_FILE  = 'test_text_embeddings.npy'

TRAIN_PATH = os.path.join(DRIVE_FOLDER, TRAIN_FILE)
TEST_PATH  = os.path.join(DRIVE_FOLDER, TEST_FILE)

# Ensure Folder Exists
if not os.path.exists(DRIVE_FOLDER):
    os.makedirs(DRIVE_FOLDER)
    print(f"üìÇ Created directory: {DRIVE_FOLDER}")

# Helper to load model only when needed
text_model = None


def get_model():
    """Return the shared SBERT encoder, creating it on first use.

    The instance is cached in the module-level ``text_model`` so repeated
    calls reuse it instead of loading the weights again.
    """
    global text_model
    if text_model is not None:
        return text_model

    print("üì• Loading SBERT model (all-MiniLM-L6-v2)...")
    text_model = SentenceTransformer('all-MiniLM-L6-v2')
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    text_model.to(target_device)
    return text_model

# ==========================================
# 1. PROCESS TRAIN EMBEDDINGS
# ==========================================
# Cache-or-compute: reuse the embeddings saved on Drive when present,
# otherwise encode the text column and save the result for next time.
if os.path.exists(TRAIN_PATH):
    print(f"\nüîÑ Found existing TRAIN embeddings at: {TRAIN_PATH}")
    train_text_embeddings = np.load(TRAIN_PATH)
    print(f"‚úÖ Train Loaded! Shape: {train_text_embeddings.shape}")
else:
    print(f"\n‚ö†Ô∏è Train file not found. Generating...")

    # Get Text
    # Uses the already-cleaned frame, so the result has one row per clean train row.
    texts = train_df_clean[TEXT_COLUMN].fillna("").astype(str).tolist()

    # Encode
    model = get_model()
    print(f"   -> Encoding {len(texts)} train items...")
    train_text_embeddings = model.encode(
        texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True
    )

    # Save
    print(f"   üíæ Saving Train to Drive...")
    np.save(TRAIN_PATH, train_text_embeddings)

    # Free memory of raw text list
    del texts

# ==========================================
# 2. PROCESS TEST EMBEDDINGS
# ==========================================
if os.path.exists(TEST_PATH):
    print(f"\nüîÑ Found existing TEST embeddings at: {TEST_PATH}")
    test_text_embeddings = np.load(TEST_PATH)
    print(f"‚úÖ Test Loaded! Shape: {test_text_embeddings.shape}")
else:
    print(f"\n‚ö†Ô∏è Test file not found. Generating...")

    # Get Text (Assuming dataframe is named 'test_df')
    # NOTE(review): this branch encodes the UNFILTERED test_df, whereas the train
    # branch encodes train_df_clean. The test embeddings therefore still contain
    # rows for corrupted ids and must be filtered before stacking - confirm that
    # the sync step runs after this cell.
    if 'test_df' in locals():
        texts = test_df[TEXT_COLUMN].fillna("").astype(str).tolist()

        # Encode
        model = get_model()
        print(f"   -> Encoding {len(texts)} test items...")
        test_text_embeddings = model.encode(
            texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True
        )

        # Save
        print(f"   üíæ Saving Test to Drive...")
        np.save(TEST_PATH, test_text_embeddings)
        del texts
    else:
        # No exception is raised here; test_text_embeddings simply stays undefined.
        print("‚ùå Error: 'test_df' dataframe not found in memory!")

# ==========================================
# 3. CLEANUP
# ==========================================
if text_model is not None:
    print("\nüßπ Cleaning up model from GPU...")
    # `model` was bound to the same encoder by `model = get_model()` above;
    # while that second reference exists the weights cannot be released.
    if globals().get('model') is text_model:
        model = None
    # Rebind to None instead of `del`: deleting the global would make the
    # `text_model is None` checks (here and in get_model) raise NameError
    # the next time this cell or get_model() is run.
    text_model = None
    gc.collect()
    torch.cuda.empty_cache()

print("-" * 30)
print(f"üèÅ Final Train Shape: {train_text_embeddings.shape}")
if 'test_text_embeddings' in locals():
    print(f"üèÅ Final Test Shape:  {test_text_embeddings.shape}")


üîÑ Found existing TRAIN embeddings at: /content/drive/MyDrive/product_dataset/checkpoints_gemini/train_text_embeddings.npy
‚úÖ Train Loaded! Shape: (74999, 384)

‚ö†Ô∏è Test file not found. Generating...
üì• Loading SBERT model (all-MiniLM-L6-v2)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   -> Encoding 75000 test items...


Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

   üíæ Saving Test to Drive...

üßπ Cleaning up model from GPU...
------------------------------
üèÅ Final Train Shape: (74999, 384)
üèÅ Final Test Shape:  (75000, 384)


In [None]:
# Apply sync_text to whichever text-embedding arrays exist in the kernel.
# At notebook top level locals() is the global namespace, so these checks
# simply test whether an earlier cell defined the variable.
if 'train_text_embeddings' in locals():
    train_text_embeddings = sync_text(train_text_embeddings, train_df, train_corrupted_ids, "Train")

if 'test_text_embeddings' in locals():
    test_text_embeddings = sync_text(test_text_embeddings, test_df, test_corrupted_ids, "Test")

print(f"\n‚úÖ FINAL SYNC COMPLETE.")
if 'train_text_embeddings' in locals():
    print(f"   Train Text: {train_text_embeddings.shape}")

   Train Text Embeddings already aligned.
   Test Text Embeddings already aligned.

‚úÖ FINAL SYNC COMPLETE.
   Train Text: (74999, 384)


In [None]:
y_target = train_df_clean['price'].values

In [None]:
import re
import pandas as pd
import numpy as np
import gc

print("‚ö° Step 1: Starting Advanced Feature Standardization...")

# --- DEFINING MULTIPLIERS ---
MULTIPLIERS = {
    # Weight -> to grams (g)
    'kg': 1000.0, 'kilogram': 1000.0, 'g': 1.0, 'gm': 1.0, 'gram': 1.0, 'mg': 0.001,
    'lbs': 453.592, 'lb': 453.592, 'pound': 453.592, 'ounce': 28.3495, 'oz': 28.3495,

    # Volume -> to milliliters (ml)
    'litre': 1000.0, 'liter': 1000.0, 'l': 1000.0,
    'ml': 1.0, 'mili': 1.0, 'milliliter': 1.0, 'millilitre': 1.0,
    'gallon': 3785.41, 'fl oz': 29.5735,

    # Dimension -> to centimeters (cm)
    'meter': 100.0, 'm': 100.0, 'cm': 1.0, 'centimeter': 1.0,
    'mm': 0.1, 'millimeter': 0.1,
    'inch': 2.54, 'in': 2.54, 'ft': 30.48, 'feet': 30.48, 'foot': 30.48,
    'yard': 91.44
}

# Units recognised for each dimension type; every entry is a key of MULTIPLIERS.
_UNITS_BY_TYPE = {
    'weight': ('kilogram', 'kg', 'gram', 'gm', 'mg', 'pound', 'lbs', 'lb', 'ounce', 'oz', 'g'),
    'volume': ('millilitre', 'milliliter', 'litre', 'liter', 'ml', 'mili', 'gallon', 'fl oz', 'l'),
    'dim': ('millimeter', 'centimeter', 'meter', 'cm', 'mm', 'inch', 'in', 'feet', 'foot', 'yard', 'ft', 'm'),
}


def _compile_unit_pattern(units):
    """Build the '<number> <unit>' regex for one dimension type."""
    alternatives = []
    for unit in sorted(units, key=len, reverse=True):
        token = re.escape(unit)
        # Multi-letter units may be written in the plural ("ounces", "inches").
        if len(unit) > 1:
            token += r'(?:es|s)?'
        alternatives.append(token)
    # The trailing (?![a-z]) is the important part: the unit must not be the
    # prefix of a longer word, so "5 lb" is not read as 5 litres and
    # "5 gallon" is not read as 5 grams.
    return re.compile(r'(\d+\.?\d*)\s*(' + '|'.join(alternatives) + r')(?![a-z])')


# Compiled once at definition time instead of on every call.
_UNIT_PATTERNS = {kind: _compile_unit_pattern(units) for kind, units in _UNITS_BY_TYPE.items()}


def _unit_multiplier(unit):
    """Return the MULTIPLIERS factor for a matched unit, ignoring a plural suffix."""
    for candidate in (unit, unit[:-1], unit[:-2]):
        if candidate in MULTIPLIERS:
            return MULTIPLIERS[candidate]
    return None


def extract_and_standardize(text, dimension_type):
    """Find the first '<number> <unit>' in ``text`` and convert it to the base unit.

    Args:
        text: Free text to scan; missing values (``None``/NaN) yield ``0.0``.
        dimension_type: ``'weight'`` (result in grams), ``'volume'``
            (millilitres) or ``'dim'`` (centimetres).

    Returns:
        The number multiplied by the unit's ``MULTIPLIERS`` factor, or ``0.0``
        when no number followed by a known unit of that type is found.

    Raises:
        KeyError: If ``dimension_type`` is not one of the three supported values.
    """
    if pd.isna(text):
        return 0.0

    match = _UNIT_PATTERNS[dimension_type].search(str(text).lower())
    if match is None:
        return 0.0

    multiplier = _unit_multiplier(match.group(2))
    if multiplier is None:
        return 0.0
    return float(match.group(1)) * multiplier

# ==========================================
# 1. PROCESS TRAIN DATA
# ==========================================
print("\n--- Processing TRAIN Data ---")

# Check if 'image_text' exists
# When the optional OCR column is absent, an empty string is concatenated
# instead, so the expression below works in both cases.
if 'image_text' in train_df_clean.columns:
    print("   -> Found OCR text, including in extraction...")
    image_text_col = train_df_clean['image_text'].fillna('')
else:
    print("   -> No OCR text found, using catalog only...")
    image_text_col = ""

# One string per row: catalog text, a space, then the OCR text (if any).
combined_text = train_df_clean['catalog_content'].fillna('') + " " + image_text_col

print("   -> Extracting features...")
# Adds three numeric columns to train_df_clean in place (0.0 where nothing is found).
train_df_clean['std_weight_g'] = combined_text.apply(lambda x: extract_and_standardize(x, 'weight'))
train_df_clean['std_volume_ml'] = combined_text.apply(lambda x: extract_and_standardize(x, 'volume'))
train_df_clean['std_dim_cm']    = combined_text.apply(lambda x: extract_and_standardize(x, 'dim'))

# Count of rows where a weight was actually extracted (value > 0).
nz_weight = (train_df_clean['std_weight_g'] > 0).sum()
print(f"   ‚úÖ Train Complete. Rows with extracted Weight: {nz_weight}")

# ==========================================
# 2. PROCESS TEST DATA (Added)
# ==========================================
print("\n--- Processing TEST Data ---")

# Same steps as for train, applied to test_df_clean when it exists in the kernel.
if 'test_df_clean' in locals():
    # Check if 'image_text' exists in Test
    if 'image_text' in test_df_clean.columns:
        print("   -> Found OCR text in Test...")
        test_image_text = test_df_clean['image_text'].fillna('')
    else:
        print("   -> No OCR text in Test, using catalog only...")
        test_image_text = ""

    # Combine Text
    test_combined_text = test_df_clean['catalog_content'].fillna('') + " " + test_image_text

    # Extract Features
    print("   -> Extracting features for Test...")
    test_df_clean['std_weight_g']  = test_combined_text.apply(lambda x: extract_and_standardize(x, 'weight'))
    test_df_clean['std_volume_ml'] = test_combined_text.apply(lambda x: extract_and_standardize(x, 'volume'))
    test_df_clean['std_dim_cm']    = test_combined_text.apply(lambda x: extract_and_standardize(x, 'dim'))

    nz_weight_test = (test_df_clean['std_weight_g'] > 0).sum()
    print(f"   ‚úÖ Test Complete. Rows with extracted Weight: {nz_weight_test}")

else:
    # Nothing is raised; the test frame just does not receive the three columns.
    print("   ‚ö†Ô∏è 'test_df_clean' not found in memory. Skipping Test extraction.")

‚ö° Step 1: Starting Advanced Feature Standardization...

--- Processing TRAIN Data ---
   -> No OCR text found, using catalog only...
   -> Extracting features...
   ‚úÖ Train Complete. Rows with extracted Weight: 52292

--- Processing TEST Data ---
   -> No OCR text in Test, using catalog only...
   -> Extracting features for Test...
   ‚úÖ Test Complete. Rows with extracted Weight: 52231


In [None]:
import numpy as np
import gc

print("‚ö° Step 2: Combining Embeddings and Standardized Features...")

# ==========================================
# 1. PROCESS TRAIN DATA
# ==========================================
print("   -> Stacking TRAIN Data...")

# Prepare Numerical Features
# reshape(-1, 1) turns each column into an (N, 1) array so it can be hstack-ed.
f_weight = train_df_clean['std_weight_g'].values.reshape(-1, 1)
f_vol    = train_df_clean['std_volume_ml'].values.reshape(-1, 1)
f_dim    = train_df_clean['std_dim_cm'].values.reshape(-1, 1)

# Stack: [Image | Text | Weight | Vol | Dim]
# All five inputs must have the same number of rows, in the same row order;
# np.hstack checks the count but cannot check the order.
X_train_combined = np.hstack((
    img_train_emb,          # (N, 512)
    train_text_embeddings,  # (N, 384)
    f_weight,               # (N, 1)
    f_vol,                  # (N, 1)
    f_dim                   # (N, 1)
))

print(f"   ‚úÖ X_train_combined shape: {X_train_combined.shape}")

# ==========================================
# 2. PROCESS TEST DATA (Added)
# ==========================================
print("   -> Stacking TEST Data...")

# Only runs when all three test inputs exist in the kernel.
if 'test_df_clean' in locals() and 'img_test_emb' in locals() and 'test_text_embeddings' in locals():

    # Prepare Numerical Features
    f_weight_test = test_df_clean['std_weight_g'].values.reshape(-1, 1)
    f_vol_test    = test_df_clean['std_volume_ml'].values.reshape(-1, 1)
    f_dim_test    = test_df_clean['std_dim_cm'].values.reshape(-1, 1)

    # Stack
    # Same column layout as the train matrix above.
    X_test_combined = np.hstack((
        img_test_emb,          # (M, 512)
        test_text_embeddings,  # (M, 384)
        f_weight_test,         # (M, 1)
        f_vol_test,            # (M, 1)
        f_dim_test             # (M, 1)
    ))

    print(f"   ‚úÖ X_test_combined shape:  {X_test_combined.shape}")

else:
    print("   ‚ö†Ô∏è Test data variables not found in memory. Skipping Test stack.")

# ==========================================
# 3. CLEANUP
# ==========================================
# Clean up intermediate arrays to save RAM
del f_weight, f_vol, f_dim
# The test arrays exist only if the test branch above ran.
if 'f_weight_test' in locals():
    del f_weight_test, f_vol_test, f_dim_test
gc.collect()

‚ö° Step 2: Combining Embeddings and Standardized Features...
   -> Stacking TRAIN Data...
   ‚úÖ X_train_combined shape: (74999, 899)
   -> Stacking TEST Data...
   ‚úÖ X_test_combined shape:  (74999, 899)


2039

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
import numpy as np

print("‚ö° Step: Generating KNN 'Lookalike' Features...")

# 1. SELECT THE EMBEDDING COLUMNS
# The combined matrix starts with [image embedding | text embedding]. Slice by
# that width rather than "everything but the last 3 columns", so the selection
# stays correct even after extra feature columns have been appended.
n_embed_cols = img_train_emb.shape[1] + train_text_embeddings.shape[1]
X_train_embeds = X_train_combined[:, :n_embed_cols]
X_test_embeds  = X_test_combined[:, :n_embed_cols]

# Target (log-price, matching what the downstream model is trained on)
y = train_df_clean['price'].values
y_log = np.log1p(y)

KNN_PARAMS = dict(n_neighbors=5, metric='cosine', n_jobs=-1)

# 2. TRAIN FEATURE: OUT-OF-FOLD PREDICTIONS
# A KNN fitted on a row always returns that row as its own nearest neighbour
# (distance 0), so predicting on the rows it was fitted on leaks each row's
# target into its feature. Every train row is therefore predicted by a model
# that never saw it; the test rows below are treated the same way.
print("   -> Training KNN Model (5-Neighbor Average)...")
print("   -> Generating Train Features...")
knn_feature_train = np.zeros(len(y_log))
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fit_idx, holdout_idx in kf.split(X_train_embeds):
    fold_knn = KNeighborsRegressor(**KNN_PARAMS)
    fold_knn.fit(X_train_embeds[fit_idx], y_log[fit_idx])
    knn_feature_train[holdout_idx] = fold_knn.predict(X_train_embeds[holdout_idx])

# 3. TEST FEATURE: MODEL FITTED ON THE FULL TRAINING SET
# Test rows are never part of the fit, so using every train row is safe here.
knn_robust = KNeighborsRegressor(**KNN_PARAMS)
knn_robust.fit(X_train_embeds, y_log)

print("   -> Generating Test Features...")
knn_feature_test = knn_robust.predict(X_test_embeds)

print("‚úÖ KNN Features Generated.")
print(f"   Train Feature Shape: {knn_feature_train.shape}")
print(f"   Test Feature Shape:  {knn_feature_test.shape}")

‚ö° Step: Generating KNN 'Lookalike' Features...
   -> Training KNN Model (5-Neighbor Average)...
   -> Generating Train Features...
   -> Generating Test Features...
‚úÖ KNN Features Generated.
   Train Feature Shape: (74999,)
   Test Feature Shape:  (74999,)


In [None]:
import numpy as np
import os
from google.colab import drive

print("üîÑ Loading KNN Progress from Drive...")
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/product_dataset/Amazon_ML_Challenge_Models/'

# ==========================================
# 1. LOAD & STACK TRAIN FEATURES
# ==========================================
train_path = os.path.join(base_path, 'knn_feature_train.npy')

if os.path.exists(train_path):
    print(f"   ‚úÖ Loading Train KNN Feature...")
    knn_feature_train = np.load(train_path)

    # Reshape
    f_knn_train = knn_feature_train.reshape(-1, 1)

    # Stack onto X_train_combined
    # Check shape to prevent double stacking (Optional safety)
    if 'X_train_combined' in locals():
        print(f"      Original Train Shape: {X_train_combined.shape}")

        # Only stack if not already there (assuming standard width check, or just stack)
        X_train_combined = np.hstack((X_train_combined, f_knn_train))
        print(f"      New Train Shape:      {X_train_combined.shape}")
    else:
        print("      ‚ö†Ô∏è 'X_train_combined' not found in memory.")
else:
    print(f"   ‚ùå Train KNN file not found at: {train_path}")

# ==========================================
# 2. LOAD & STACK TEST FEATURES
# ==========================================
test_path = os.path.join(base_path, 'knn_feature_test.npy')

if os.path.exists(test_path):
    print(f"   ‚úÖ Loading Test KNN Feature...")
    knn_feature_test = np.load(test_path)

    # Reshape
    f_knn_test = knn_feature_test.reshape(-1, 1)

    # Stack onto X_test_combined
    if 'X_test_combined' in locals():
        print(f"      Original Test Shape: {X_test_combined.shape}")
        X_test_combined = np.hstack((X_test_combined, f_knn_test))
        print(f"      New Test Shape:      {X_test_combined.shape}")
    else:
        print("      ‚ö†Ô∏è 'X_test_combined' not found in memory.")
else:
    print(f"   ‚ùå Test KNN file not found at: {test_path}")

print("\nüéâ KNN Features Loaded & Stacked.")

üîÑ Loading KNN Progress from Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   ‚úÖ Loading Train KNN Feature...
      Original Train Shape: (74999, 899)
      New Train Shape:      (74999, 900)
   ‚úÖ Loading Test KNN Feature...
      Original Test Shape: (74999, 899)
      New Test Shape:      (74999, 900)

üéâ KNN Features Loaded & Stacked.


In [None]:
import numpy as np
import pickle
import os
from google.colab import drive

print("üíæ Saving KNN Progress...")

# 1. Mount Drive
save_path = '/content/drive/MyDrive/product_dataset/Amazon_ML_Challenge_Models/'
os.makedirs(save_path, exist_ok=True)

# --- A. Save Train Features (.npy) ---
if 'knn_feature_train' in locals():
    train_feat_path = os.path.join(save_path, 'knn_feature_train.npy')
    np.save(train_feat_path, knn_feature_train)
    print(f"   ‚úÖ Train KNN Features saved: {train_feat_path}")
else:
    print("   ‚ö†Ô∏è 'knn_feature_train' not found in memory.")

# --- B. Save Test Features (.npy) ---
if 'knn_feature_test' in locals():
    test_feat_path = os.path.join(save_path, 'knn_feature_test.npy')
    np.save(test_feat_path, knn_feature_test)
    print(f"   ‚úÖ Test KNN Features saved: {test_feat_path}")
else:
    print("   ‚ö†Ô∏è 'knn_feature_test' not found in memory.")

# --- C. Save the Trained KNN Model (.pkl) ---
# Useful if you need to predict on NEW data later without retraining.
if 'knn_robust' in locals():
    model_path = os.path.join(save_path, 'knn_model.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(knn_robust, f)
    print(f"   ‚úÖ KNN Model saved: {model_path}")

print("\nüéâ All KNN Progress Saved.")

üíæ Saving KNN Progress...
   ‚úÖ Train KNN Features saved: /content/drive/MyDrive/product_dataset/Amazon_ML_Challenge_Models/knn_feature_train.npy
   ‚úÖ Test KNN Features saved: /content/drive/MyDrive/product_dataset/Amazon_ML_Challenge_Models/knn_feature_test.npy
   ‚úÖ KNN Model saved: /content/drive/MyDrive/product_dataset/Amazon_ML_Challenge_Models/knn_model.pkl

üéâ All KNN Progress Saved.


In [None]:
print(X_train_combined.shape)
print(X_test_combined.shape)

(74999, 900)
(74999, 900)


In [None]:
train_df_clean.head(100)

Unnamed: 0,sample_id,catalog_content,image_link,price,local_image_path,std_weight_g,std_volume_ml,std_dim_cm
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,/content/drive/MyDrive/product_dataset/images_...,340.194000,0.0,0.0
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,/content/drive/MyDrive/product_dataset/images_...,226.796000,0.0,0.0
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,/content/drive/MyDrive/product_dataset/images_...,53.864050,0.0,0.0
3,55858,Item Name: Judee‚Äôs Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,/content/drive/MyDrive/product_dataset/images_...,318.931875,0.0,0.0
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,/content/drive/MyDrive/product_dataset/images_...,360.038650,0.0,0.0
...,...,...,...,...,...,...,...,...
95,131791,"Item Name: Jack Link‚Äôs Beef Strips, Original, ...",https://m.media-amazon.com/images/I/412D0TcLPp...,2.29,/content/drive/MyDrive/product_dataset/images_...,8.000000,0.0,0.0
96,228657,Item Name: Amoretti - Cola Extract Water Solub...,https://m.media-amazon.com/images/I/51p9zbWbG6...,30.37,/content/drive/MyDrive/product_dataset/images_...,56.699000,0.0,0.0
97,21269,Item Name: Black Rifle Coffee Company Gunship ...,https://m.media-amazon.com/images/I/51e5+GrYKv...,19.19,/content/drive/MyDrive/product_dataset/images_...,340.194000,0.0,0.0
98,17024,Item Name: SOUR PATCH KIDS Blue Raspberry Soft...,https://m.media-amazon.com/images/I/91ffySwbbq...,4.79,/content/drive/MyDrive/product_dataset/images_...,2267.960000,5000.0,0.0


In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

print("‚ö° Step 3: Training with KNN Boost...")

# 1. Split
# The target is modelled in log1p space; 12% of the rows are held out for
# early stopping and for the SMAPE check below.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X_train_combined,
    np.log1p(train_df_clean['price'].values),
    test_size=0.12,
    random_state=42
)

# 2. Train
# Pick the accelerator at runtime instead of hard-coding 'cuda', so the cell
# still runs (slowly) on a CPU-only runtime. `torch` is imported in the
# notebook's first model cell.
xgb_device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = xgb.XGBRegressor(
    n_estimators=5000,   # upper bound only; early stopping picks the real count
    learning_rate=0.02,
    max_depth=10,        # Deeper trees to decide between Regex vs KNN
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    tree_method='hist',
    device=xgb_device,
    objective='reg:pseudohubererror',
    early_stopping_rounds=100
)

# The validation split doubles as the early-stopping monitor.
model.fit(
    X_train, y_train_log,
    eval_set=[(X_val, y_val_log)],
    verbose=100
)

# 3. Validate SMAPE
# Predictions and targets are mapped back from log1p space to actual prices
# before scoring.
preds_log = model.predict(X_val)
preds_actual = np.expm1(preds_log)
y_val_actual = np.expm1(y_val_log)

def calculate_smape(A, F):
    """Return the Symmetric Mean Absolute Percentage Error, in percent.

    SMAPE = 100/n * sum( 2 * |F - A| / (|A| + |F|) )

    Args:
        A: Actual values (array-like of numbers).
        F: Forecast values (array-like of numbers, same shape as ``A``).

    Returns:
        The SMAPE score as a float in the range [0, 200].

    Raises:
        ValueError: If ``A`` is empty.
    """
    # Work in float64: on integer arrays the 1e-8 zero-guard below would be
    # truncated to 0 and the 0/0 terms would turn the whole score into NaN.
    actual = np.asarray(A, dtype=np.float64)
    forecast = np.asarray(F, dtype=np.float64)

    n_samples = len(actual)
    if n_samples == 0:
        raise ValueError("calculate_smape requires at least one sample.")

    denominator = np.abs(actual) + np.abs(forecast)
    # Both values are zero -> numerator is zero too; any non-zero denominator
    # makes that term contribute exactly 0 instead of NaN.
    denominator = np.where(denominator == 0, 1e-8, denominator)
    return 100 / n_samples * np.sum(2 * np.abs(forecast - actual) / denominator)

# Score the hold-out predictions in actual-price space and report the result.
score = calculate_smape(A=y_val_actual, F=preds_actual)
print(f"\nüèÜ Final SMAPE with KNN: {score:.4f}%")

‚ö° Step 3: Training with KNN Boost...
[0]	validation_0-mphe:7.33598
[100]	validation_0-mphe:0.16143
[200]	validation_0-mphe:0.15108
[300]	validation_0-mphe:0.15051
[400]	validation_0-mphe:0.15056
[439]	validation_0-mphe:0.15059

üèÜ Final SMAPE with KNN: 46.4212%


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

print("üöÄ Starting Final Inference on Test Dataset...")

# 1. Verification
# The stacked test feature matrix must already exist in the notebook
# namespace; fail early with a pointer to the cells that build it.
if not ('X_test_combined' in locals()):
    raise ValueError("‚ö†Ô∏è X_test_combined not found! Please run the 'Stacking' and 'KNN Loading' cells first.")

print(f"   -> Input Shape: {X_test_combined.shape}")

# 2. Predict
print("   -> Running XGBoost Inference...")
# Cast the features to float32 before prediction.
# NOTE(review): presumably this mirrors the dtype used at training time — confirm.
X_test_input = X_test_combined.astype('float32')
preds_log = model.predict(X_test_input)

# 3. Reverse Transformations
# The model was fit on log1p(price), so expm1 maps predictions back to prices.
preds_price = np.expm1(preds_log)

# 4. Price Snapping (Psychological Pricing)
def snap_price(price):
    """Snap a predicted price onto a nearby "psychological" price point.

    Rules, applied to the fractional part of ``price`` in this order:
      * [0.93, 0.97)   -> x.95
      * [0.90, 0.999]  -> x.99
      * < 0.05         -> x.00 (only when the whole part is at least 1)
      * [0.48, 0.52]   -> x.50
    Any other value is returned unchanged.

    Args:
        price: A single predicted price (Python or NumPy scalar).

    Returns:
        The snapped price, or ``price`` itself when no rule applies.
    """
    integer_part = int(price)
    decimal_part = price - integer_part

    # The narrower .95 window must be tested before the wider .99 window;
    # otherwise the .99 rule catches every value and .95 is unreachable.
    if 0.93 <= decimal_part < 0.97:
        return integer_part + 0.95
    if 0.90 <= decimal_part <= 0.999:
        return integer_part + 0.99
    # Never snap a sub-dollar price down to 0.0: a zero prediction is the
    # worst possible SMAPE term for any positive true price.
    if decimal_part < 0.05 and integer_part >= 1:
        return float(integer_part)
    if 0.48 <= decimal_part <= 0.52:
        return integer_part + 0.50
    return price

print("   -> Applying Price Snapping...")
final_preds = np.array([snap_price(p) for p in preds_price])

# 5. Create Submission DataFrame
# CRITICAL: We must use 'test_df_clean' because X_test_combined corresponds to the CLEAN data.
# If we used the original 'test_df', the IDs would be misaligned due to the dropped corrupted rows.

print("   -> Mapping to Sample IDs...")
if 'test_df_clean' in locals():
    id_source = test_df_clean
else:
    # Fallback (Only if you didn't drop any rows, but risky)
    print("   ‚ö†Ô∏è test_df_clean not found. Using test_df (RISK OF MISALIGNMENT).")
    id_source = test_df

# Predictions are matched to IDs purely by position, so a row-count mismatch
# means the submission would be wrong; fail loudly with both counts.
if len(id_source) != len(final_preds):
    raise ValueError(
        f"Row count mismatch: {len(id_source)} sample IDs vs "
        f"{len(final_preds)} predictions."
    )

submission = pd.DataFrame({
    'sample_id': id_source['sample_id'],
    'prediction': final_preds
})

# 6. Save
filename = 'submission_price_prediction.csv'
submission.to_csv(filename, index=False)

print(f"‚úÖ Submission Generated: {filename}")
print(submission.head())

# Cleanup
# Drop the float32 copy of the test features; the trailing ';' keeps
# gc.collect()'s return value from being echoed as the cell's output.
del X_test_input
gc.collect();

üöÄ Starting Final Inference on Test Dataset...
   -> Input Shape: (74999, 900)
   -> Running XGBoost Inference...
   -> Applying Price Snapping...
   -> Mapping to Sample IDs...
‚úÖ Submission Generated: submission_price_prediction.csv
   sample_id  prediction
0     100179   11.130498
1     245611   17.857588
2     146263   19.569834
3      95658    4.072639
4      36806   11.053926


651