In [2]:


# --- Core Libraries ---
import pandas as pd
import numpy as np
import os
import re
from tqdm.notebook import tqdm
import joblib

# --- Mount Google Drive ---
from google.colab import drive
print("Connecting to Google Drive...")
drive.mount('/content/drive')

# --- Define Project Paths using YOUR folder structure ---
PROJECT_DIR = "/content/drive/MyDrive/Colab_Notebooks/amazon-ml-challenge/"
DATA_DIR = os.path.join(PROJECT_DIR, 'data/')
IMAGE_DIR = os.path.join(DATA_DIR, 'images/')
FEATURES_DIR = os.path.join(PROJECT_DIR, 'features_advanced1/')
MODELS_DIR = os.path.join(PROJECT_DIR, 'models_advanced1/')
os.makedirs(FEATURES_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

print("\n✅ Setup Complete! All paths are set.")

Connecting to Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

✅ Setup Complete! All paths are set.


In [4]:
!pip install rapidfuzz


Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m179.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.1


In [5]:
# -------------------------------
# Part 2: Imports & device
# -------------------------------
import os, re, numpy as np, pandas as pd
from tqdm import tqdm
from PIL import Image

# ML & preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# LightGBM
import lightgbm as lgb

# Image features
import torch
import timm

# Text features
from sentence_transformers import SentenceTransformer

# Fuzzy matching for brands
from rapidfuzz import process

# Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device in use:", device)


Device in use: cuda


In [6]:
# -------------------------------
# Part 3: Load data
# -------------------------------
train_path = os.path.join(DATA_DIR, 'train.csv')
test_path = os.path.join(DATA_DIR, 'test.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("✅ Train shape:", train_df.shape)
print("✅ Test shape:", test_df.shape)
print("\nTrain columns:", train_df.columns.tolist())
print("\nTest columns:", test_df.columns.tolist())

# Quick check of data
display(train_df.head())
display(test_df.head())


✅ Train shape: (75000, 4)
✅ Test shape: (75000, 3)

Train columns: ['sample_id', 'catalog_content', 'image_link', 'price']

Test columns: ['sample_id', 'catalog_content', 'image_link']


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...


In [7]:

import re
from rapidfuzz import process

# Regex patterns
item_name_re = re.compile(r'Item Name: (.*?)(Bullet Point|Product Description|Value:|$)', re.DOTALL)
bullet_re = re.compile(r'Bullet Point \d+: (.*?)(Bullet Point|Value:|$)', re.DOTALL)
value_re = re.compile(r'Value: ([\d\.]+)')
unit_re = re.compile(r'Unit: ([\w\s\.-]+)', re.DOTALL)

# Example brand list (customize if needed)
BRANDS = ['bear creek', 'la victoria', 'salerno', "judee's", 'kedem', 'vlasic', 'rani', 'nike', 'apple']

# Function to parse one row
def parse_row(row):
    content = str(row['catalog_content'])
    data = {'sample_id': row['sample_id']}

    # Item Name
    name_match = item_name_re.search(content)
    data['item_name'] = name_match.group(1).strip() if name_match else ''

    # Bullet points
    bullets = bullet_re.findall(content)
    data['bullets'] = ' '.join([b[0].strip() for b in bullets])
    data['num_bullets'] = len(bullets)

    # Value and unit
    value_match = value_re.search(content)
    data['value'] = float(value_match.group(1)) if value_match else 1.0
    unit_match = unit_re.search(content)
    data['unit'] = unit_match.group(1).strip().lower() if unit_match else 'unknown'

    # Full text features
    data['full_text'] = data['item_name'] + ' ' + data['bullets']
    data['text_len'] = len(data['full_text'])
    data['word_count'] = len(data['full_text'].split())

    # Brand matching using fuzzy search
    match = process.extractOne(data['full_text'].lower(), BRANDS)
    if match:
        data['brand'] = match[0] if match[1] > 70 else 'unknown'
    else:
        data['brand'] = 'unknown'
    data['has_brand'] = 0 if data['brand']=='unknown' else 1

    return data

# Function to parse entire dataframe
def parse_catalog(df):
    return df.apply(parse_row, axis=1, result_type='expand')

# Apply parsing
train_structured = parse_catalog(train_df)
test_structured = parse_catalog(test_df)

# Merge back with original dataframe
train_df = pd.merge(train_df, train_structured, on='sample_id', how='left')
test_df = pd.merge(test_df, test_structured, on='sample_id', how='left')

print("✅ Structured features extracted. Columns now include:")
print(train_df.columns.tolist())


✅ Structured features extracted. Columns now include:
['sample_id', 'catalog_content', 'image_link', 'price', 'item_name', 'bullets', 'num_bullets', 'value', 'unit', 'full_text', 'text_len', 'word_count', 'brand', 'has_brand']


In [8]:
# -------------------------------
# Part 5: Feature Engineering + Target Encoding
# -------------------------------

# Ensure no zero values for 'value'
train_df['value'] = train_df['value'].replace(0, 1)
test_df['value'] = test_df['value'].replace(0, 1)

# Target encoding function
def target_encode(df_train, df_test, col, target):
    mean_map = df_train.groupby(col)[target].mean()
    df_train[col + '_te'] = df_train[col].map(mean_map)
    df_test[col + '_te'] = df_test[col].map(mean_map).fillna(df_train[target].mean())
    return df_train, df_test

# Apply target encoding for 'brand' and 'unit'
for col in ['brand', 'unit']:
    train_df, test_df = target_encode(train_df, test_df, col, 'price')

# Additional engineered features
for df in [train_df, test_df]:
    df['text_len_ratio'] = df['text_len'] / df['value']
    df['num_words_item_name'] = df['item_name'].apply(lambda x: len(str(x).split()))
    df['num_words_bullets'] = df['bullets'].apply(lambda x: len(str(x).split()))
    df['avg_word_len'] = df['full_text'].apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split())>0 else 0)

# List of structured and engineered features
engineered_features = ['text_len_ratio','num_words_item_name','num_words_bullets','avg_word_len']
structured_features = ['value','unit_te','brand_te','text_len'] + engineered_features

print("✅ Engineered features added. Sample columns:")
print(structured_features)


✅ Engineered features added. Sample columns:
['value', 'unit_te', 'brand_te', 'text_len', 'text_len_ratio', 'num_words_item_name', 'num_words_bullets', 'avg_word_len']


In [10]:
# -------------------------------
# Part 6: Image Feature Extraction using Vision Transformer (ViT)
# -------------------------------

import timm
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import torch

def extract_image_features_vit(df, image_dir, device):
    # Load pretrained ViT model without classifier head
    model = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=0).to(device)
    model.eval()

    # Preprocessing pipeline
    config = model.default_cfg
    preprocess = transforms.Compose([
        transforms.Resize(config['input_size'][1:]),
        transforms.CenterCrop(config['input_size'][1:]),
        transforms.ToTensor(),
        transforms.Normalize(mean=config['mean'], std=config['std'])
    ])

    all_features = []
    for sample_id in tqdm(df['sample_id'], desc="Processing Images"):
        path = os.path.join(image_dir, f'{sample_id}.jpg')
        try:
            img = Image.open(path).convert('RGB')
            tensor = preprocess(img).unsqueeze(0).to(device)
            with torch.no_grad():
                feat = model.forward_features(tensor).mean(dim=1)
            all_features.append(feat.squeeze().cpu().numpy())
        except:
            # In case of missing or corrupted image, use zeros
            all_features.append(np.zeros(768))

    return np.array(all_features)

# Device selection
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Extract features
train_image_features = extract_image_features_vit(train_df, IMAGE_DIR, device)
test_image_features = extract_image_features_vit(test_df, IMAGE_DIR, device)

print(f"✅ Image features extracted. Shape: {train_image_features.shape}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Processing Images: 100%|██████████| 75000/75000 [00:14<00:00, 5028.65it/s]
Processing Images: 100%|██████████| 75000/75000 [00:13<00:00, 5615.77it/s]


✅ Image features extracted. Shape: (75000, 768)


In [11]:
# -------------------------------
# Part 7: Text Feature Extraction using SBERT
# -------------------------------

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

def extract_text_features_sbert(df, device):
    # Load pretrained SBERT model
    model = SentenceTransformer('all-mpnet-base-v2', device=device)

    corpus = df['full_text'].tolist()
    embeddings = model.encode(corpus, show_progress_bar=True, batch_size=128)

    return np.array(embeddings)

# Extract features
train_text_features = extract_text_features_sbert(train_df, device)
test_text_features = extract_text_features_sbert(test_df, device)

print(f"✅ Text features extracted. Shape: {train_text_features.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

Batches:   0%|          | 0/586 [00:00<?, ?it/s]

✅ Text features extracted. Shape: (75000, 768)


In [12]:
# -------------------------------
# Part 8: PCA on Image & Text Embeddings + Combine Features
# -------------------------------

from sklearn.decomposition import PCA
import numpy as np

# Apply PCA to reduce dimensions to 256 for faster training
pca_image = PCA(n_components=256, random_state=42)
pca_text = PCA(n_components=256, random_state=42)

train_image_features_pca = pca_image.fit_transform(train_image_features)
test_image_features_pca = pca_image.transform(test_image_features)

train_text_features_pca = pca_text.fit_transform(train_text_features)
test_text_features_pca = pca_text.transform(test_text_features)

# Structured features
engineered_features = ['text_len_ratio','num_words_item_name','num_words_bullets','avg_word_len']
structured_features = ['value','unit_te','brand_te','text_len'] + engineered_features

# Combine structured + image + text features
X = np.hstack([train_df[structured_features].values, train_image_features_pca, train_text_features_pca])
X_test = np.hstack([test_df[structured_features].values, test_image_features_pca, test_text_features_pca])

# Target variable (log scale)
y = np.log1p(train_df['price'] / train_df['value'])

print(f"✅ Features combined. X shape: {X.shape}, X_test shape: {X_test.shape}, y shape: {y.shape}")


  explained_variance_ratio_ = explained_variance_ / total_var


✅ Features combined. X shape: (75000, 520), X_test shape: (75000, 520), y shape: (75000,)


In [13]:
# -------------------------------
# Part 9: SMAPE evaluation + K-Fold training with progress
# -------------------------------

import lightgbm as lgb
from sklearn.model_selection import KFold
import numpy as np

# SMAPE evaluation function for LightGBM
def smape_lgb(y_pred, dataset):
    y_true = dataset.get_label()
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_pred) + np.abs(y_true)) / 2
    smape = np.mean(np.where(denominator == 0, 0, numerator / denominator)) * 100
    return 'SMAPE', smape, False  # False -> lower is better

# K-Fold setup
kf = KFold(n_splits=3, shuffle=True, random_state=42)
oof_preds = np.zeros(X.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"\n--- Fold {fold}/{kf.n_splits} ---")
    X_tr, y_tr = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    params = {
        'objective': 'regression_l1',
        'metric': 'mae',
        'learning_rate': 0.01,
        'num_leaves': 128,
        'verbose': -1
    }

    # Callback to print SMAPE per 50 iterations
    def print_smape_callback(env):
        for data_name, eval_name, result, _ in env.evaluation_result_list:
            if eval_name == 'SMAPE':
                if env.iteration % 50 == 0 or env.iteration == env.end_iteration:
                    print(f"Iteration {env.iteration}, {data_name} {eval_name}: {result:.4f}%")

    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        feval=smape_lgb,
        callbacks=[lgb.early_stopping(stopping_rounds=50), print_smape_callback]
    )

    # Store OOF predictions
    oof_preds[val_idx] = gbm.predict(X_val, num_iteration=gbm.best_iteration)

    # Fold SMAPE
    fold_smape = np.mean(np.abs(np.expm1(y_val) - np.expm1(oof_preds[val_idx])) /
                         ((np.expm1(y_val) + np.expm1(oof_preds[val_idx])) / 2)) * 100
    print(f"✅ Fold {fold} SMAPE: {fold_smape:.4f}%")



--- Fold 1/3 ---
Iteration 0, valid_0 SMAPE: 79.5318%
Training until validation scores don't improve for 50 rounds
Iteration 50, valid_0 SMAPE: 62.1085%
Iteration 100, valid_0 SMAPE: 54.2903%
Iteration 150, valid_0 SMAPE: 50.4296%
Iteration 200, valid_0 SMAPE: 48.2809%
Iteration 250, valid_0 SMAPE: 46.9872%
Iteration 300, valid_0 SMAPE: 46.1616%
Iteration 350, valid_0 SMAPE: 45.5866%
Iteration 400, valid_0 SMAPE: 45.1393%
Iteration 450, valid_0 SMAPE: 44.8275%
Iteration 500, valid_0 SMAPE: 44.5875%
Iteration 550, valid_0 SMAPE: 44.4048%
Iteration 600, valid_0 SMAPE: 44.2386%
Iteration 650, valid_0 SMAPE: 44.1100%
Iteration 700, valid_0 SMAPE: 43.9922%
Iteration 750, valid_0 SMAPE: 43.8805%
Iteration 800, valid_0 SMAPE: 43.8045%
Iteration 850, valid_0 SMAPE: 43.7183%
Iteration 900, valid_0 SMAPE: 43.6536%
Iteration 950, valid_0 SMAPE: 43.5890%
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 0.286664	valid_0's SMAPE: 43.5245
✅ Fold 1 SMAPE: 57.6039%

--- Fold 2/3 --

In [14]:
# -------------------------------
# Part 10: Predict on Test Set + Save Submission

# If you want ensemble, average predictions of all folds
test_preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Create submission DataFrame
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],  # Make sure your test_df has this column
    'price': np.expm1(test_preds) * test_df['value'].values  # revert log scaling & scale by 'value'
})

# Save submission to Google Drive
submission_path = os.path.join(PROJECT_DIR, "submission_amazon_ml_challenge.csv")
submission.to_csv(submission_path, index=False)
print(f"✅ Submission saved at: {submission_path}")

# Display first few rows
submission.head()


✅ Submission saved at: /content/drive/MyDrive/Colab_Notebooks/amazon-ml-challenge/submission_amazon_ml_challenge.csv


Unnamed: 0,sample_id,price
0,100179,11.907859
1,245611,11.643384
2,146263,24.220495
3,95658,10.02578
4,36806,18.542529
