In [None]:
import pandas as pd, numpy as np, torch, joblib, requests
from PIL import Image
from io import BytesIO
from transformers import BertTokenizer, BertModel
import timm, torchvision.transforms as T

In [None]:
import os


In [None]:
# Load the raw train/test tables from Drive using the pure-Python CSV parser.
# Training rows are kept only when the text, the image link and the target
# price are all present; the test table is left untouched.
train_df = pd.read_csv(
    "/content/drive/MyDrive/FinalAmazML/train.csv",
    engine='python',
).dropna(subset=["catalog_content", "image_link", "price"])

test_df = pd.read_csv(
    "/content/drive/MyDrive/FinalAmazML/test.csv",
    engine='python',
)

In [None]:
#Encode Text with DistilBERT
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm
import numpy as np

# Prefer CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
bert_model = bert_model.to(device)
# The model is only used for inference, so put it in evaluation mode.
bert_model.eval()

# Batched encoding function
def batch_encode_texts(texts, batch_size=64):
    """Encode texts with the DistilBERT model, one batch at a time.

    Reads the module-level ``tokenizer``, ``bert_model`` and ``device``
    (no ``global`` statement is needed because they are only read).

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...). It is
            materialised into a list first so that batches are sliced by
            position and the tokenizer always receives a plain list.
        batch_size: Number of texts tokenized and run through the model
            per step.

    Returns:
        ``np.ndarray`` with one row per input text, holding the last-layer
        hidden state at token position 0. Empty input yields ``np.array([])``.
    """
    texts = list(texts)
    chunks = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Encoding Text"):
        batch = texts[start:start + batch_size]
        # Pad to the longest text in the batch and truncate at 128 tokens.
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        ).to(device)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # Keep only position 0 of every sequence and move it off the device.
        chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    if not chunks:
        return np.array([])
    # A single concatenate instead of growing a Python list row by row.
    return np.concatenate(chunks, axis=0)

# Training text embeddings: computed and cached on the first run, reloaded
# from the cached .npy file on later runs.
if not os.path.exists("/content/drive/MyDrive/FinalAmazML/text_emb.npy"):
    print("ðŸš€ Encoding text...")
    text_list = train_df["catalog_content"].tolist()
    text_embeddings = batch_encode_texts(text_list)
    # One embedding row per DataFrame row.
    train_df["text_emb"] = list(text_embeddings)
    np.save("/content/drive/MyDrive/FinalAmazML/text_emb.npy", text_embeddings)
else:
    print("âœ… Loaded saved text embeddings")
    text_embeddings = np.load("/content/drive/MyDrive/FinalAmazML/text_emb.npy")
    train_df["text_emb"] = list(text_embeddings)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

✅ Loaded saved text embeddings


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): cells earlier in this notebook already read files under
# /content/drive/MyDrive/...; on a fresh runtime this mount presumably has to
# run before them — confirm the intended execution order.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Encode Image with EfficientNet
import torch
from PIL import Image
import requests
from io import BytesIO
import timm
import torchvision.transforms as T
from tqdm import tqdm
import numpy as np

# Choose the compute device: CUDA when torch reports it, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Pretrained EfficientNet-B0 from timm, moved to the device and switched to
# evaluation mode because it is only used for feature extraction.
efficientnet_model = timm.create_model('efficientnet_b0', pretrained=True)
efficientnet_model = efficientnet_model.to(device)
efficientnet_model.eval()

# Preprocessing pipeline: resize to 256, center-crop to 224, convert to a
# tensor, then normalize each channel with the mean/std given below.
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Batch encode function
def batch_encode_images(image_links, batch_size=32, timeout=10):
    """Download images and encode them with EfficientNet, one batch at a time.

    Reads the module-level ``efficientnet_model``, ``transform`` and
    ``device`` (no ``global`` statement is needed because they are only read).

    Args:
        image_links: Iterable of image URLs.
        batch_size: Number of images downloaded and encoded per step.
        timeout: Seconds passed to each HTTP request as its timeout, so a
            stalled server cannot hang the whole run.

    Returns:
        ``np.ndarray`` with one row per link. An image that cannot be
        downloaded or decoded is replaced by an all-zero 3x224x224 input, so
        the output stays aligned with ``image_links``. Empty input yields
        ``np.array([])``.
    """
    image_links = list(image_links)
    # Stand-in input for failed loads; torch.stack copies it into each batch.
    placeholder = torch.zeros(3, 224, 224)
    chunks = []
    # One session for the whole run so connections are reused across requests.
    with requests.Session() as session:
        for start in tqdm(range(0, len(image_links), batch_size), desc="Encoding Images"):
            batch_images = []
            for link in image_links[start:start + batch_size]:
                try:
                    response = session.get(link, timeout=timeout)
                    img = Image.open(BytesIO(response.content)).convert('RGB')
                    batch_images.append(transform(img))
                except Exception:
                    # Best-effort by design: bad URLs, network errors and
                    # undecodable payloads become the zero placeholder.
                    # KeyboardInterrupt/SystemExit still propagate.
                    batch_images.append(placeholder)

            batch = torch.stack(batch_images).to(device)
            with torch.no_grad():
                features = efficientnet_model.forward_features(batch)
                # Global average pooling over the two trailing axes when the
                # backbone returns a 4-D feature map.
                if features.ndim == 4:
                    features = features.mean(dim=[2, 3])
            chunks.append(features.cpu().numpy())
    if not chunks:
        return np.array([])
    return np.concatenate(chunks, axis=0)


# Training image embeddings: computed and cached on the first run, reloaded
# from the cached .npy file on later runs.
if not os.path.exists("/content/drive/MyDrive/FinalAmazML/img_emb.npy"):
    print("ðŸš€ Encoding images...")
    image_list = train_df["image_link"].tolist()
    img_embeddings = batch_encode_images(image_list)
    # One embedding row per DataFrame row.
    train_df["img_emb"] = list(img_embeddings)
    np.save("/content/drive/MyDrive/FinalAmazML/img_emb.npy", img_embeddings)
else:
    print("âœ… Loaded saved image embeddings")
    img_embeddings = np.load("/content/drive/MyDrive/FinalAmazML/img_emb.npy")
    train_df["img_emb"] = list(img_embeddings)


Using device: cuda


model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

✅ Loaded saved image embeddings


In [None]:
# Save image embeddings
# Writes the in-memory `img_embeddings` array to the same img_emb.npy path
# that the image-encoding cell checks for and loads from.
np.save("/content/drive/MyDrive/FinalAmazML/img_emb.npy", img_embeddings)

In [None]:
import torch
# Report whether torch can see a CUDA device and, if so, which one.
print(torch.cuda.is_available())  # Should be True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Should show your GPU
else:
    # get_device_name(0) raises when no CUDA device is present. The encoder
    # cells fall back to the CPU, so report it instead of failing the cell.
    print("No CUDA device available; running on CPU")


True
Tesla T4


In [None]:
#Extract Structured Features
def extract_quantity(text):
    """Parse the number that follows the first ``Value:`` marker in ``text``.

    The value is read from right after the marker up to the end of that line
    and stripped of surrounding whitespace before conversion.

    Args:
        text: Catalog text. Non-string inputs (for example ``None`` or NaN)
            are accepted and treated as "no quantity found".

    Returns:
        The parsed value as a ``float``, or ``1.0`` when ``text`` is not a
        string, contains no ``Value:`` marker, or the field is not a number.
    """
    try:
        return float(text.split("Value:")[1].split("\n")[0].strip())
    except Exception:
        # Deliberate best-effort default. Catching Exception rather than
        # using a bare except lets KeyboardInterrupt/SystemExit propagate.
        return 1.0

# Derive a numeric `quantity` column from each row's catalog text;
# extract_quantity returns 1.0 whenever no value can be parsed.
train_df["quantity"] = train_df["catalog_content"].apply(extract_quantity)


In [None]:
#Train Model
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Feature matrix: text embedding | image embedding | quantity, side by side.
X = np.hstack([
    np.vstack(train_df["text_emb"]),
    np.vstack(train_df["img_emb"]),
    train_df["quantity"].values.reshape(-1, 1)
])
# The regression target is log1p(price).
y = np.log1p(train_df["price"].values)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
if os.path.exists("/content/drive/MyDrive/FinalAmazML/model.pkl") and os.path.exists("/content/drive/MyDrive/FinalAmazML/train_processed.csv"):
    print("âœ… Loaded saved model")
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load a model.pkl that this notebook wrote.
    model = joblib.load("/content/drive/MyDrive/FinalAmazML/model.pkl")
    # train_df is intentionally NOT re-read from train_processed.csv: after a
    # CSV round trip the text_emb/img_emb columns come back as strings, which
    # would replace the in-memory arrays with unusable text.
else:
    print("ðŸš€ Training model...")
    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train)
    joblib.dump(model, "/content/drive/MyDrive/FinalAmazML/model.pkl")
    train_df.to_csv("/content/drive/MyDrive/FinalAmazML/train_processed.csv", index=False)

# Use the hold-out split that was otherwise computed and never read.
# NOTE(review): for a cached model this is only a true validation score if
# the model was fitted on the same split — confirm.
val_rmse = float(np.sqrt(np.mean((model.predict(X_val) - y_val) ** 2)))
print(f"Validation RMSE (log1p price): {val_rmse:.4f}")



✅ Loaded saved model and processed training data


In [None]:
import joblib

# Save trained model
# Writes to the same model.pkl path that the training cell checks for and loads.
joblib.dump(model, "/content/drive/MyDrive/FinalAmazML/model.pkl")

# Save processed training DataFrame
# NOTE(review): text_emb/img_emb are assigned from list(<embedding array>)
# earlier in this notebook, so they presumably hold one array per row; CSV
# cells are text, so confirm this file is not expected to restore them.
train_df.to_csv("/content/drive/MyDrive/FinalAmazML/train_processed.csv", index=False)

In [None]:
#Predict on Test Set
# Uses batch_encode_texts, batch_encode_images and extract_quantity defined
# in earlier cells, plus the fitted `model`.
# Restart-safe: each embedding array is cached as a .npy file and reloaded
# when that file already exists.

# Encode or load test text embeddings
if os.path.exists("/content/drive/MyDrive/FinalAmazML/test_text_emb.npy"):
    print("âœ… Loaded saved test text embeddings")
    test_text_embeddings = np.load("/content/drive/MyDrive/FinalAmazML/test_text_emb.npy")
    test_df["text_emb"] = list(test_text_embeddings)
else:
    print("ðŸš€ Encoding test text...")
    # Missing catalog text is encoded as the empty string so no row is dropped.
    test_text_embeddings = batch_encode_texts(test_df["catalog_content"].fillna("").tolist())
    test_df["text_emb"] = list(test_text_embeddings)
    np.save("/content/drive/MyDrive/FinalAmazML/test_text_emb.npy", test_text_embeddings)

# Encode or load test image embeddings
if os.path.exists("/content/drive/MyDrive/FinalAmazML/test_img_emb.npy"):
    print("âœ… Loaded saved test image embeddings")
    test_img_embeddings = np.load("/content/drive/MyDrive/FinalAmazML/test_img_emb.npy")
    test_df["img_emb"] = list(test_img_embeddings)
else:
    print("ðŸš€ Encoding test images...")
    # Missing links become "" here; the call still returns one row per link.
    test_img_embeddings = batch_encode_images(test_df["image_link"].fillna("").tolist())
    test_df["img_emb"] = list(test_img_embeddings)
    np.save("/content/drive/MyDrive/FinalAmazML/test_img_emb.npy", test_img_embeddings)

# Extract structured features
test_df["quantity"] = test_df["catalog_content"].apply(extract_quantity)

# Build test feature matrix (same column order as the training matrix:
# text embedding | image embedding | quantity)
X_test = np.hstack([
    np.vstack(test_df["text_emb"]),
    np.vstack(test_df["img_emb"]),
    test_df["quantity"].values.reshape(-1, 1)
])

# Predict prices. The model was fitted on log1p(price), so invert with expm1.
# expm1 is negative for any raw prediction below 0 and a price cannot be
# negative, so floor the result at 0.
preds = np.clip(np.expm1(model.predict(X_test)), 0, None)

# Save predictions (optional)
np.save("/content/drive/MyDrive/FinalAmazML/test_preds.npy", preds)


🚀 Encoding test text...


Encoding Text: 100%|██████████| 1172/1172 [07:38<00:00,  2.55it/s]


🚀 Encoding test images...


Encoding Images: 100%|██████████| 2344/2344 [2:08:13<00:00,  3.28s/it]


In [None]:
# Build the two-column submission (sample_id, price) from the test ids and
# the predicted prices, write it to Drive without the index, then confirm.
output = test_df[["sample_id"]].assign(price=preds)
output.to_csv("/content/drive/MyDrive/FinalAmazML/test_out.csv", index=False)
print("âœ… Saved to test_out.csv")


✅ Saved to test_out.csv
