<a href="https://colab.research.google.com/github/uday862/amazon_ml_hackathon/blob/main/amazon_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import pandas as pd
import numpy as np

In [30]:
# cell 1
import pandas as pd
import numpy as np

# Load data
train = pd.read_csv("/content/train.csv")
test  = pd.read_csv("/content/test.csv")

print("Train shape:", train.shape)
print("Test shape :", test.shape)
train
test

Train shape: (75000, 4)
Test shape : (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...
...,...,...,...
74995,93616,Item Name: Good Seasons Zezty Italian Salad Dr...,https://m.media-amazon.com/images/I/51e9H27lgv...
74996,249434,"Item Name: Colombina Swirled Love Tiger Pops, ...",https://m.media-amazon.com/images/I/61IpkExmVt...
74997,162217,"Item Name: Kerns, Guava Nectar, 11.5 Fl Oz Can...",https://m.media-amazon.com/images/I/A1NMggyCLz...
74998,230487,Item Name: NY SPICE SHOP Licorice Candy - 1 Po...,https://m.media-amazon.com/images/I/81P69kEP5q...


In [31]:
#cell 2

import re

def extract_value(text):
    match = re.search(r'Value:\s*([\d\.]+)', str(text))
    if match:
        return float(match.group(1))
    return np.nan

def extract_unit(text):
    match = re.search(r'Unit:\s*([A-Za-z]+)', str(text))
    if match:
        return match.group(1)
    return 'Other'

def extract_pack_quantity(text):
    match = re.search(r'Pack of (\d+)', str(text))
    if match:
        return int(match.group(1))
    match = re.search(r'(\d+)\s*per case', str(text))
    if match:
        return int(match.group(1))
    return 1

for df in [train, test]:
    df['Value'] = df['catalog_content'].apply(extract_value)
    df['Unit'] = df['catalog_content'].apply(extract_unit)
    df['pack_quantity'] = df['catalog_content'].apply(extract_pack_quantity)


In [32]:
#cell 3 Fill missing Value by median per Unit
train['Value'] = train.groupby('Unit')['Value'].transform(lambda x: x.fillna(x.median()))
test['Value']  = test.groupby('Unit')['Value'].transform(lambda x: x.fillna(train['Value'].median()))

# Map units
unit_map = {
    'oz': 'Ounce', 'Ounce': 'Ounce', 'Ounces': 'Ounce', 'ounce': 'Ounce',
    'count': 'Count', 'Count': 'Count', 'each': 'Count', 'Each': 'Count',
    'pound': 'Pound', 'Pound': 'Pound', 'lb': 'Pound', 'lbs': 'Pound'
}

for df in [train, test]:
    df['Unit'] = df['Unit'].map(unit_map).fillna('Other')


In [33]:
# cell 4
train['text_length'] = train['catalog_content'].apply(len)
train['num_words']   = train['catalog_content'].apply(lambda x: len(str(x).split()))

test['text_length'] = test['catalog_content'].apply(len)
test['num_words']   = test['catalog_content'].apply(lambda x: len(str(x).split()))


In [34]:
# cell 5
combined = pd.concat([train, test], sort=False)
combined = pd.get_dummies(combined, columns=['Unit'], prefix='Unit')

train = combined.iloc[:len(train)].copy()
test  = combined.iloc[len(train):].copy()


In [35]:
# cell 6 Clip extreme prices (reduce outlier effect)
train['price_clipped'] = train['price'].clip(lower=0.1, upper=train['price'].quantile(0.99))

# Log-transform for LightGBM
train['price_log'] = np.log1p(train['price_clipped'])

# Derived numeric features
train['value_x_pack']   = train['Value'] * train['pack_quantity']
train['value_per_word'] = train['Value'] / (train['num_words']+1)

test['value_x_pack']   = test['Value'] * test['pack_quantity']
test['value_per_word'] = test['Value'] / (test['num_words']+1)


In [36]:
# cell : 7
numeric_features = [
    'Value', 'pack_quantity', 'text_length', 'num_words',
    'value_x_pack', 'value_per_word'
]

unit_features = [col for col in train.columns if col.startswith('Unit_')]

X_train_numeric = train[numeric_features + unit_features].astype(np.float32).values
X_test_numeric  = test[numeric_features + unit_features].astype(np.float32).values

print("Numeric feature shape:", X_train_numeric.shape)


Numeric feature shape: (75000, 10)


In [37]:
# cell 8
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_text = tfidf.fit_transform(train['catalog_content'].fillna(''))
X_test_text  = tfidf.transform(test['catalog_content'].fillna(''))

print("TF-IDF shape:", X_train_text.shape)


TF-IDF shape: (75000, 20000)


In [38]:
# cell 9
from scipy.sparse import hstack

X_train_final = hstack([X_train_text, X_train_numeric])
X_test_final  = hstack([X_test_text, X_test_numeric])

print("Final train shape:", X_train_final.shape)
print("Final test shape :", X_test_final.shape)


Final train shape: (75000, 20010)
Final test shape : (75000, 20010)


In [39]:
# cell 10
from sklearn.model_selection import train_test_split

y = train['price_log'].values
X_train, X_val, y_train, y_val = train_test_split(X_train_final, y, test_size=0.2, random_state=42)


In [None]:
# ------------------------------
# Cell 11: Train LightGBM with early stopping
# ------------------------------
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import numpy as np

# ------------------------------
# Convert to CSR format for proper indexing (already done in cell 9 for X_train_final and X_test_final)
# ------------------------------
# X_train_final_csr = X_train_final.tocsr() # This is already done in cell 9
# X_test_final_csr  = X_test_final.tocsr()  # This is already done in cell 9

# ------------------------------
# Create LightGBM datasets
# ------------------------------
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val   = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# ------------------------------
# LightGBM parameters
# ------------------------------
params = {
    'objective': 'regression',
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}

# ------------------------------
# Train model with early stopping using callbacks
# ------------------------------
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)]
)

# ------------------------------
# Predictions + evaluation on validation set
# ------------------------------
y_val_pred_log = gbm.predict(X_val, num_iteration=gbm.best_iteration)
y_val_pred = np.expm1(y_val_pred_log)  # inverse log1p
y_val_true = np.expm1(y_val)

# MAE
mae = mean_absolute_error(y_val_true, y_val_pred)
print("Validation MAE:", mae)

# SMAPE
def smape(y_true, y_pred):
    # Avoid division by zero by adding a small epsilon or checking for zero
    denominator = np.abs(y_true) + np.abs(y_pred)
    # Use a small epsilon to avoid division by zero where both are zero
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (denominator + 1e-8))


smape_score = smape(y_val_true, y_val_pred)
print("Validation SMAPE:", smape_score)

# ------------------------------
# Prediction on the test set for submission
# ------------------------------
y_test_pred_log = gbm.predict(X_test_final, num_iteration=gbm.best_iteration)
y_test_pred = np.expm1(y_test_pred_log) # inverse log1p

Training until validation scores don't improve for 50 rounds
[50]	train's l1: 0.583468	val's l1: 0.602895
[100]	train's l1: 0.543136	val's l1: 0.569947
[150]	train's l1: 0.521844	val's l1: 0.554828
[200]	train's l1: 0.506914	val's l1: 0.546311
[250]	train's l1: 0.494757	val's l1: 0.5402
[300]	train's l1: 0.484833	val's l1: 0.535633
[350]	train's l1: 0.476138	val's l1: 0.532967
[400]	train's l1: 0.468063	val's l1: 0.529922
[450]	train's l1: 0.460978	val's l1: 0.527557
[500]	train's l1: 0.454353	val's l1: 0.525695
[550]	train's l1: 0.448176	val's l1: 0.523938
[600]	train's l1: 0.442459	val's l1: 0.522223
[650]	train's l1: 0.436763	val's l1: 0.52083
[700]	train's l1: 0.431234	val's l1: 0.519557
[750]	train's l1: 0.426068	val's l1: 0.51851
[800]	train's l1: 0.421348	val's l1: 0.517472
[850]	train's l1: 0.416862	val's l1: 0.516544
[900]	train's l1: 0.412613	val's l1: 0.515825
[950]	train's l1: 0.408548	val's l1: 0.515163
[1000]	train's l1: 0.40443	val's l1: 0.514656
[1050]	train's l1: 0.400

In [None]:
# generating the csv file
submission = test[['sample_id']].copy()
submission['price'] = y_test_pred

submission.to_csv("test_out.csv", index=False)
print("Submission file created: test_out.csv")
