# Baseline Model - Text Features Only

This notebook implements a baseline model using only text features from catalog_content.


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the training data from the preprocessed CSV (path is relative to the
# notebook's working directory) and report its (rows, columns) shape.
train_df = pd.read_csv('../dataset/train_preprocessed.csv')
print(f"Training data shape: {train_df.shape}")

# The presence of the 'cleaned_text' column is used as the marker that
# preprocessing has already been applied. When it is missing, the helper
# functions below are defined and the derived columns are computed here.
if 'cleaned_text' not in train_df.columns:
    print("Preprocessing not found, running preprocessing...")
    
    def extract_ipq(text):
        """Extract the item pack quantity (IPQ) from free-form catalog text.

        The text is lower-cased and the patterns are tried in order; the first
        one that matches wins: "pack of N", "N pack", "quantity: N", "ipq: N",
        and finally "A x B", which is returned as the product A * B.

        Args:
            text: Raw catalog text. Missing values (None, NaN) and any other
                non-string input are treated as carrying no quantity info.

        Returns:
            int: The detected quantity, or 1 when no pattern matches.
        """
        # Missing catalog entries arrive from pandas as float NaN, which has no
        # .lower(); fall back to the default quantity instead of raising.
        if not isinstance(text, str):
            return 1

        patterns = [
            r'pack of (\d+)',
            r'(\d+) pack',
            r'quantity[\s:]+(\d+)',
            r'ipq[\s:]+(\d+)',
            r'(\d+)\s*x\s*(\d+)',
        ]

        # Lower-case once instead of once per pattern.
        lowered = text.lower()

        for pattern in patterns:
            match = re.search(pattern, lowered)
            if match is None:
                continue
            if len(match.groups()) == 2:
                # "A x B" style: total units is the product of both numbers.
                return int(match.group(1)) * int(match.group(2))
            return int(match.group(1))
        return 1

    def clean_text(text):
        """Normalize catalog text for vectorization.

        Missing values (as judged by ``pd.isna``) become an empty string.
        Otherwise the text is lower-cased, every character that is not a
        letter, digit or whitespace is replaced by a space, and runs of
        whitespace are collapsed to a single space with the ends trimmed.
        """
        if pd.isna(text):
            return ""
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
        return re.sub(r'\s+', ' ', alnum_only).strip()

    train_df['cleaned_text'] = train_df['catalog_content'].apply(clean_text)
    train_df['ipq'] = train_df['catalog_content'].apply(extract_ipq)
    train_df['text_length'] = train_df['cleaned_text'].str.len()
    train_df['word_count'] = train_df['cleaned_text'].str.split().str.len()

print("Data ready for modeling!")


In [None]:
# Prepare features: TF-IDF over the cleaned text plus three numeric columns,
# stacked into one sparse matrix and split 80/20 into train/validation sets.
print("Preparing features...")

# TF-IDF features: unigrams and bigrams, English stop words removed, terms
# kept only if they occur in at least 5 documents and at most 95% of them,
# capped at the 5000 top terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
    max_df=0.95
)

# An empty cleaned_text is written to CSV as an empty field and read back by
# pandas as NaN, which TfidfVectorizer rejects as an invalid document.
# Normalize missing entries to empty strings before vectorizing.
corpus = train_df['cleaned_text'].fillna('').astype(str)
tfidf_features = tfidf.fit_transform(corpus)
print(f"TF-IDF features shape: {tfidf_features.shape}")

# Numerical features
numerical_features = ['ipq', 'text_length', 'word_count']
X_numerical = train_df[numerical_features].values

# Combine features. Request CSR explicitly: hstack would otherwise return a
# COO matrix, which does not support the row indexing used when splitting.
from scipy.sparse import hstack
X_combined = hstack([tfidf_features, X_numerical], format='csr')
print(f"Combined features shape: {X_combined.shape}")

# Target variable
y = train_df['price'].values
print(f"Target shape: {y.shape}")

# Split data (fixed seed so the validation set is reproducible)
X_train, X_val, y_train, y_val = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)
print(f"Train set: {X_train.shape}, Validation set: {X_val.shape}")


In [None]:
# SMAPE metric function
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        float: The SMAPE score. Pairs where both the true and the predicted
        value are zero contribute an error of 0 instead of NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # A zero denominator means both values are exactly 0, i.e. a perfect
    # prediction; leave those entries at 0 rather than evaluating 0/0 = NaN,
    # which would turn the whole mean into NaN.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Train LightGBM model on the training split and score it on the validation
# split with MAE and SMAPE.
print("Training LightGBM model...")
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=8,
    num_leaves=31,
    subsample=0.8,
    # LightGBM only performs row bagging when the bagging frequency is > 0;
    # with the default of 0 the subsample=0.8 setting above is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_val)

lgb_mae = mean_absolute_error(y_val, lgb_pred)
lgb_smape = smape(y_val, lgb_pred)

print(f"LightGBM - MAE: {lgb_mae:.2f}, SMAPE: {lgb_smape:.2f}%")


In [None]:
# Fit an XGBoost regressor on the training split, then report its MAE and
# SMAPE on the validation split.
print("Training XGBoost model...")

xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Validation-set predictions and metrics
xgb_pred = xgb_model.predict(X_val)
xgb_mae = mean_absolute_error(y_val, xgb_pred)
xgb_smape = smape(y_val, xgb_pred)

print(f"XGBoost - MAE: {xgb_mae:.2f}, SMAPE: {xgb_smape:.2f}%")


In [None]:
# Ensemble: unweighted average of the LightGBM and XGBoost validation
# predictions (no additional model is fitted here).
print("Training ensemble model...")
ensemble_pred = np.add(lgb_pred, xgb_pred) / 2

# Score the averaged predictions with the same metrics as the single models
ensemble_smape = smape(y_val, ensemble_pred)
ensemble_mae = mean_absolute_error(y_val, ensemble_pred)

print(f"Ensemble - MAE: {ensemble_mae:.2f}, SMAPE: {ensemble_smape:.2f}%")

# Side-by-side recap of all three validation scores
print("\nModel Performance Summary:")
print(f"LightGBM: MAE={lgb_mae:.2f}, SMAPE={lgb_smape:.2f}%")
print(f"XGBoost: MAE={xgb_mae:.2f}, SMAPE={xgb_smape:.2f}%")
print(f"Ensemble: MAE={ensemble_mae:.2f}, SMAPE={ensemble_smape:.2f}%")


In [None]:
# Persist the fitted models and the TF-IDF vectorizer so they can be
# reloaded later without retraining.
import joblib
import os

# Make sure the output directory exists (no error if it already does)
os.makedirs('../models', exist_ok=True)

# (object, destination) pairs, written in this order
artifacts = (
    (lgb_model, '../models/lgb_baseline.pkl'),
    (xgb_model, '../models/xgb_baseline.pkl'),
    (tfidf, '../models/tfidf_vectorizer.pkl'),
)
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

print("Models and preprocessing objects saved!")
