In [2]:
import pandas as pd
import numpy as np

# Load existing features
X_train_clean = np.load('../data/processed/train_features_clean.npy')
X_test_final = np.load('../data/processed/test_features_final.npy')

print(f"Current features: {X_train_clean.shape[1]}")

# Load CSV features (check if these are already in your features)
train_text_stats = pd.read_csv('../data/processed/train_text_stats.csv')
test_text_stats = pd.read_csv('../data/processed/test_text_stats.csv')

train_advanced = pd.read_csv('../data/processed/train_advanced_text_features.csv')
test_advanced = pd.read_csv('../data/processed/test_advanced_text_features.csv')

print(f"Text stats features: {train_text_stats.shape}")
print(f"Advanced text features: {train_advanced.shape}")

# Load category features
train_cat = pd.read_csv('../data/processed/train_category_features.csv')
test_cat = pd.read_csv('../data/processed/test_category_features.csv')

print(f"Category features: {train_cat.shape}")

# Check columns
print("\nText stats columns:", train_text_stats.columns.tolist())
print("Advanced features columns:", train_advanced.columns.tolist())
print("Category columns:", train_cat.columns.tolist())

# Actually run the checks instead of leaving them to visual inspection:
#   1. train and test CSVs of the same feature set must expose the same columns,
#   2. row counts are compared with the dense matrix (a mismatch means the CSVs
#      still need the same row filtering before they can be stacked onto it),
#   3. column names shared between CSV feature sets are listed, since stacking
#      them all would feed the same named feature to the model more than once.
_csv_sets = {
    'text stats': (train_text_stats, test_text_stats),
    'advanced': (train_advanced, test_advanced),
    'category': (train_cat, test_cat),
}

print("\nSanity checks:")
for _name, (_train_df, _test_df) in _csv_sets.items():
    if list(_train_df.columns) != list(_test_df.columns):
        print(f"  WARNING: {_name} train/test columns differ")
    if len(_train_df) != X_train_clean.shape[0]:
        print(
            f"  NOTE: {_name} has {len(_train_df)} train rows, "
            f"feature matrix has {X_train_clean.shape[0]}"
        )

_all_cols = pd.Series([col for frame, _ in _csv_sets.values() for col in frame.columns])
_dupes = sorted(_all_cols[_all_cols.duplicated()].unique())
print("  Columns present in more than one CSV feature set:", _dupes if _dupes else "none")


Current features: 5109
Text stats features: (75000, 6)
Advanced text features: (75000, 28)
Category features: (75000, 13)

Text stats columns: ['item_name_length', 'item_name_word_count', 'item_name_has_numbers', 'has_bullets', 'bullet_length', 'bullet_word_count']
Advanced features columns: ['premium_score', 'premium_count', 'has_premium', 'budget_score', 'budget_count', 'has_budget', 'net_premium_score', 'material_score', 'material_count', 'has_premium_material', 'brand_score', 'has_known_brand', 'is_luxury_brand', 'unit_price_score', 'is_expensive_unit', 'text_length', 'word_count', 'item_name_length', 'has_bullet_points', 'has_organic', 'has_natural', 'has_pack', 'has_bulk', 'has_frozen', 'has_fresh', 'is_tea_related', 'is_flower_related', 'is_food_related']
Category columns: ['is_tea_beverage', 'score_tea_beverage', 'is_food_snacks', 'score_food_snacks', 'is_organic_health', 'score_organic_health', 'is_bulk_products', 'score_bulk_products', 'is_gourmet_premium', 'score_gourmet_pre

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------
# Step 1 - read every feature source from disk
# ------------------------------------------------------------
print("="*60, "LOADING FEATURES", "="*60, sep="\n")

# Dense feature matrices built earlier in the pipeline (5,109 columns)
X_train_current, X_test_current = (
    np.load('../data/processed/train_features_clean.npy'),
    np.load('../data/processed/test_features_final.npy'),
)

print(f"✓ Current features: {X_train_current.shape}")

# Hand-crafted feature tables stored as CSV, read as train/test pairs
train_text_stats, test_text_stats = (
    pd.read_csv('../data/processed/train_text_stats.csv'),
    pd.read_csv('../data/processed/test_text_stats.csv'),
)
train_advanced, test_advanced = (
    pd.read_csv('../data/processed/train_advanced_text_features.csv'),
    pd.read_csv('../data/processed/test_advanced_text_features.csv'),
)
train_cat, test_cat = (
    pd.read_csv('../data/processed/train_category_features.csv'),
    pd.read_csv('../data/processed/test_category_features.csv'),
)

# Report the shape of each train-side table
print(f"✓ Text stats: {train_text_stats.shape}")
print(f"✓ Advanced text: {train_advanced.shape}")
print(f"✓ Category: {train_cat.shape}")

# ============================================================
# STEP 2: APPLY OUTLIER MASK TO CSV FEATURES
# ============================================================
print("\n" + "="*60)
print("APPLYING OUTLIER MASK")
print("="*60)

# Load original target to recreate outlier mask.
# The dense matrix X_train_current was already filtered elsewhere; the CSV
# tables were not, so the same row filter is rebuilt here from the target.
y_train_full = np.load('../data/processed/train_target_final.npy')
y_log = np.log1p(y_train_full)

# 1.5 * IQR fences on the log1p-transformed target
Q1, Q3 = np.percentile(y_log, [25, 75])
IQR = Q3 - Q1
# NOTE: despite its name, the mask is True for rows that are KEPT (inside the fences)
outlier_mask = (y_log >= Q1 - 1.5*IQR) & (y_log <= Q3 + 1.5*IQR)

print(f"Outliers to remove: {(~outlier_mask).sum()} / {len(outlier_mask)}")

# Guard the implicit assumption that this mask reproduces the filter used to
# build X_train_current. Equal counts do not prove that the same rows were
# selected, but a differing count is a certain misalignment.
_n_kept = int(outlier_mask.sum())
if _n_kept != X_train_current.shape[0]:
    raise ValueError(
        f"Outlier mask keeps {_n_kept} rows but X_train_current has "
        f"{X_train_current.shape[0]} rows; the recreated mask does not match "
        "the filter used to build train_features_clean.npy"
    )
for _name, _frame in (
    ('train_text_stats', train_text_stats),
    ('train_advanced', train_advanced),
    ('train_cat', train_cat),
):
    if len(_frame) != len(outlier_mask):
        raise ValueError(
            f"{_name} has {len(_frame)} rows but the target has {len(outlier_mask)}"
        )

# Apply mask to CSV features (boolean positional selection, then to ndarray)
train_text_stats_clean = train_text_stats.iloc[outlier_mask].values
train_advanced_clean = train_advanced.iloc[outlier_mask].values
train_cat_clean = train_cat.iloc[outlier_mask].values

print(f"✓ After mask - Text stats: {train_text_stats_clean.shape}")
print(f"✓ After mask - Advanced: {train_advanced_clean.shape}")
print(f"✓ After mask - Category: {train_cat_clean.shape}")

# ============================================================
# STEP 3: COMBINE ALL FEATURES
# ============================================================
print("\n" + "="*60)
print("COMBINING FEATURES")
print("="*60)

# Combine train features
X_train_enhanced = np.hstack([
    X_train_current,           # 5,109 features
    train_text_stats_clean,    # 6 features
    train_advanced_clean,      # 28 features
    train_cat_clean            # 13 features
])

# Combine test features.
# Select test columns by the train column order so that every stacked column
# means the same thing in train and test; a missing column raises KeyError
# instead of silently shifting features.
# NOTE(review): 'item_name_length' appears in both the text-stats and the
# advanced CSV, so it is stacked twice. Left as is here; confirm whether the
# duplicate should be dropped.
X_test_enhanced = np.hstack([
    X_test_current,
    test_text_stats[train_text_stats.columns].values,
    test_advanced[train_advanced.columns].values,
    test_cat[train_cat.columns].values
])

# np.hstack only validates row counts within each call, so compare widths explicitly
if X_train_enhanced.shape[1] != X_test_enhanced.shape[1]:
    raise ValueError(
        f"Train has {X_train_enhanced.shape[1]} features but test has "
        f"{X_test_enhanced.shape[1]}"
    )

# Derive the counts from the arrays so the message cannot go stale
_n_base = X_train_current.shape[1]
_n_added = X_train_enhanced.shape[1] - _n_base

print(f"✓ Enhanced train features: {X_train_enhanced.shape}")
print(f"✓ Enhanced test features: {X_test_enhanced.shape}")
print(f"✓ Total features: {X_train_enhanced.shape[1]} (was {_n_base:,}, added {_n_added})")

# Save enhanced features
np.save('../data/processed/train_features_enhanced_v1.npy', X_train_enhanced)
np.save('../data/processed/test_features_enhanced_v1.npy', X_test_enhanced)

# ============================================================
# STEP 4: TRAIN LIGHTGBM WITH ENHANCED FEATURES
# ============================================================
print("\n" + "="*60)
print("TRAINING LIGHTGBM WITH ENHANCED FEATURES")
print("="*60)

# Load target.
# NOTE: this rebinds y_train_full (previously the unfiltered target used to
# build the outlier mask) to the already-filtered target file.
y_train_full = np.load('../data/processed/train_target_clean.npy')

# Split (80/20 hold-out, fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_enhanced, y_train_full, test_size=0.2, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# LightGBM parameters (adjusted for more features)
lgb_params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'n_estimators': 2000,        # upper bound; early stopping picks the actual count
    'max_depth': 7,              # Increased from 6
    'num_leaves': 40,            # Increased from 31
    'subsample': 0.8,
    'subsample_freq': 1,         # Required: with the default of 0, bagging is off and 'subsample' is ignored
    'colsample_bytree': 0.7,     # Reduced from 0.8 (more features)
    'reg_alpha': 0.8,            # Increased regularization
    'reg_lambda': 3.0,           # Increased regularization
    'min_child_samples': 25,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
}

print("\nStarting training...")
lgb_model = lgb.LGBMRegressor(**lgb_params)

# Both sets are evaluated so the log shows the train/valid gap every 100 rounds;
# training stops once the metric has not improved for 100 rounds.
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=100)
    ]
)

# ------------------------------------------------------------
# Step 5 - evaluation (section banner)
# ------------------------------------------------------------
print("\n" + "="*60, "EVALUATION", "="*60, sep="\n")

def calculate_smape(y_true, y_pred):
    """Return the SMAPE, in percent, between two log1p-space arrays.

    Both inputs are mapped back to the original scale with ``np.expm1``
    before the metric is computed:

        SMAPE = 100 * mean( |pred - true| / ((|true| + |pred|) / 2) )

    Elements where true and predicted values are both exactly zero have a
    zero denominator; they are counted as zero error instead of producing
    NaN for the whole metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values in log1p space.
    y_pred : array-like
        Predicted values in log1p space, same shape as ``y_true``.

    Returns
    -------
    float
        Mean symmetric absolute percentage error, scaled by 100.
    """
    y_true_orig = np.expm1(np.atleast_1d(y_true))
    y_pred_orig = np.expm1(np.atleast_1d(y_pred))
    numerator = np.abs(y_pred_orig - y_true_orig)
    denominator = (np.abs(y_true_orig) + np.abs(y_pred_orig)) / 2
    # Divide only where the denominator is non-zero; the remaining (0/0)
    # positions keep the 0.0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator, dtype=float),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

from sklearn.metrics import mean_absolute_error

# Reference scores of the previous model, named once instead of repeated inline
BASELINE_VAL_SMAPE = 51.39    # validation SMAPE of the previous (base-feature) model
BASELINE_LB_SMAPE = 51.825    # presumably the previous leaderboard score - TODO confirm

# Feature counts come from the arrays so the report cannot drift from the data
_n_features_prev = X_train_current.shape[1]
_n_features_now = X_train.shape[1]

train_pred = lgb_model.predict(X_train)
val_pred = lgb_model.predict(X_val)

# MAE is measured in the target's own (log1p) space; SMAPE on the original scale
train_mae = mean_absolute_error(y_train, train_pred)
val_mae = mean_absolute_error(y_val, val_pred)
val_smape = calculate_smape(y_val, val_pred)

print(f"\n📊 RESULTS WITH ENHANCED FEATURES ({_n_features_now:,} total)")
print(f"{'='*60}")
print(f"Train MAE: {train_mae:.4f}")
print(f"Val MAE: {val_mae:.4f}")
print(f"Gap: {val_mae - train_mae:.4f}")
print(f"\nValidation SMAPE: {val_smape:.2f}")
print("\n🎯 COMPARISON:")
print(f"   Previous ({_n_features_prev:,} features): {BASELINE_VAL_SMAPE:.2f} SMAPE")
print(f"   Current ({_n_features_now:,} features):  {val_smape:.2f} SMAPE")
print(f"   Improvement: {BASELINE_VAL_SMAPE - val_smape:.2f} points")

# Only write predictions/model when the hold-out score beats the baseline
if val_smape < BASELINE_VAL_SMAPE:
    print("\n✅ IMPROVED! Making test predictions...")

    # Make test predictions (model outputs log1p space; convert back)
    test_pred_log = lgb_model.predict(X_test_enhanced)
    test_pred_original = np.expm1(test_pred_log)

    # Save
    np.save('lightgbm_enhanced_test_predictions.npy', test_pred_original)
    lgb_model.booster_.save_model('lightgbm_enhanced_v1.txt')

    print("✅ Predictions saved: lightgbm_enhanced_test_predictions.npy")
    print("✅ Model saved: lightgbm_enhanced_v1.txt")

    print("\n📈 TEST PREDICTIONS:")
    print(f"   Samples: {len(test_pred_original):,}")
    print(f"   Min: ${test_pred_original.min():.2f}")
    print(f"   Max: ${test_pred_original.max():.2f}")
    print(f"   Mean: ${test_pred_original.mean():.2f}")

    print(f"\n🎯 Expected leaderboard SMAPE: ~{val_smape:.2f}")
    print(f"   (Improvement from {BASELINE_LB_SMAPE} → ~{val_smape:.2f})")
else:
    print("\n⚠️ No improvement. Consider adding TF-IDF features.")

print("="*60)


LOADING FEATURES
✓ Current features: (74758, 5109)
✓ Text stats: (75000, 6)
✓ Advanced text: (75000, 28)
✓ Category: (75000, 13)

APPLYING OUTLIER MASK
Outliers to remove: 242 / 75000
✓ After mask - Text stats: (74758, 6)
✓ After mask - Advanced: (74758, 28)
✓ After mask - Category: (74758, 13)

COMBINING FEATURES
✓ Enhanced train features: (74758, 5156)
✓ Enhanced test features: (75000, 5156)
✓ Total features: 5156 (was 5,109, added 47)

TRAINING LIGHTGBM WITH ENHANCED FEATURES
Train: (59806, 5156), Val: (14952, 5156)

Starting training...
