# In [None]:
# Step 3: IMPROVED MULTIMODAL with Feature Weighting
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
import gc
import warnings
warnings.filterwarnings('ignore')

# -----------------------------
# 1. Load data
# -----------------------------
# Both cleaned tables are read from the mounted Drive folder.
print("üì• Loading data...")
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train_cleaned.csv',
        '/content/drive/MyDrive/test_cleaned.csv',
    )
)

print(f"üìä Train shape: {train.shape}, Test shape: {test.shape}")

# -----------------------------
# 2. Feature extraction, one modality at a time
# -----------------------------
target_col = 'price'

print("üéØ Extracting multimodal features with separate scaling...")

# A. TEXT FEATURES
# TF-IDF (30 terms, English stop words removed) over the cleaned catalog
# text, reduced to 8 dense components with a truncated SVD. Missing text is
# replaced by an empty string.
# NOTE(review): the vectorizer and the SVD are fitted on train and test rows
# stacked together, whereas the scalers later in this script are fitted on
# train only -- confirm that this difference is intended.
print("üî§ Processing text features...")
text_column = 'catalog_content_clean'
all_texts = [
    *train[text_column].fillna(''),
    *test[text_column].fillna(''),
]

tfidf = TfidfVectorizer(max_features=30, stop_words='english')
tfidf_matrix = tfidf.fit_transform(all_texts)

svd = TruncatedSVD(n_components=8, random_state=42)
text_features = svd.fit_transform(tfidf_matrix)

# Rows keep the stacking order: train rows first, test rows after them.
n_train_rows = len(train)
train_text = text_features[:n_train_rows]
test_text = text_features[n_train_rows:]

print(f"‚úÖ Text features: {train_text.shape[1]} dimensions")

# B. IMAGE FEATURES
print("üñºÔ∏è Extracting image features...")

def extract_image_features(df):
    """Build hand-crafted numeric features from each row's image URL.

    No image is downloaded; only the text of ``df['image_link']`` is
    inspected. Every value is passed through ``str`` first, so a missing
    link is treated as its string form (e.g. ``'None'`` or ``'nan'``).

    Args:
        df: Table-like object whose ``'image_link'`` entry can be iterated
            (a pandas DataFrame in this script).

    Returns:
        Integer ``numpy.ndarray`` of shape ``(n_rows, 8)``. Columns, in order:
        starts with ``'http'`` (case-sensitive); contains ``'amazon'``,
        ``'media'``, ``'.jpg'``, ``'.png'``, ``'cdn'`` (each checked on the
        lower-cased URL); URL length; number of ``'/'`` characters.
        An input with no rows yields shape ``(0, 8)``.
    """
    substrings = ('amazon', 'media', '.jpg', '.png', 'cdn')
    n_features = len(substrings) + 3

    rows = []
    for url in df['image_link']:
        url_str = str(url)
        url_lower = url_str.lower()  # lower-case once instead of per check
        rows.append(
            [int(url_str.startswith('http'))]
            + [int(token in url_lower) for token in substrings]
            + [len(url_str), url_str.count('/')]
        )

    # The reshape keeps the result 2-D even when there are no rows, so
    # callers can always read ``.shape[1]`` and stack it with other groups.
    return np.array(rows, dtype=int).reshape(-1, n_features)

# Run the URL-based extractor on both splits.
train_image, test_image = (
    extract_image_features(split) for split in (train, test)
)

print(f"‚úÖ Image features: {train_image.shape[1]} dimensions")

# C. BASIC NUMERIC FEATURES
# Only candidate columns present in BOTH tables are used, in the order they
# are listed, so the train and test matrices share one column layout.
print("üî¢ Processing basic features...")
basic_features = ['item_pack_qty', 'catalog_len', 'text_length', 'word_count', 'has_image_url']
shared_columns = set(train.columns) & set(test.columns)
available_basic = [name for name in basic_features if name in shared_columns]

train_basic = train[available_basic].values
test_basic = test[available_basic].values

print(f"‚úÖ Basic features: {train_basic.shape[1]} dimensions")

# -----------------------------
# 3. Scale each feature group separately
# -----------------------------
print("‚öñÔ∏è Scaling features separately by type...")

# One StandardScaler per group. Each is fitted on the train rows only and
# then applied unchanged to the matching test rows.
scaler_basic = StandardScaler().fit(train_basic)
train_basic_scaled = scaler_basic.transform(train_basic)
test_basic_scaled = scaler_basic.transform(test_basic)

scaler_text = StandardScaler().fit(train_text)
train_text_scaled = scaler_text.transform(train_text)
test_text_scaled = scaler_text.transform(test_text)

scaler_image = StandardScaler().fit(train_image)
train_image_scaled = scaler_image.transform(train_image)
test_image_scaled = scaler_image.transform(test_image)

print("‚úÖ Features scaled separately by type")

# -----------------------------
# 4. Weight each feature group
# -----------------------------
print("üéØ Applying feature type weighting...")

# One multiplier per group, applied to every scaled column of that group.
# NOTE(review): the model fitted later in this script is an XGBoost tree
# ensemble; multiplying a column by a positive constant presumably does not
# change the splits available to it -- confirm these weights have any effect.
WEIGHTS = dict(basic=1.0, text=0.9, image=0.7)
w_basic, w_text, w_image = WEIGHTS['basic'], WEIGHTS['text'], WEIGHTS['image']

# Train split
train_basic_weighted = w_basic * train_basic_scaled
train_text_weighted = w_text * train_text_scaled
train_image_weighted = w_image * train_image_scaled

# Test split
test_basic_weighted = w_basic * test_basic_scaled
test_text_weighted = w_text * test_text_scaled
test_image_weighted = w_image * test_image_scaled

print(f"‚úÖ Applied weights - Basic: {WEIGHTS['basic']}, Text: {WEIGHTS['text']}, Image: {WEIGHTS['image']}")

# -----------------------------
# 5. Combine weighted features
# -----------------------------
# Column order is basic, then text, then image for both matrices.
X_combined = np.concatenate(
    (train_basic_weighted, train_text_weighted, train_image_weighted),
    axis=1,
)
X_test_combined = np.concatenate(
    (test_basic_weighted, test_text_weighted, test_image_weighted),
    axis=1,
)

print(f"üìà FINAL WEIGHTED FEATURE DIMENSIONS:")
print(f"   Train: {X_combined.shape}")
print(f"   Test:  {X_test_combined.shape}")
print(f"   Feature breakdown:")
print(f"     - Basic ({WEIGHTS['basic']}x): {train_basic_weighted.shape[1]}")
print(f"     - Text ({WEIGHTS['text']}x):  {train_text_weighted.shape[1]}")
print(f"     - Image ({WEIGHTS['image']}x): {train_image_weighted.shape[1]}")

# Keep the target column before the source tables are dropped.
y = train[target_col]

# Drop every intermediate that is no longer referenced, then ask the
# garbage collector to run.
del train, test
del tfidf_matrix, text_features
del train_basic, test_basic
del train_text, test_text
del train_image, test_image
del train_basic_scaled, test_basic_scaled
del train_text_scaled, test_text_scaled
del train_image_scaled, test_image_scaled
gc.collect()

# -----------------------------
# 6. Log-transform target
# -----------------------------
# When enabled, the model is trained on log1p(price).
use_log_transform = True
if use_log_transform:
    y_log = np.log1p(y)
else:
    y_log = y.copy()

# -----------------------------
# 7. Train/validation split on weighted features
# -----------------------------
# Shuffled 80/20 hold-out with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y_log,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

print(f"üìö Training: {X_train.shape[0]}, Validation: {X_val.shape[0]}")

# -----------------------------
# 8. XGBoost regressor on the weighted features
# -----------------------------
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 8,
    'min_child_weight': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,
    'n_jobs': 1,
    'tree_method': 'hist',
}
model = xgb.XGBRegressor(**xgb_params)

print("üöÄ Training XGBoost with weighted multimodal features...")
model.fit(X_train, y_train)
print("‚úÖ Training completed!")

# The training split is not needed after fitting.
del X_train, y_train
gc.collect()

# -----------------------------
# 9. Validation metrics
# -----------------------------
y_val_pred = model.predict(X_val)

# Metrics are computed on the original price scale, so the log1p transform
# is undone with expm1 whenever it was applied to the target.
y_val_true = np.expm1(y_val) if use_log_transform else y_val
y_val_pred_orig = np.expm1(y_val_pred) if use_log_transform else y_val_pred

mse = mean_squared_error(y_val_true, y_val_pred_orig)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val_true, y_val_pred_orig)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred| + 1e-8))``.
    The ``1e-8`` term keeps the ratio finite when both values of a pair are
    zero.

    Both inputs are converted to float NumPy arrays first, so plain lists and
    tuples are accepted, and pandas objects are compared position by position
    instead of being aligned on their index labels.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The score as a float; 0 when every prediction equals its target.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    denominator = np.abs(actual) + np.abs(predicted) + 1e-8
    return 100 * np.mean(2 * np.abs(predicted - actual) / denominator)

smape_val = smape(y_val_true, y_val_pred_orig)

print(f"üìä VALIDATION RESULTS:")
print(f"   RMSE:  {rmse:.4f}")
print(f"   MAE:   {mae:.4f}")
print(f"   SMAPE: {smape_val:.4f}%")

# -----------------------------
# 10. Feature importance by type
# -----------------------------
print(f"\nüìà FEATURE IMPORTANCE BY TYPE:")

# (type, name) pairs in the same column order as the model input:
# basic columns first, then text components, then image features.
typed_names = (
    [('basic', f'basic_{col}') for col in available_basic]
    + [('text', f'text_{i}') for i in range(train_text_weighted.shape[1])]
    + [('image', f'image_{i}') for i in range(train_image_weighted.shape[1])]
)
feature_names = [name for _, name in typed_names]

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': model.feature_importances_,
    'type': [group for group, _ in typed_names],
})

# Mean importance per feature type, highest first.
type_importance = (
    importance_df
    .groupby('type')['importance']
    .mean()
    .sort_values(ascending=False)
)
print("üîù Average Importance by Feature Type:")
for feature_type, imp in type_importance.items():
    print(f"   {feature_type:6s}: {imp:.4f}")

print("\nüèÜ Top 10 Most Important Features:")
top_features = importance_df.nlargest(10, 'importance')
print(top_features[['feature', 'type', 'importance']].to_string(index=False))

# -----------------------------
# 11. Predict on test set
# -----------------------------
print("üéØ Making test predictions...")
raw_test_preds = model.predict(X_test_combined)

# Undo the log1p target transform when it was used, then floor prices at 0.1.
test_preds = np.clip(
    np.expm1(raw_test_preds) if use_log_transform else raw_test_preds,
    0.1,
    None,
)

print(f"üìä Test predictions - Min: {test_preds.min():.2f}, Max: {test_preds.max():.2f}")

# -----------------------------
# 12. Create submission
# -----------------------------
print("üíæ Creating submission file...")
# The test table was deleted earlier in the script, so only the id column
# is read back from disk here.
test_ids = pd.read_csv(
    '/content/drive/MyDrive/test_cleaned.csv',
    usecols=['sample_id'],
)

submission = pd.DataFrame(
    {
        'sample_id': test_ids['sample_id'],
        'price': test_preds,
    }
)

print(f"‚úÖ SUBMISSION VALIDATION:")
print(f"   Samples: {len(submission)}")
print(f"   Price range: ${submission['price'].min():.2f} to ${submission['price'].max():.2f}")

# Written to the current working directory, without the index column.
submission_file = 'weighted_multimodal_submission.csv'
submission.to_csv(submission_file, index=False)
print(f"üéâ SUBMISSION SAVED: {submission_file}")

print("\nüéâ IMPROVED MULTIMODAL PIPELINE COMPLETED!")
print("üöÄ Feature weighting + Separate scaling = Better performance!")