In [2]:
# Run this on your local laptop
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Stage 1: read the raw train/test tables ---------------------------------
print("Loading data...")
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# --- Stage 2: TF-IDF over the `catalog_content` column -----------------------
print("Extracting TF-IDF features...")

# Missing catalog text is replaced by an empty string once per split, and the
# same cleaned series is reused for both fitting and transforming.
train_text = train_df['catalog_content'].fillna('')
test_text = test_df['catalog_content'].fillna('')

# Vectorizer hyper-parameters, gathered in one place.
# NOTE(review): the recorded output of this cell lists 'br' and 'br br' among
# the learned features, which looks like leftover HTML line-break markup in
# catalog_content -- consider stripping tags before vectorizing.
tfidf_params = dict(
    max_features=150,        # size of the vocabulary that is kept
    ngram_range=(1, 2),      # single words and two-word phrases
    min_df=5,                # lower document-frequency cut-off
    max_df=0.85,             # upper document-frequency cut-off
    stop_words='english',
    sublinear_tf=True,       # log-scaled term frequency
)
tfidf = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned from train and test text together.
all_text = pd.concat([train_text, test_text])
tfidf.fit(all_text)

# Project each split onto the fitted vocabulary and densify the result.
train_tfidf, test_tfidf = (
    tfidf.transform(split_text).toarray()
    for split_text in (train_text, test_text)
)

print(f"TF-IDF features extracted: {train_tfidf.shape}")

# Apply outlier mask to train (same as before): keep only rows whose
# log1p-transformed target lies within 1.5 * IQR of the quartiles.
# The target array is loaded from its own file, separately from train.csv.
y_train_full = np.load('../data/processed/train_target_final.npy')
y_log = np.log1p(y_train_full)
Q1, Q3 = np.percentile(y_log, [25, 75])
IQR = Q3 - Q1
mask = (y_log >= Q1 - 1.5*IQR) & (y_log <= Q3 + 1.5*IQR)

# The mask is only meaningful if it holds exactly one flag per TF-IDF row.
# Check that explicitly so a stale or mismatched target file is reported with
# a clear message instead of surfacing as an indexing error.
if mask.shape != (train_tfidf.shape[0],):
    raise ValueError(
        f"Outlier mask shape {mask.shape} does not match the "
        f"{train_tfidf.shape[0]} rows of the train TF-IDF matrix; "
        "train_target_final.npy and train.csv appear to be out of sync"
    )

train_tfidf_clean = train_tfidf[mask]

# Save: the train matrix is written after outlier removal, the test matrix
# is written unfiltered.
np.save('../data/processed/train_tfidf_150.npy', train_tfidf_clean)
np.save('../data/processed/test_tfidf_150.npy', test_tfidf)

print("✅ Saved TF-IDF features:")
print(f"   train_tfidf_150.npy: {train_tfidf_clean.shape}")
print(f"   test_tfidf_150.npy: {test_tfidf.shape}")

# Save feature names for analysis.
# get_feature_names_out() returns an object-dtype array; saving that as-is
# makes np.save fall back to pickle, and np.load then refuses to read the file
# unless allow_pickle=True is passed. Converting to a fixed-width unicode array
# keeps the file a plain .npy; allow_pickle=False guards against regressions.
feature_names = tfidf.get_feature_names_out()
np.save(
    '../data/processed/tfidf_feature_names.npy',
    feature_names.astype(str),
    allow_pickle=False,
)

# The slice below is simply the first 20 entries in the vectorizer's feature
# order (alphabetical in the recorded output), not a ranking by TF-IDF weight.
print("✅ Top TF-IDF terms:")
print(feature_names[:20])


Loading data...
Extracting TF-IDF features...
TF-IDF features extracted: (75000, 150)
✅ Saved TF-IDF features:
   train_tfidf_150.npy: (74758, 150)
   test_tfidf_150.npy: (75000, 150)
✅ Top TF-IDF terms:
['10' '100' '12' '16' '24' 'add' 'added' 'artificial' 'bag' 'bags' 'beans'
 'best' 'black' 'black tea' 'blend' 'bottle' 'box' 'br' 'br br' 'bulk']
