In [None]:
!pip install -q pandas numpy scikit-learn scipy lightgbm joblib matplotlib seaborn tqdm requests

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and write
# feature/model artifacts to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
from pathlib import Path

# Root of the challenge resources on the mounted Drive; edit this if your copy lives
# elsewhere (for example '/content/drive/MyDrive/yourfolder').
BASE = '/content/drive/MyDrive/student_resource'
print("Base folder:", BASE)
print("Files in base folder:")
base_entries = sorted(os.listdir(BASE))
for entry in base_entries[:200]:  # cap the listing at 200 names
    print("-", entry)

# The CSV/parquet files are expected in a 'dataset' subfolder of BASE.
DATA_DIR = os.path.join(BASE, 'dataset')
print("\nChecking dataset folder:", DATA_DIR)
if not os.path.exists(DATA_DIR):
    print("dataset/ folder not found. If you uploaded elsewhere, set DATA_DIR to that path.")
else:
    print("Files in dataset/:")
    for entry in sorted(os.listdir(DATA_DIR)):
        print(" -", entry)


Base folder: /content/drive/MyDrive/student_resource
Files in base folder:
- .DS_Store
- Documentation_template.md
- README.md
- dataset
- lgb_fast_fold1.txt
- lgb_fast_fold2.txt
- sample_code.py
- src
- tfidf_baseline.pkl
- tfidf_fast.pkl

Checking dataset folder: /content/drive/MyDrive/student_resource/dataset
Files in dataset/:
 - sample_test.csv
 - sample_test_out.csv
 - test.csv
 - test_features_step3.parquet
 - test_features_step4.parquet
 - test_features_step5.parquet
 - test_with_basic_feats.parquet
 - train.csv
 - train_features_step3.parquet
 - train_features_step4.parquet
 - train_features_step5.parquet
 - train_with_basic_feats.parquet


In [None]:
import pandas as pd
import numpy as np

def ds_info(df, name):
    """Print the shape and column names of `df` under a `name` banner, then show its first 3 rows."""
    print(f"\n== {name} ==")
    print("Shape:", df.shape)
    print("Columns:", list(df.columns))
    print("Sample rows:")
    preview = df.head(3)
    display(preview)


# Location of the raw competition CSVs; change if needed.
DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'
train_path = DATA_DIR + "/train.csv"
test_path = DATA_DIR + "/test.csv"

print("Loading files...")
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print("Loaded.")

# Quick structural overview of both frames.
for frame, label in ((train, "TRAIN"), (test, "TEST")):
    ds_info(frame, label)


Loading files...
Loaded.

== TRAIN ==
Shape: (75000, 4)
Columns: ['sample_id', 'catalog_content', 'image_link', 'price']
Sample rows:


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97



== TEST ==
Shape: (75000, 3)
Columns: ['sample_id', 'catalog_content', 'image_link']
Sample rows:


Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...


In [None]:
# Sanity checks: id uniqueness, target distribution, missing values, train/test id overlap.
train_ids = train['sample_id']
test_ids = test['sample_id']

print("Train sample_id unique:", train_ids.nunique(), " vs rows:", len(train))
dups = train_ids.duplicated().sum()
print("Duplicate sample_id in train:", dups)

# Target distribution and any non-positive prices.
prices = train['price']
print("\nPrice stats (train):")
print(prices.describe())
print("Count of non-positive prices:", prices.le(0).sum())

# Per-column missing-value counts, largest first.
missing_by_col = train.isna().sum().sort_values(ascending=False)
print("\nMissing counts (top columns):")
print(missing_by_col.head(20))

# Informational only: test id uniqueness and how many ids appear in both splits.
shared_ids = set(train_ids) & set(test_ids)
print("\nTest sample_id unique:", test_ids.nunique(), " vs rows:", len(test))
print("Overlap train/test sample_id (should be 0):", len(shared_ids))


Train sample_id unique: 75000  vs rows: 75000
Duplicate sample_id in train: 0

Price stats (train):
count    75000.000000
mean        23.647654
std         33.376932
min          0.130000
25%          6.795000
50%         14.000000
75%         28.625000
max       2796.000000
Name: price, dtype: float64
Count of non-positive prices: 0

Missing counts (top columns):
sample_id          0
catalog_content    0
image_link         0
price              0
dtype: int64

Test sample_id unique: 75000  vs rows: 75000
Overlap train/test sample_id (should be 0): 0


In [None]:
# Cell 1: core feature extraction functions
import re, math
import pandas as pd
import numpy as np

def clean_text(s):
    """Normalise raw catalog text into a lowercase, whitespace-collapsed string.

    Missing values (None/NaN) become "". Otherwise the text is lowercased and then,
    in order: URLs are blanked, curly quotes are mapped to an apostrophe, every
    character outside a-z, 0-9, '.', ',', '-', '/', 'x', '×' and whitespace is
    replaced by a space, and runs of whitespace are collapsed to one space.
    """
    if pd.isna(s):
        return ""
    text = str(s).lower()
    # Applied strictly in this order; each step sees the previous step's output.
    substitutions = (
        (r'https?://\S+', ' '),
        (r'[\u2018\u2019\u201c\u201d]', "'"),
        (r'[^a-z0-9\.\,\-\s/x×]', ' '),  # keep slash and x and × for multipliers
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def extract_ipq(s):
    """Extract the item pack quantity from a product description.

    Patterns are tried in order and the first hit wins:
      1. "pack of 4" / "packs of 4" / "set of 4" / "case of 4"
      2. "4 pack", "4pk", "4 pcs", "4 pieces", "4 count", "4 set"
      3. "4 x 100ml" / "4×100" (leading multiplier)
      4. "4x"
      5. "4-pack" / "4 - pack"

    Form 1 was listed as supported in the original comment but had no pattern, so
    strings like "12 ounce pack of 6" produced NaN. It is checked first because it
    names the pack quantity explicitly.

    Args:
        s: text to search; falsy or missing values yield NaN.

    Returns:
        int quantity, or np.nan when nothing matches.
    """
    # pd.isna first so pd.NA short-circuits instead of raising inside `not s`.
    if pd.isna(s) or not s:
        return np.nan
    text = str(s).lower()
    patterns = (
        r'\b(?:packs?|sets?|cases?)\s+of\s+(\d+)\b',
        r'(\d+)\s*(?:pack|packs|pk|pks|pcs|pieces|count|set)\b',
        r'\b(\d+)\s*[x×]\s*\d',   # "3x100ml" -> 3
        r'\b(\d+)[x×]\b',         # "3x"
        r'\b(\d+) ?- ?pack\b',
    )
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            return int(m.group(1))
    return np.nan

def extract_unit_amount(s):
    """Return (amount, unit) for the first quantity-with-unit found in `s`.

    Examples: "500 ml" -> (500.0, 'ml'), "1.5 l" -> (1.5, 'l'),
    "2kg" -> (2000.0, 'g'), "500 mg" -> (0.5, 'g'), "12.7 ounces" -> (12.7, 'oz').
    Returns (np.nan, None) when `s` is missing/empty or nothing matches.

    Units are normalised to one of 'ml', 'l', 'g', 'oz'. Mass units are expressed
    in grams (kg is multiplied by 1000, mg divided by 1000); litres and millilitres
    are kept as separate units, as in the original docstring example.

    Changes from the previous version:
      * the separate "compact form" branch was unreachable (the first regex already
        allowed zero whitespace), so its kg -> g conversion never ran;
      * 'mg' was relabelled 'g' without rescaling the amount;
      * plural/long spellings ("ounces", "liters", "kilograms", ...) were not matched.
    """
    # pd.isna first so pd.NA short-circuits instead of raising inside `not s`.
    if pd.isna(s) or not s:
        return (np.nan, None)

    # spelling -> (canonical unit, numerator, denominator) applied as amt * num / den
    unit_table = {}
    for names, target in (
        (('ml', 'milliliter', 'milliliters', 'millilitre', 'millilitres'), ('ml', 1.0, 1.0)),
        (('l', 'liter', 'liters', 'litre', 'litres'), ('l', 1.0, 1.0)),
        (('g', 'gram', 'grams'), ('g', 1.0, 1.0)),
        (('kg', 'kilogram', 'kilograms'), ('g', 1000.0, 1.0)),
        (('mg', 'milligram', 'milligrams'), ('g', 1.0, 1000.0)),
        (('oz', 'ounce', 'ounces'), ('oz', 1.0, 1.0)),
    ):
        for name in names:
            unit_table[name] = target

    # Longest spellings first; the trailing \b stops 'g' matching inside e.g. "gallon".
    unit_alternation = '|'.join(sorted(unit_table, key=len, reverse=True))
    m = re.search(r'(\d+(?:\.\d+)?)\s*(' + unit_alternation + r')\b', str(s).lower())
    if not m:
        return (np.nan, None)

    unit, numerator, denominator = unit_table[m.group(2)]
    amt = float(m.group(1)) * numerator / denominator
    return (amt, unit)


def guess_brand(s):
    """Heuristically pick a brand token from a (cleaned) catalog string.

    The catalog text in this dataset begins with an "Item Name:" label, which
    clean_text turns into the tokens "item name". Without stripping that label the
    first token is always "item", which is why the notebook's diagnostics showed
    brand_guess == 'item' and brand_freq == 1.0 for every row. The label is removed
    here before the brand token is chosen.

    Selection rule (unchanged): use the first token if it starts with a letter and
    is made of letters, digits or hyphens (length >= 2); otherwise fall back to the
    first of the leading five tokens that contains a letter.

    Args:
        s: text to inspect; non-string or empty input yields "".

    Returns:
        The guessed brand token, or "" if none can be determined.
    """
    if not isinstance(s, str) or not s:
        return ""
    # Drop a leading "item name" / "Item Name:" label so it is not mistaken for the brand.
    text = re.sub(r'^\s*item\s+name\b\s*:?\s*', '', s, flags=re.IGNORECASE)
    toks = text.split()
    if not toks:
        return ""
    # choose first token if alphabetic-like and not a number
    first = toks[0]
    if re.match(r'^[a-zA-Z][a-zA-Z0-9\-]+$', first):
        return first
    # fallback: first token with letters
    for t in toks[:5]:
        if re.search('[a-zA-Z]', t):
            return t
    return ""

In [None]:
# Cell 2: derive the basic text/quantity features for train and test, then persist them.
DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'  # change if your path differs
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test  = pd.read_csv(f"{DATA_DIR}/test.csv")


def _add_basic_features(df):
    """Add the cleaned text and simple count/quantity/brand columns to `df` in place."""
    cleaned = df['catalog_content'].fillna('').map(clean_text)
    df['catalog_clean'] = cleaned
    df['text_len'] = cleaned.str.len()
    df['num_tokens'] = cleaned.str.split().map(len)
    df['num_digits'] = cleaned.str.count(r'\d')
    # 1 when the image link is a non-blank string, else 0.
    df['has_image'] = (df['image_link'].fillna('').str.strip() != '').astype(int)
    df['ipq'] = cleaned.map(extract_ipq)
    # Each (amount, unit) tuple is expanded into two columns.
    df[['unit_amount','unit_type']] = cleaned.apply(extract_unit_amount).apply(pd.Series)
    df['brand_guess'] = cleaned.map(guess_brand)


for df in (train, test):
    _add_basic_features(df)

# Share of train rows for which each heuristic produced a value.
ipq_coverage = train['ipq'].notna().mean()
unit_coverage = train['unit_amount'].notna().mean()
brand_coverage = (train['brand_guess'] != "").mean()
print("IPQ coverage (train):", ipq_coverage)
print("Unit amount coverage (train):", unit_coverage)
print("Brand non-empty (train):", brand_coverage)

# Persist so later cells can resume from disk.
train.to_parquet(f"{DATA_DIR}/train_with_basic_feats.parquet", index=False)
test.to_parquet(f"{DATA_DIR}/test_with_basic_feats.parquet", index=False)
print("Saved files to dataset/ as parquet: train_with_basic_feats.parquet, test_with_basic_feats.parquet")

IPQ coverage (train): 0.25176
Unit amount coverage (train): 0.66228
Brand non-empty (train): 1.0
Saved files to dataset/ as parquet: train_with_basic_feats.parquet, test_with_basic_feats.parquet


In [None]:
# Cell 3: brand frequency and OOF-safe brand mean price
from sklearn.model_selection import KFold
DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'
train = pd.read_parquet(f"{DATA_DIR}/train_with_basic_feats.parquet")
test  = pd.read_parquet(f"{DATA_DIR}/test_with_basic_feats.parquet")

# brand frequency mapping (global): share of train rows carrying each brand_guess.
# This uses no target information, so it is computed once on the full train set.
brand_freq = train['brand_guess'].value_counts(normalize=True)
train['brand_freq'] = train['brand_guess'].map(brand_freq).fillna(0)
test['brand_freq']  = test['brand_guess'].map(brand_freq).fillna(0)

# OOF brand mean price: each row's value comes only from the other folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train['brand_mean_oof'] = np.nan
# Full-train mean: used as the fallback for test rows only.
global_mean = train['price'].mean()
oof_col = train.columns.get_loc('brand_mean_oof')
for tr_idx, val_idx in kf.split(train):
    tr = train.iloc[tr_idx]
    means = tr.groupby('brand_guess')['price'].mean()
    # Brands unseen in the training folds fall back to the training-fold mean.
    # The previous fallback (global_mean) included the validation rows' own
    # prices, which is a small target leak in a feature meant to be OOF-safe.
    fold_mean = tr['price'].mean()
    train.iloc[val_idx, oof_col] = train.iloc[val_idx]['brand_guess'].map(means).fillna(fold_mean)

# For test: compute brand mean on full train
brand_mean_full = train.groupby('brand_guess')['price'].mean()
test['brand_mean'] = test['brand_guess'].map(brand_mean_full).fillna(global_mean)

# Diagnostics
print("Train brand_mean_oof nulls:", train['brand_mean_oof'].isna().sum())
print("Example brand mean (train):")
display(train[['brand_guess','brand_freq','brand_mean_oof']].head(10))

# Save back
train.to_parquet(f"{DATA_DIR}/train_features_step3.parquet", index=False)
test.to_parquet(f"{DATA_DIR}/test_features_step3.parquet", index=False)
print("Saved train_features_step3.parquet and test_features_step3.parquet")


Train brand_mean_oof nulls: 0
Example brand mean (train):


Unnamed: 0,brand_guess,brand_freq,brand_mean_oof
0,item,1.0,23.709702
1,item,1.0,23.709702
2,item,1.0,23.631827
3,item,1.0,23.709702
4,item,1.0,23.598634
5,item,1.0,23.631827
6,item,1.0,23.620979
7,item,1.0,23.620979
8,item,1.0,23.677128
9,item,1.0,23.709702


Saved train_features_step3.parquet and test_features_step3.parquet


In [None]:
# Cell 4: top-K token OOF mean features
from collections import Counter
from sklearn.model_selection import KFold
DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'
train = pd.read_parquet(f"{DATA_DIR}/train_features_step3.parquet")
test  = pd.read_parquet(f"{DATA_DIR}/test_features_step3.parquet")

# Fallback value for rows with no top-K token. Computed here because this cell
# reloads its inputs from parquet; relying on the variable left behind by the
# brand-mean cell raises NameError when resuming on a fresh kernel.
global_mean = train['price'].mean()

# Tokenize (simple whitespace)
train['tokens'] = train['catalog_clean'].str.split()
test['tokens']  = test['catalog_clean'].str.split()

# top-K tokens by frequency
all_tokens = train['tokens'].explode().value_counts()
K = 2000
topk = set(all_tokens.index[:K].tolist())
print("Top-K tokens count:", len(topk))


def _make_avg_token_mean(token_means):
    """Build a row-level function averaging `token_means` over a row's known tokens.

    `token_means` is a Series mapping token -> mean price. The returned function
    yields NaN for non-list input or when none of the row's tokens are in the map.
    """
    lookup = token_means.to_dict()  # plain dict: one hash lookup per token

    def _avg(tokens):
        if not isinstance(tokens, list):
            return np.nan
        vals = [lookup[t] for t in tokens if t in lookup]
        if len(vals) == 0:
            return np.nan
        return float(np.nanmean(vals))

    return _avg


# For each validation fold, token means are computed on the other folds only; each
# row then gets the average of the token means of its tokens that are in topk.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train['token_mean_topk_avg_oof'] = np.nan
oof_col = train.columns.get_loc('token_mean_topk_avg_oof')

for tr_idx, val_idx in kf.split(train):
    tr = train.iloc[tr_idx]
    # compute mean price per token on tr for tokens in topk
    exploded = tr[['tokens','price']].explode('tokens')
    exploded = exploded[exploded['tokens'].isin(topk)]
    token_price_mean = exploded.groupby('tokens')['price'].mean()
    avg_token_mean = _make_avg_token_mean(token_price_mean)
    train.iloc[val_idx, oof_col] = train.iloc[val_idx]['tokens'].map(avg_token_mean).fillna(global_mean)

# For test, use full-train token means
exploded_full = train[['tokens','price']].explode('tokens')
exploded_full = exploded_full[exploded_full['tokens'].isin(topk)]
token_price_mean_full = exploded_full.groupby('tokens')['price'].mean()
avg_token_mean_test = _make_avg_token_mean(token_price_mean_full)
test['token_mean_topk_avg'] = test['tokens'].map(avg_token_mean_test).fillna(global_mean)
# Fill any remaining NaNs in train
train['token_mean_topk_avg_oof'] = train['token_mean_topk_avg_oof'].fillna(global_mean)

# Diagnostics
print("Train token_mean_topk_avg_oof nulls:", train['token_mean_topk_avg_oof'].isna().sum())
print("Test token_mean_topk_avg nulls:", test['token_mean_topk_avg'].isna().sum())
display(train[['sample_id','catalog_clean','token_mean_topk_avg_oof']].head(8))

# Save
train.to_parquet(f"{DATA_DIR}/train_features_step4.parquet", index=False)
test.to_parquet(f"{DATA_DIR}/test_features_step4.parquet", index=False)
print("Saved train_features_step4.parquet and test_features_step4.parquet")


Top-K tokens count: 2000
Train token_mean_topk_avg_oof nulls: 0
Test token_mean_topk_avg nulls: 0


Unnamed: 0,sample_id,catalog_clean,token_mean_topk_avg_oof
0,33127,"item name la victoria green taco sauce mild, 1...",22.612822
1,198967,"item name salerno cookies, the original butter...",25.125791
2,261251,"item name bear creek hearty soup bowl, creamy ...",24.696959
3,55858,item name judee s blue cheese powder 11.25 oz ...,26.949572
4,292686,"item name kedem sherry cooking wine, 12.7 ounc...",32.130011
5,9259,"item name member s mark member s mark, basil, ...",25.675648
6,191846,item name goya foods sazonador total seasoning...,25.650534
7,222007,item name vineco original series chilean sauvi...,29.059199


Saved train_features_step4.parquet and test_features_step4.parquet


In [None]:
# Cell 5: price_per_unit derivation and aggregated OOF mean per brand
# Inputs are re-read from the step-4 parquet files written by the token-mean cell,
# replacing whatever `train`/`test` frames are currently in memory.
DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'
train = pd.read_parquet(f"{DATA_DIR}/train_features_step4.parquet")
test  = pd.read_parquet(f"{DATA_DIR}/test_features_step4.parquet")

# raw price_per_unit where ipq and unit_amount present
def compute_units_total(row):
    """Return total units for a row as ipq * unit_amount.

    `row` is anything with a dict-style `.get` (a DataFrame row Series or a dict).
    A missing factor is treated as 1.0 when the other one is present; if both are
    missing the result is NaN. Values that cannot be converted to float also yield
    NaN.

    Only TypeError/ValueError are caught: the previous bare `except:` also
    swallowed KeyboardInterrupt/SystemExit, so a long `DataFrame.apply` could not
    be interrupted cleanly.
    """
    # attempt: total units = ipq * unit_amount (if both present)
    ipq = row.get('ipq', np.nan)
    amt = row.get('unit_amount', np.nan)
    if pd.isna(ipq) and pd.isna(amt):
        return np.nan
    if pd.isna(ipq):
        ipq = 1.0
    if pd.isna(amt):
        amt = 1.0
    try:
        return float(ipq) * float(amt)
    except (TypeError, ValueError):
        return np.nan

# Total units per row for both frames (ipq * unit_amount, see compute_units_total).
for frame in (train, test):
    frame['units_total'] = frame.apply(compute_units_total, axis=1)

# price / units_total where units_total is present and positive, NaN otherwise.
# Train only: it needs the target.
has_valid_units = train['units_total'].notna() & train['units_total'].gt(0)
train['price_per_unit'] = np.where(has_valid_units, train['price'] / train['units_total'], np.nan)

print("Price per unit coverage (train):", train['price_per_unit'].notna().mean())
preview_cols = ['sample_id','catalog_clean','ipq','unit_amount','units_total','price','price_per_unit']
display(train[preview_cols].head(8))

# OOF aggregated price_per_unit per brand (safe): each validation row is encoded
# from the other folds; brands unseen there stay NaN.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train['brand_ppu_oof'] = np.nan
ppu_oof_col = train.columns.get_loc('brand_ppu_oof')
for tr_idx, val_idx in kf.split(train):
    tr = train.iloc[tr_idx]
    # use only rows with price_per_unit not null
    grp = tr.dropna(subset=['price_per_unit']).groupby('brand_guess')['price_per_unit'].mean()
    train.iloc[val_idx, ppu_oof_col] = train.iloc[val_idx]['brand_guess'].map(grp)

# For test: compute brand ppu from full train
brand_ppu_full = train.dropna(subset=['price_per_unit']).groupby('brand_guess')['price_per_unit'].mean()
test['brand_ppu'] = test['brand_guess'].map(brand_ppu_full)

# Save
train.to_parquet(f"{DATA_DIR}/train_features_step5.parquet", index=False)
test.to_parquet(f"{DATA_DIR}/test_features_step5.parquet", index=False)
print("Saved train_features_step5.parquet and test_features_step5.parquet")

Price per unit coverage (train): 0.7493066666666667


Unnamed: 0,sample_id,catalog_clean,ipq,unit_amount,units_total,price,price_per_unit
0,33127,"item name la victoria green taco sauce mild, 1...",,12.0,12.0,4.89,0.4075
1,198967,"item name salerno cookies, the original butter...",,8.0,8.0,13.12,1.64
2,261251,"item name bear creek hearty soup bowl, creamy ...",,1.9,1.9,1.97,1.036842
3,55858,item name judee s blue cheese powder 11.25 oz ...,,11.25,11.25,30.34,2.696889
4,292686,"item name kedem sherry cooking wine, 12.7 ounc...",,12.7,12.7,66.49,5.235433
5,9259,"item name member s mark member s mark, basil, ...",,6.25,6.25,18.5,2.96
6,191846,item name goya foods sazonador total seasoning...,5.0,30.0,150.0,5.99,0.039933
7,222007,item name vineco original series chilean sauvi...,,8.0,8.0,94.0,11.75


Saved train_features_step5.parquet and test_features_step5.parquet


In [None]:
# FAST baseline (smaller TF-IDF + lighter LightGBM) — run this to get quick OOF SMAPE
import os, joblib, gc, time
import numpy as np, pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

def smape_pct(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 2);
    elements where that denominator is zero contribute 0. Inputs are converted to
    float arrays, so lists and Series are accepted.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    scale = (np.abs(actual) + np.abs(forecast)) / 2.0
    abs_diff = np.abs(actual - forecast)
    # Divide only where the scale is non-zero; the remaining slots keep their 0.
    ratio = np.divide(abs_diff, scale, out=np.zeros_like(scale), where=scale != 0)
    return np.mean(ratio) * 100.0

DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'
MODEL_DIR = '/content/drive/MyDrive/student_resource'
os.makedirs(MODEL_DIR, exist_ok=True)

# load feature parquet (prepared earlier)
train = pd.read_parquet(os.path.join(DATA_DIR, 'train_features_step5.parquet'))
test  = pd.read_parquet(os.path.join(DATA_DIR, 'test_features_step5.parquet'))
# Price deciles, used only to stratify the CV folds.
train['price_bucket'] = pd.qcut(train['price'], q=10, labels=False, duplicates='drop')

# TF-IDF smaller
TFV_MAX_FEAT = 10000   # <<<<< faster
tfidf = TfidfVectorizer(max_features=TFV_MAX_FEAT, ngram_range=(1,2), min_df=5)
X_text = tfidf.fit_transform(train['catalog_clean'].fillna(''))
X_text_test = tfidf.transform(test['catalog_clean'].fillna(''))
joblib.dump(tfidf, os.path.join(MODEL_DIR, 'tfidf_fast.pkl'))
print("TF-IDF shape:", X_text.shape)

# numeric features mapping (same as before): OOF columns in train correspond to
# the full-train-fitted columns of the same feature in test.
train_to_test_name = {'brand_mean_oof':'brand_mean', 'token_mean_topk_avg_oof':'token_mean_topk_avg', 'brand_ppu_oof':'brand_ppu'}
candidate_nums = ['ipq','text_len','num_tokens','num_digits','has_image','brand_freq','brand_mean_oof','token_mean_topk_avg_oof','brand_ppu_oof']
train_num_cols, test_num_cols = [], []
for c in candidate_nums:
    test_name = train_to_test_name.get(c, c)
    # Keep a feature only if both frames have it, so the matrices stay column-aligned.
    if c in train.columns and test_name in test.columns:
        train_num_cols.append(c); test_num_cols.append(test_name)
print("Using numeric cols:", train_num_cols)

# Missing numeric values are encoded as -1.
X_num = train[train_num_cols].fillna(-1).astype(float).values
X_num_test = test[test_num_cols].fillna(-1).astype(float).values

# Request CSR explicitly: the folds below index rows (X_full[tr_idx]), which needs a
# row-indexable format rather than whatever hstack picks by default.
X_full = hstack([X_text, csr_matrix(X_num)], format='csr')
X_full_test = hstack([X_text_test, csr_matrix(X_num_test)], format='csr')
y = train['price'].values

# LGB faster settings
# NOTE(review): this trains on raw price with an L2 objective, so predictions are
# unconstrained; the error-analysis output later in the notebook lists negative
# predictions among the worst rows. The log-target cell addresses this.
lgb_params_fast = {
    'objective':'regression','metric':'mae','learning_rate':0.07,
    'num_leaves':64,'min_data_in_leaf':80,'verbosity':-1,'seed':42
}
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# oof: out-of-fold prediction per train row; preds_test: mean of the fold models.
oof = np.zeros(len(train)); preds_test = np.zeros(X_full_test.shape[0])
for fold, (tr_idx, val_idx) in enumerate(skf.split(train, train['price_bucket']), start=1):
    print(f"\n--- Fold {fold} ---")
    X_tr, X_val = X_full[tr_idx], X_full[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]
    dtrain = lgb.Dataset(X_tr, label=y_tr); dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    model = lgb.train(lgb_params_fast, dtrain, num_boost_round=1200,
                      valid_sets=[dtrain,dval],
                      callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(200)])
    oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    preds_test += model.predict(X_full_test, num_iteration=model.best_iteration)/N_FOLDS
    model.save_model(os.path.join(MODEL_DIR, f"lgb_fast_fold{fold}.txt"))
    gc.collect()

print("\nFAST OOF SMAPE (%):", smape_pct(y, oof))
print("FAST OOF MAE:", np.mean(np.abs(y - oof)))
np.save(os.path.join(MODEL_DIR,'oof_fast.npy'), oof)
np.save(os.path.join(MODEL_DIR,'test_preds_fast.npy'), preds_test)
print("Saved fast models to", MODEL_DIR)


TF-IDF shape: (75000, 10000)
Using numeric cols: ['ipq', 'text_len', 'num_tokens', 'num_digits', 'has_image', 'brand_freq', 'brand_mean_oof', 'token_mean_topk_avg_oof', 'brand_ppu_oof']

--- Fold 1 ---
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 11.1305	valid_1's l1: 13.3661
[400]	training's l1: 10.1235	valid_1's l1: 13.2396
Early stopping, best iteration is:
[540]	training's l1: 9.58522	valid_1's l1: 13.204

--- Fold 2 ---
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 11.3288	valid_1's l1: 12.9574
[400]	training's l1: 10.3181	valid_1's l1: 12.8835
Early stopping, best iteration is:
[422]	training's l1: 10.2271	valid_1's l1: 12.8788

--- Fold 3 ---
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 11.3614	valid_1's l1: 12.8388
[400]	training's l1: 10.3293	valid_1's l1: 12.8076
Early stopping, best iteration is:
[367]	training's l1: 10.4611	valid_1's l1: 12.796

--- Fold 4 ---
Train

In [None]:
import pandas as pd, numpy as np

# Error analysis of the fast baseline: join the saved OOF predictions back onto the
# training frame and break the SMAPE terms down by price decile.
train = pd.read_parquet('/content/drive/MyDrive/student_resource/dataset/train_features_step5.parquet')
oof = np.load('/content/drive/MyDrive/student_resource/oof_fast.npy')
train['pred'] = oof
train['error'] = train['price'] - train['pred']
train['abs_error'] = train['error'].abs()
# Per-row SMAPE term: |error| / mean(|price|, |pred|).
train['rel_err'] = train['abs_error'] / ((train['price'].abs() + train['pred'].abs()) / 2)
# Interval-labelled deciles (categorical), for a readable breakdown.
train['price_bucket'] = pd.qcut(train['price'], 10, duplicates='drop')

print("Mean SMAPE:", train['rel_err'].mean()*100)
print("\nError by price range:")
# observed=False spells out the behaviour this groupby already had for a categorical
# key, and avoids the pandas FutureWarning about the changing default.
print(train.groupby('price_bucket', observed=False)['rel_err'].mean()*100)

print("\nTop 10 worst predictions:")
worst = train.sort_values('rel_err', ascending=False).head(10)
print(worst[['sample_id','price','pred','catalog_clean']])


Mean SMAPE: 60.39059215792726

Error by price range:
price_bucket
(0.129, 3.565]      125.208173
(3.565, 5.6]         91.453660
(5.6, 7.99]          71.271535
(7.99, 10.78]        53.630011
(10.78, 14.0]        44.755826
(14.0, 18.45]        39.723940
(18.45, 24.69]       36.215570
(24.69, 33.891]      38.060316
(33.891, 52.301]     46.237164
(52.301, 2796.0]     56.982175
Name: rel_err, dtype: float64

Top 10 worst predictions:
       sample_id  price       pred  \
74965     120313  9.990  -1.200244   
7663       79318  1.530  -0.394049   
7760      285552  2.005  -0.894728   
7557       68296  1.360 -12.144212   
55912      52380  1.840  -0.066913   
55924     257838  3.890  -5.184625   
36882      95931  1.540  -2.872465   
37628      61558  1.590  -1.860430   
74484     276510  1.240  -2.250351   
55777     188784  4.510  -0.402071   

                                           catalog_clean  
74965  item name shan chapli kabab recipe and seasoni...  
7663   item name mccormick glu

  print(train.groupby('price_bucket')['rel_err'].mean()*100)


In [None]:
import numpy as np, pandas as pd, lightgbm as lgb, joblib, gc
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 2);
    elements where that denominator is zero contribute 0.

    Inputs are converted to float arrays first, matching smape_pct. Without that,
    plain lists failed on the boolean-mask indexing, and pandas Series relied on
    label-based indexing happening to line up with positions.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    mask = denom != 0
    res = np.zeros_like(denom)
    res[mask] = np.abs(y_true[mask] - y_pred[mask]) / denom[mask]
    return np.mean(res) * 100

DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'
# Folder on the mounted Drive where the fold models are written.
MODEL_DIR = '/content/drive/MyDrive/student_resource'
train = pd.read_parquet(f"{DATA_DIR}/train_features_step5.parquet")
# Price deciles, used only to stratify the CV folds.
train['price_bucket'] = pd.qcut(train['price'], 10, labels=False, duplicates='drop')

# Target transformation: the model is trained on log1p(price).
y_log = np.log1p(train['price'].values)

# TF-IDF smaller for fast iteration
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1,2), min_df=5)
X_text = tfidf.fit_transform(train['catalog_clean'])
joblib.dump(tfidf, '/content/tfidf_log.pkl')
# Also keep the fitted vectorizer next to the fold models: the models are only
# usable with the exact vectorizer they were trained on, and previously it was
# written to /content only, outside the Drive folder that holds the models.
joblib.dump(tfidf, f"{MODEL_DIR}/tfidf_log.pkl")

# numeric features
num_cols = ['ipq','text_len','num_tokens','num_digits','has_image','brand_freq',
            'brand_mean_oof','token_mean_topk_avg_oof','brand_ppu_oof']
num_cols = [c for c in num_cols if c in train.columns]
# Missing numeric values are encoded as -1.
X_num = train[num_cols].fillna(-1).astype(float).values
# CSR requested explicitly because the folds index rows (X[tr], X[val]).
X = hstack([X_text, csr_matrix(X_num)], format='csr')

params = {
    'objective': 'regression',
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'min_data_in_leaf': 50,
    'verbosity': -1,
    'seed': 42
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_pred = np.zeros(len(train))  # out-of-fold predictions, on the log1p scale
for fold, (tr, val) in enumerate(skf.split(X, train['price_bucket']), 1):
    print(f"\n--- Fold {fold} ---")
    dtr = lgb.Dataset(X[tr], label=y_log[tr])
    dval = lgb.Dataset(X[val], label=y_log[val], reference=dtr)
    model = lgb.train(params, dtr, num_boost_round=2000,
                      valid_sets=[dtr,dval],
                      callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)])
    oof_pred[val] = model.predict(X[val], num_iteration=model.best_iteration)
    model.save_model(f"{MODEL_DIR}/lgb_log_fold{fold}.txt")
    gc.collect()

# Persist the log-scale OOF predictions. The follow-up analysis cell looks for
# /content/oof_log.npy, which nothing wrote before; a copy also goes next to the
# models.
np.save('/content/oof_log.npy', oof_pred)
np.save(f"{MODEL_DIR}/oof_log.npy", oof_pred)

# Convert back to price scale
oof_price = np.expm1(oof_pred)
# SMAPE is computed on predictions floored at 0.01; MAE uses the unclipped values.
smape_val = smape(train['price'], np.clip(oof_price, 0.01, None))
mae_val = np.mean(np.abs(train['price'] - oof_price))
print(f"\n✅ LOG-Target OOF SMAPE: {smape_val:.4f}")
print(f"✅ LOG-Target OOF MAE: {mae_val:.4f}")


--- Fold 1 ---
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.480057	valid_1's l1: 0.536743
[400]	training's l1: 0.431779	valid_1's l1: 0.524565
[600]	training's l1: 0.398756	valid_1's l1: 0.519867
[800]	training's l1: 0.372245	valid_1's l1: 0.517296
[1000]	training's l1: 0.35005	valid_1's l1: 0.515764
[1200]	training's l1: 0.331087	valid_1's l1: 0.515248
Early stopping, best iteration is:
[1245]	training's l1: 0.327079	valid_1's l1: 0.515094

--- Fold 2 ---
Training until validation scores don't improve for 100 rounds
[200]	training's l1: 0.481821	valid_1's l1: 0.531073
[400]	training's l1: 0.433792	valid_1's l1: 0.52003
[600]	training's l1: 0.40012	valid_1's l1: 0.516017
[800]	training's l1: 0.373517	valid_1's l1: 0.513704
[1000]	training's l1: 0.351311	valid_1's l1: 0.512835
[1200]	training's l1: 0.331946	valid_1's l1: 0.512266
Early stopping, best iteration is:
[1282]	training's l1: 0.32457	valid_1's l1: 0.51208

--- Fold 3 ---
Training until 

In [None]:
import pandas as pd, numpy as np
import os # Import the os module
train = pd.read_parquet('/content/drive/MyDrive/student_resource/dataset/train_features_step5.parquet')
oof_log = np.load('/content/oof_log.npy') if os.path.exists('/content/oof_log.npy') else None  # optional if you saved it

# Pick the log-scale OOF predictions: prefer the array still in kernel memory (the
# original behaviour), otherwise use the saved file. Previously the loaded file was
# never used and the cell raised a bare NameError on a fresh kernel.
if 'oof_pred' in globals():
    oof_log_preds = oof_pred
elif oof_log is not None:
    oof_log_preds = oof_log
else:
    raise NameError("No OOF predictions available: run the log-target training cell "
                    "or save its predictions to /content/oof_log.npy")
if len(oof_log_preds) != len(train):
    raise ValueError(f"OOF predictions have {len(oof_log_preds)} rows but train has {len(train)}")

# Despite its name, 'pred_log' holds predictions converted back to the price scale.
train['pred_log'] = np.expm1(oof_log_preds)
# Per-row SMAPE term: |price - pred| / mean(|price|, |pred|).
train['rel_err'] = np.abs(train['price'] - train['pred_log']) / ((train['price'].abs() + train['pred_log'].abs()) / 2)
train['price_bucket'] = pd.qcut(train['price'], 10, duplicates='drop')
# observed=False spells out the behaviour this groupby already had for a categorical
# key, and avoids the pandas FutureWarning about the changing default.
print(train.groupby('price_bucket', observed=False)['rel_err'].mean()*100)

price_bucket
(0.129, 3.565]      91.069367
(3.565, 5.6]        61.448891
(5.6, 7.99]         46.585576
(7.99, 10.78]       36.223069
(10.78, 14.0]       33.718445
(14.0, 18.45]       34.047098
(18.45, 24.69]      37.441463
(24.69, 33.891]     46.639423
(33.891, 52.301]    57.988496
(52.301, 2796.0]    69.899908
Name: rel_err, dtype: float64


  print(train.groupby('price_bucket')['rel_err'].mean()*100)


In [None]:
# FAST HOLDOUT specialist test (single split, small TF-IDF, light LGB)
import os, gc, joblib, numpy as np, pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from scipy.optimize import minimize

# Single-split experiment setup: build features, hold out 20% of rows, and train the
# "main" log-target model whose validation predictions the specialists fall back to.
DATA_DIR = '/content/drive/MyDrive/student_resource/dataset'
OUT_DIR = '/content/drive/MyDrive/student_resource/fast_specialist'
os.makedirs(OUT_DIR, exist_ok=True)

train = pd.read_parquet(os.path.join(DATA_DIR, 'train_features_step5.parquet'))

# Fast TF-IDF sizes
TF_WORD = 8000   # small
TF_CHAR = 1000   # small
tf_word = TfidfVectorizer(max_features=TF_WORD, ngram_range=(1,2), min_df=5)
tf_char = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), max_features=TF_CHAR, min_df=10)

# NOTE(review): both vectorizers are fitted on every train row, i.e. before the
# train/validation split below, so validation text contributes to the vocabulary/IDF.
Xw = tf_word.fit_transform(train['catalog_clean'].fillna(''))
Xc = tf_char.fit_transform(train['catalog_clean'].fillna(''))
joblib.dump(tf_word, os.path.join(OUT_DIR,'tf_word_fast.pkl'))
joblib.dump(tf_char, os.path.join(OUT_DIR,'tf_char_fast.pkl'))

# numeric features (same as before); missing values are encoded as -1
num_cols = [c for c in ['ipq','text_len','num_tokens','num_digits','has_image',
                        'brand_freq','brand_mean_oof','token_mean_topk_avg_oof','brand_ppu_oof'] if c in train.columns]
X_num = train[num_cols].fillna(-1).astype(float).values
# Column layout: word TF-IDF | char TF-IDF | numeric features.
X_full = hstack([Xw, Xc, csr_matrix(X_num)])

# split once: 80/20 split of row positions, stratified by price decile
train_idx, val_idx = train_test_split(np.arange(len(train)), test_size=0.2, random_state=42, stratify=pd.qcut(train['price'], 10, duplicates='drop'))
print("Train rows:", len(train_idx), "Val rows:", len(val_idx))

# log-target
y_log = np.log1p(train['price'].values)

# Train main on train_idx
dtr = lgb.Dataset(X_full[train_idx], label=y_log[train_idx])
dval = lgb.Dataset(X_full[val_idx], label=y_log[val_idx], reference=dtr)
params = {'objective':'regression','metric':'mae','learning_rate':0.06,'num_leaves':48,'min_data_in_leaf':80,'verbosity':-1,'seed':42}
m_main = lgb.train(params, dtr, num_boost_round=1200, valid_sets=[dtr,dval], callbacks=[lgb.early_stopping(50)])
joblib.dump(m_main, os.path.join(OUT_DIR,'m_main_fast.pkl'))

# main preds (val): predicted on the log1p scale, then mapped back to price
pred_main_val_log = m_main.predict(X_full[val_idx], num_iteration=m_main.best_iteration)
pred_main_val = np.expm1(pred_main_val_log)

# Specialist low: train only on low-price rows from train_idx (no CV)
# NOTE(review): low_val_mask is derived from the TRUE validation prices, so the
# specialist's predictions are used exactly on the rows whose actual price is
# <= LOW_T. That routing needs the target (test.csv has no price column), so any
# validation score built from pred_low_val is optimistic.
LOW_T = 5.0
low_train_mask = (train['price'].values[train_idx] <= LOW_T)
low_val_mask = (train['price'].values[val_idx] <= LOW_T)
# Only train the specialist when both sides of the split have enough low-price rows.
if low_train_mask.sum() >= 50 and low_val_mask.sum() >= 10:
    dtr_low = lgb.Dataset(X_full[train_idx][low_train_mask], label=y_log[train_idx][low_train_mask])
    dval_low = lgb.Dataset(X_full[val_idx][low_val_mask], label=y_log[val_idx][low_val_mask], reference=dtr_low)
    m_low = lgb.train({'objective':'regression','metric':'mae','learning_rate':0.07,'num_leaves':32,'min_data_in_leaf':30,'verbosity':-1,'seed':42},
                      dtr_low, num_boost_round=1000, valid_sets=[dtr_low,dval_low], callbacks=[lgb.early_stopping(40)])
    joblib.dump(m_low, os.path.join(OUT_DIR,'m_low_fast.pkl'))
    # Log-scale specialist predictions; NaN marks validation rows outside the low band.
    pred_low_val_log = np.full(len(val_idx), np.nan)
    pred_low_val_log[low_val_mask] = m_low.predict(X_full[val_idx][low_val_mask], num_iteration=m_low.best_iteration)
    # log1p(pred_main_val) puts the main prediction back on the log scale so a single
    # expm1 yields the main price for NaN rows and the specialist price elsewhere.
    pred_low_val = np.expm1(np.where(np.isnan(pred_low_val_log), np.log1p(pred_main_val), pred_low_val_log))  # fallback to main where specialist missing
else:
    print("Not enough low rows in split for specialist; skipping.")
    pred_low_val = pred_main_val.copy()

# Specialist high: similar (only if enough rows)
# NOTE(review): high_val_mask is derived from the TRUE validation prices, so the
# specialist's predictions are used exactly on the rows whose actual price is
# >= HIGH_T. That routing needs the target (test.csv has no price column), so any
# validation score built from pred_high_val is optimistic.
HIGH_T = 50.0
high_train_mask = (train['price'].values[train_idx] >= HIGH_T)
high_val_mask = (train['price'].values[val_idx] >= HIGH_T)
# Only train the specialist when both sides of the split have enough high-price rows.
if high_train_mask.sum() >= 50 and high_val_mask.sum() >= 10:
    dtr_high = lgb.Dataset(X_full[train_idx][high_train_mask], label=y_log[train_idx][high_train_mask])
    dval_high = lgb.Dataset(X_full[val_idx][high_val_mask], label=y_log[val_idx][high_val_mask], reference=dtr_high)
    m_high = lgb.train({'objective':'regression','metric':'mae','learning_rate':0.06,'num_leaves':32,'min_data_in_leaf':50,'verbosity':-1,'seed':42},
                       dtr_high, num_boost_round=1000, valid_sets=[dtr_high,dval_high], callbacks=[lgb.early_stopping(40)])
    joblib.dump(m_high, os.path.join(OUT_DIR,'m_high_fast.pkl'))
    # Log-scale specialist predictions; NaN marks validation rows outside the high band.
    pred_high_val_log = np.full(len(val_idx), np.nan)
    pred_high_val_log[high_val_mask] = m_high.predict(X_full[val_idx][high_val_mask], num_iteration=m_high.best_iteration)
    # NaN rows fall back to the main model's prediction (round-tripped through log1p/expm1).
    pred_high_val = np.expm1(np.where(np.isnan(pred_high_val_log), np.log1p(pred_main_val), pred_high_val_log))
else:
    print("Not enough high rows in split for specialist; skipping.")
    pred_high_val = pred_main_val.copy()

# Optimize weights on the validation split to minimize SMAPE
y_val = train['price'].values[val_idx]
def smape(a,b):
    """Return the symmetric mean absolute percentage error of ``a`` vs ``b``, in percent.

    Each element contributes ``|a - b| / ((|a| + |b|) / 2)``; elements whose
    denominator is zero contribute 0. The result is the mean over all elements
    multiplied by 100.
    """
    half_sum = (np.abs(a) + np.abs(b)) / 2
    nonzero = half_sum != 0
    ratios = np.zeros_like(half_sum)
    # Divide only where the denominator is non-zero; other slots keep their 0.
    np.divide(np.abs(a - b), half_sum, out=ratios, where=nonzero)
    return ratios.mean() * 100

# Prediction vectors to blend: main model, low specialist, high specialist.
from scipy.optimize import minimize
A, B, C = pred_main_val, pred_low_val, pred_high_val

def loss(w):
    """Return the validation SMAPE of the blend ``w[0]*A + w[1]*B + w[2]*C``."""
    w_main, w_low, w_high = w
    blended = w_main*A + w_low*B + w_high*C
    return smape(y_val, blended)

# Each weight is bounded to [0, 1] and the three weights must sum to 1.
cons = {'type': 'eq', 'fun': lambda x: x.sum() - 1.0}
res = minimize(
    loss,
    x0=np.array([0.7, 0.15, 0.15]),
    method='SLSQP',
    bounds=[(0, 1) for _ in range(3)],
    constraints=cons,
)
w_opt = res.x
print("Weights:", w_opt)
print("Validation SMAPE (blended):", loss(w_opt))


Train rows: 60000 Val rows: 15000
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1200]	training's l1: 0.311827	valid_1's l1: 0.512384
Training until validation scores don't improve for 40 rounds
Early stopping, best iteration is:
[370]	training's l1: 0.105927	valid_1's l1: 0.200051
Training until validation scores don't improve for 40 rounds
Early stopping, best iteration is:
[340]	training's l1: 0.102376	valid_1's l1: 0.246095
Weights: [4.75059618e-14 9.18513151e-01 8.14868495e-02]
Validation SMAPE (blended): 41.21927178348943


In [None]:
#  Auto-detect vectorizer + SVD that matches saved LightGBM fold models
# Then produce submission without retraining.
import os, joblib, json, sys
import numpy as np, pandas as pd
from scipy.sparse import hstack, csr_matrix
import lightgbm as lgb
import re

ROOT = '/content/drive/MyDrive/student_resource'
DATA_DIR = os.path.join(ROOT, 'dataset')
OUT_DIR = os.path.join(ROOT, 'final_submission_auto')
os.makedirs(OUT_DIR, exist_ok=True)

# Model files (use the lgb_log folds you reported).
# Keep only the fold files that actually exist on disk, so the emptiness check
# below is meaningful and later loops never try to open a missing file.
MODEL_PATHS = [
    p for p in (os.path.join(ROOT, f'lgb_log_fold{i}.txt') for i in range(1,6))
    if os.path.exists(p)
]
# pick one model to query feature count
if len(MODEL_PATHS) == 0:
    raise FileNotFoundError("No fold models found in expected path.")
model_check = lgb.Booster(model_file=MODEL_PATHS[0])
model_num_feat = model_check.num_feature()
print("Model expects num_feature() =", model_num_feat)

# candidate vectorizers / SVDs to try (update names you see in Drive)
candidates = {
    'tf_word': os.path.join(ROOT, 'tf_word.pkl'),
    'tf_char': os.path.join(ROOT, 'tf_char.pkl'),
    'tfidf_fast': os.path.join(ROOT, 'tfidf_fast.pkl'),
    'tfidf_baseline': os.path.join(ROOT, 'tfidf_baseline.pkl'),
    'tfidf_log': os.path.join(ROOT, 'tf_log.pkl')  # optional
}
svd_w_path = os.path.join(ROOT, 'svd_w.pkl')
svd_c_path = os.path.join(ROOT, 'svd_c.pkl')

# load test csv (falls back to test_prod.csv when test.csv is absent)
test_path = os.path.join(DATA_DIR, 'test.csv')
if not os.path.exists(test_path):
    test_path = os.path.join(DATA_DIR, 'test_prod.csv')
if not os.path.exists(test_path):
    raise FileNotFoundError("test.csv/test_prod.csv not found in dataset folder.")
test = pd.read_csv(test_path)
test.columns = [c.lower() for c in test.columns]
# rename id/desc: when the expected names are missing, the first column is taken
# as the id and the second as the description text
id_col = 'sample_id' if 'sample_id' in test.columns else test.columns[0]
desc_col = 'catalog_content' if 'catalog_content' in test.columns else test.columns[1]
test = test.rename(columns={id_col:'sample_id', desc_col:'catalog_content'})

# same text cleaning used before
def clean_text(s):
    """Normalise one catalog string.

    Missing values become ``""``. Otherwise the value is stringified and
    lower-cased, URLs are replaced by a space, every character outside the
    allowed set (a-z, 0-9, ``. , - /``, whitespace, ``x``/``×``) is replaced by a
    space, and runs of whitespace are collapsed to one space and trimmed.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).lower()
    without_urls = re.sub(r'https?://\S+', ' ', lowered)
    allowed_only = re.sub(r'[^a-z0-9\.\,\-\s/x×]', ' ', without_urls)
    return re.sub(r'\s+', ' ', allowed_only).strip()

# Basic text statistics computed on the cleaned description.
test['catalog_clean'] = test['catalog_content'].fillna('').map(clean_text)
test['text_len'] = test['catalog_clean'].str.len()
test['num_tokens'] = test['catalog_clean'].str.split().map(len)
test['num_digits'] = test['catalog_clean'].str.count(r'\d')
# has_image = 1 when image_link is a non-blank string. DataFrame.get() with a
# str default returns that str when the column is missing, and str has no
# .fillna, so build an explicit empty Series for that case instead.
if 'image_link' in test.columns:
    image_links = test['image_link']
else:
    image_links = pd.Series('', index=test.index)
test['has_image'] = image_links.fillna('').astype(str).str.strip().astype(bool).astype(int)

# numeric columns that training used (if present in train_features parquet)
train_feat_path = os.path.join(DATA_DIR, 'train_features_step5.parquet')
if not os.path.exists(train_feat_path):
    raise FileNotFoundError("train_features_step5.parquet required (for numeric features).")
train = pd.read_parquet(train_feat_path)
candidate_num_cols = ['ipq','text_len','num_tokens','num_digits','has_image',
                      'brand_freq','brand_mean_oof','token_mean_topk_avg_oof','brand_ppu_oof',
                      'brand_mean','token_mean_topk_avg','brand_ppu']
# determine numeric columns available for test (map names)
test_num_cols = []
for c in candidate_num_cols:
    base = c.replace('_oof','')
    # 'brand_mean_oof' and 'brand_mean' both map to 'brand_mean'; add each test
    # column only once so the numeric block never contains a duplicated feature.
    if base in test.columns and base not in test_num_cols:
        test_num_cols.append(base)
# fallback to safe core
if len(test_num_cols) == 0:
    for c in ['text_len','num_tokens','num_digits','has_image']:
        if c in test.columns: test_num_cols.append(c)

print("Numeric columns to include in X:", test_num_cols)

# Helper to try a transform and check feature count
def try_transform_and_count(what):
    """Build the test feature matrix for one candidate pipeline and report its width.

    Parameters
    ----------
    what : dict
        Either ``{'mode': 'single', 'vec': <key of candidates>, 'use_svd': bool}``
        (one vectorizer, optionally followed by the SVD stored at ``svd_w_path``)
        or ``{'mode': 'wordchar', 'svd_w': bool, 'svd_c': bool}`` (the 'tf_word'
        and 'tf_char' vectorizers side by side, each optionally reduced by the
        SVD at ``svd_w_path`` / ``svd_c_path``).

    Returns
    -------
    tuple or None
        ``(X_full, n_feat)`` where ``X_full`` is the text block followed by the
        ``test_num_cols`` columns (NaN filled with -1) and ``n_feat`` is its
        column count. ``None`` when the mode is unknown or a requested SVD file
        does not exist.
    """
    texts = test['catalog_clean'].fillna('')

    # build text features
    if what['mode']=='single':
        X_text = joblib.load(candidates[what['vec']]).transform(texts)
        if what.get('use_svd', False):
            if not os.path.exists(svd_w_path):
                # cannot apply svd
                return None
            X_text = joblib.load(svd_w_path).transform(X_text)
    elif what['mode']=='wordchar':
        use_svd_w = what.get('svd_w', False)
        use_svd_c = what.get('svd_c', False)
        # Check for the SVD files before paying for the two TF-IDF transforms.
        if use_svd_w and not os.path.exists(svd_w_path):
            return None
        if use_svd_c and not os.path.exists(svd_c_path):
            return None
        Xw = joblib.load(candidates['tf_word']).transform(texts)
        Xc = joblib.load(candidates['tf_char']).transform(texts)
        if use_svd_w:
            Xw = joblib.load(svd_w_path).transform(Xw)
        if use_svd_c:
            Xc = joblib.load(svd_c_path).transform(Xc)
        # SVD output is dense; csr_matrix() wraps it so both halves can be stacked.
        X_text = hstack([csr_matrix(Xw), csr_matrix(Xc)], format='csr')
    else:
        return None

    # numeric part
    if len(test_num_cols) > 0:
        X_num = test[test_num_cols].fillna(-1).astype(float).values
        # format='csr' so the caller gets a row-sliceable matrix rather than
        # hstack's default COO result.
        X_full = hstack([csr_matrix(X_text), csr_matrix(X_num)], format='csr')
    else:
        X_full = X_text

    # get shape (n_features)
    return X_full, X_full.shape[1]

# Enumerate candidate pipelines to try
# Vectorizer keys whose pickle file exists on disk, in `candidates` order.
available_vecs = [name for name, path in candidates.items() if os.path.exists(path)]

# single tfidf candidates (no svd)
options = [{'mode':'single','vec':name,'use_svd':False} for name in available_vecs]
# single with svd (if svd exists)
if os.path.exists(svd_w_path):
    options.extend({'mode':'single','vec':name,'use_svd':True} for name in available_vecs)
# try combined word+char with/without svd
have_word_and_char = os.path.exists(candidates['tf_word']) and os.path.exists(candidates['tf_char'])
if have_word_and_char:
    svd_settings = [(False, False)]
    if os.path.exists(svd_w_path) or os.path.exists(svd_c_path):
        svd_settings += [(True, True), (True, False), (False, True)]
    for use_w, use_c in svd_settings:
        options.append({'mode':'wordchar','svd_w':use_w,'svd_c':use_c})

print("Trying", len(options), "candidate preprocessing options...")

# Search state: set once an option yields exactly the feature count the model expects.
matched, match_info, X_chosen = False, None, None

for opt in options:
    try:
        res = try_transform_and_count(opt)
        # None means the transformation couldn't be applied; move on to the next option.
        if res is not None:
            X_tmp, n_feat = res
            print("Tried:", opt, "-> features:", n_feat)
            if n_feat == model_num_feat:
                print("✔ MATCH found for option:", opt)
                matched, X_chosen, match_info = True, X_tmp, opt
                break
    except Exception as e:
        # A failing candidate is reported and skipped; the search goes on.
        print("Error trying option", opt, ":", e)

# No candidate produced the model's feature count: print diagnostics and stop the cell.
if not matched:
    print("\n❌ No exact match found. Next steps:")
    print("- Inspect model.feature_name() to see pattern of feature names:")
    try:
        fnames = model_check.feature_name()[:50]
        print("First feature names in model:", fnames)
    except Exception as e:
        print("Could not read model feature names:", e)
    # show available vectorizer dims for info
    # One row is enough to read the output width; no need to transform the whole test set.
    sample_texts = test['catalog_clean'].fillna('').head(1)
    for k,p in candidates.items():
        if os.path.exists(p):
            v = joblib.load(p)
            try:
                dim = v.transform(sample_texts).shape[1]
            except Exception:
                # Report an unusable vectorizer as dim None, but let
                # KeyboardInterrupt/SystemExit propagate.
                dim = None
            print(f"Vectorizer {k} -> dim {dim}")
    for svd_name, svd_path in (('svd_w', svd_w_path), ('svd_c', svd_c_path)):
        if os.path.exists(svd_path):
            svd = joblib.load(svd_path)
            # TruncatedSVD exposes the size as the constructor parameter
            # `n_components` (there is no fitted `n_components_`); try both.
            n_comp = getattr(svd, 'n_components_', getattr(svd, 'n_components', None))
            print(f"{svd_name} n_components:", n_comp)
    print("\nIf no match, either:")
    print(" - model expects combined TF-IDF with different stacking/order (we need that exact pipeline),")
    print(" - or the model used a TF-IDF file not present here (e.g. tfidf_fast.pkl). Check which file was used when training.")
    raise SystemExit("No matching preprocessing found. Ask to inspect model.feature_name() and TF-IDF files.")

# If matched, do inference with all fold models and save submission
print("Using matched option:", match_info, "to build features and predict with all folds.")

# hstack may have produced a COO matrix; convert to CSR once here rather than
# leaving the conversion to every predict() call.
X_pred = X_chosen.tocsr() if hasattr(X_chosen, 'tocsr') else X_chosen

# Load every fold model exactly once: keep the ones whose feature count matches
# X and collect their predictions in the same pass.
valid_models = []
preds = []
for p in MODEL_PATHS:
    if not os.path.exists(p):
        print("Skipping model (file not found):", p)
        continue
    m = lgb.Booster(model_file=p)
    if m.num_feature() != X_chosen.shape[1]:
        print(f"Warning: model {p} expects {m.num_feature()} features but X has {X_chosen.shape[1]}. Skipping this model.")
        continue
    valid_models.append(p)
    preds.append(m.predict(X_pred))

if len(valid_models) == 0:
    raise SystemExit("No fold models have matching feature count. We cannot proceed without the correct preprocessing pipeline.")

print("Valid models to use:", valid_models)

# Average the per-fold predictions
pred = np.mean(preds, axis=0)
# model was trained on log1p -> invert
pred_price = np.expm1(pred)
# floor at 0.01 so no price in the submission is zero or negative
pred_price = np.clip(pred_price, 0.01, None)

# Save submission
out_path = os.path.join(OUT_DIR, 'test_out_matched.csv')
pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_price}).to_csv(out_path, index=False)
print("Saved submission to:", out_path)


Model expects num_feature() = 15009
Numeric columns to include in X: ['text_len', 'num_tokens', 'num_digits', 'has_image']
Trying 12 candidate preprocessing options...
Tried: {'mode': 'single', 'vec': 'tf_word', 'use_svd': False} -> features: 20004
Tried: {'mode': 'single', 'vec': 'tf_char', 'use_svd': False} -> features: 5004
Tried: {'mode': 'single', 'vec': 'tfidf_fast', 'use_svd': False} -> features: 10004
Tried: {'mode': 'single', 'vec': 'tfidf_baseline', 'use_svd': False} -> features: 30004
Tried: {'mode': 'single', 'vec': 'tf_word', 'use_svd': True} -> features: 204
Error trying option {'mode': 'single', 'vec': 'tf_char', 'use_svd': True} : X has 5000 features, but TruncatedSVD is expecting 20000 features as input.
Error trying option {'mode': 'single', 'vec': 'tfidf_fast', 'use_svd': True} : X has 10000 features, but TruncatedSVD is expecting 20000 features as input.
Error trying option {'mode': 'single', 'vec': 'tfidf_baseline', 'use_svd': True} : X has 30000 features, but Trun

SystemExit: No matching preprocessing found. Ask to inspect model.feature_name() and TF-IDF files.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# ===== Fix missing numeric features & run inference (auto) =====
import os, joblib, re, numpy as np, pandas as pd
from scipy.sparse import hstack, csr_matrix
import lightgbm as lgb

ROOT = '/content/drive/MyDrive/student_resource'
DATA_DIR = os.path.join(ROOT, 'dataset')
OUT_DIR = os.path.join(ROOT, 'final_submission_fixed2')
os.makedirs(OUT_DIR, exist_ok=True)

# Input artefacts: five fold models, two fitted vectorizers, train features, test csv.
MODEL_PATHS = []
for fold in range(1, 6):
    MODEL_PATHS.append(os.path.join(ROOT, f'lgb_log_fold{fold}.txt'))
TF_FAST = os.path.join(ROOT, 'tfidf_fast.pkl')   # 10000 dim
TF_CHAR = os.path.join(ROOT, 'tf_char.pkl')      # 5000 dim
TRAIN_FEAT = os.path.join(DATA_DIR, 'train_features_step5.parquet')
# Prefer test.csv; use test_prod.csv when test.csv is not there.
_primary_test = os.path.join(DATA_DIR, 'test.csv')
TEST_PATH = _primary_test if os.path.exists(_primary_test) else os.path.join(DATA_DIR, 'test_prod.csv')

# Fail fast on the first required file that is missing.
_first_missing = next(
    (required for required in (TF_FAST, TF_CHAR, TRAIN_FEAT, TEST_PATH) if not os.path.exists(required)),
    None,
)
if _first_missing is not None:
    raise FileNotFoundError(f"Missing required file: {_first_missing}")

# Train features are loaded so later steps can look up which numeric columns exist.
train = pd.read_parquet(TRAIN_FEAT)
print("Loaded train features shape:", train.shape)
print("Example train columns:", train.columns.tolist()[:40])

# Inspect numeric columns that likely were appended at training time.
# We will search for the common numeric feature names used earlier.
possible_numeric_order = [
    'ipq', 'text_len', 'num_tokens', 'num_digits', 'has_image',
    'brand_freq', 'brand_mean_oof', 'token_mean_topk_avg_oof', 'brand_ppu_oof',
    'brand_mean', 'token_mean_topk_avg', 'brand_ppu', 'price_per_unit'
]

# Determine which of these exist in train, and capture the list in the order expected
numeric_in_train = [c for c in possible_numeric_order if c in train.columns]
print("Numeric features present in train (in guessed priority order):", numeric_in_train)

# We expect the training numeric block length = model_num - tf_fast_dim - tf_char_dim
model_ref = lgb.Booster(model_file=MODEL_PATHS[0])
model_num_feat = model_ref.num_feature()
print("Model expects total features:", model_num_feat)

def _vectorizer_dim(vec):
    """Return the number of output columns of a fitted vectorizer.

    Transforms a one-element probe; if that fails, falls back to the size of
    ``vocabulary_``. When neither works the original error is re-raised, so the
    failure is reported here instead of as a TypeError in the subtraction below.
    """
    try:
        return vec.transform(["test"]).shape[1]
    except Exception:
        vocab = getattr(vec, 'vocabulary_', None)
        if vocab:
            return len(vocab)
        raise

# get tf dims (same probe and fallback for both vectorizers)
tf_fast = joblib.load(TF_FAST)
tf_char = joblib.load(TF_CHAR)
tf_fast_dim = _vectorizer_dim(tf_fast)
tf_char_dim = _vectorizer_dim(tf_char)
print("tf_fast_dim:", tf_fast_dim, "tf_char_dim:", tf_char_dim)

# Whatever the model expects beyond the two text blocks must be numeric columns.
expected_numeric_cnt = model_num_feat - tf_fast_dim - tf_char_dim
print("Model expects numeric block length =", expected_numeric_cnt)

# Read the test csv and normalise its header to lower case.
test = pd.read_csv(TEST_PATH)
test.columns = [name.lower() for name in test.columns]
# If an expected column name is absent, the column at that position is renamed
# to it: position 0 -> 'sample_id', position 1 -> 'catalog_content'.
for position, required_name in enumerate(('sample_id', 'catalog_content')):
    if required_name not in test.columns:
        test = test.rename(columns={test.columns[position]: required_name})
# basic cleaning + some extractors (same as before)
def clean_text(s):
    """Return a lower-cased, whitespace-normalised version of ``s``.

    Missing values yield ``""``. Three substitutions are applied in order, each
    replacing its match with a single space: URLs, any character outside the
    allowed set (a-z, 0-9, ``. , - /``, whitespace, ``x``/``×``), and runs of
    whitespace. Leading and trailing whitespace is then stripped.
    """
    if pd.isna(s):
        return ""
    text = str(s).lower()
    for pattern in (r'https?://\S+', r'[^a-z0-9\.\,\-\s/x×]', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text.strip()

def extract_ipq(s):
    """Return the leading count of the first pack/multiplier expression in ``s``.

    The text is lower-cased and four patterns are tried in order: a number
    followed by a pack-style word ("6 pack", "24pcs", "12 count"), a number
    times a number ("3 x 200"), a bare multiplier ("2x"), and a hyphenated
    pack ("6 - pack"). The first pattern that matches supplies the integer.
    Returns ``np.nan`` for falsy input or when nothing matches.
    """
    if not s:
        return np.nan
    text = str(s).lower()
    ordered_patterns = (
        r'(\d+)\s*(?:pack|packs|pk|pks|pcs|pieces|count|set)\b',
        r'\b(\d+)\s*[x×]\s*\d',
        r'\b(\d+)[x×]\b',
        r'\b(\d+) ?- ?pack\b',
    )
    for pattern in ordered_patterns:
        hit = re.search(pattern, text)
        if hit:
            return int(hit.group(1))
    return np.nan

# Basic text statistics computed on the cleaned description.
test['catalog_clean'] = test['catalog_content'].fillna('').map(clean_text)
test['text_len'] = test['catalog_clean'].str.len()
test['num_tokens'] = test['catalog_clean'].str.split().map(len)
test['num_digits'] = test['catalog_clean'].str.count(r'\d')
# has_image = 1 when image_link is a non-blank string. DataFrame.get() with a
# str default returns that str when the column is missing, and str has no
# .fillna, so build an explicit empty Series for that case instead.
if 'image_link' in test.columns:
    image_links = test['image_link']
else:
    image_links = pd.Series('', index=test.index)
test['has_image'] = image_links.fillna('').astype(str).str.strip().astype(bool).astype(int)
test['ipq'] = test['catalog_clean'].map(extract_ipq)

# derive brand_guess and mapping-based aggregates from train
# Creates five test columns: brand_guess, brand_freq, brand_mean_oof,
# brand_ppu_oof and token_mean_topk_avg_oof. When train has a brand_guess column
# the values are looked up from per-brand tables built on train; otherwise
# constant defaults are used.
if 'brand_guess' in train.columns:
    # compute mapping tables from train
    # brand_freq_map: share of train rows carrying each brand_guess value
    brand_freq_map = train.groupby('brand_guess')['brand_guess'].count().to_dict()
    brand_freq_map = {k: v / len(train) for k, v in brand_freq_map.items()}
    # brand_mean_oof if present in train: use groupby first value or mean
    # NOTE(review): .first() keeps one row's value per brand; if brand_mean_oof
    # differs between rows of the same brand this is an arbitrary pick -- confirm
    # against how the training pipeline filled this column for test data.
    if 'brand_mean_oof' in train.columns:
        brand_mean_map = train.groupby('brand_guess')['brand_mean_oof'].first().to_dict()
    else:
        brand_mean_map = train.groupby('brand_guess')['price'].mean().to_dict()
    # brand_ppu_oof: same lookup strategy (first stored value, else per-brand mean
    # of price_per_unit, else an empty table so every test row maps to NaN)
    if 'brand_ppu_oof' in train.columns:
        brand_ppu_map = train.groupby('brand_guess')['brand_ppu_oof'].first().to_dict()
    elif 'price_per_unit' in train.columns:
        brand_ppu_map = train.groupby('brand_guess')['price_per_unit'].mean().to_dict()
    else:
        brand_ppu_map = {}

    # token mean: fallback to column token_mean_topk_avg_oof if present
    # A single scalar (a train-wide mean) is computed here; it is assigned to every test row below.
    if 'token_mean_topk_avg_oof' in train.columns:
        token_mean_default = train['token_mean_topk_avg_oof'].mean()
    elif 'token_mean_topk_avg' in train.columns:
        token_mean_default = train['token_mean_topk_avg'].mean()
    else:
        token_mean_default = train['price'].mean()

    # compute brand_guess for test
    def guess_brand(s):
        """Return the first whitespace-separated token of ``s``, or '' if there is none."""
        # NOTE(review): presumably train's brand_guess was derived the same way -- verify.
        toks = str(s).split()
        return toks[0] if len(toks)>0 else ''
    test['brand_guess'] = test['catalog_clean'].map(guess_brand)
    # Brands not present in the train tables get frequency 0.0 and the train-wide mean price.
    test['brand_freq'] = test['brand_guess'].map(brand_freq_map).fillna(0.0)
    test['brand_mean_oof'] = test['brand_guess'].map(brand_mean_map).fillna(train['price'].mean())
    # fillna(np.nan) leaves missing values as NaN.
    test['brand_ppu_oof'] = test['brand_guess'].map(brand_ppu_map).fillna(np.nan)
    # token_mean_topk_avg_oof fallback
    test['token_mean_topk_avg_oof'] = token_mean_default
else:
    # best-effort fallback: create these columns with defaults
    test['brand_guess'] = ''
    test['brand_freq'] = 0.0
    test['brand_mean_oof'] = train['price'].mean()
    test['brand_ppu_oof'] = np.nan
    test['token_mean_topk_avg_oof'] = train['price'].mean()

# Now assemble numeric columns in the exact order expected.
# We will attempt the 9-column set; if expected_numeric_cnt is different, adjust.
# Try to use exact columns (the training pipeline likely used these 9)
numeric_full_order = [
    c for c in ['ipq','text_len','num_tokens','num_digits','has_image',
                'brand_freq','brand_mean_oof','token_mean_topk_avg_oof','brand_ppu_oof']
    if c in test.columns
]

print("Numeric columns to append (final):", numeric_full_order)
print("Count numeric columns available:", len(numeric_full_order), "Expected numeric count by model:", expected_numeric_cnt)

if len(numeric_full_order) < expected_numeric_cnt:
    # try adding any other numeric present in train that we might have missed.
    # Only columns that also exist in test are eligible: a train-only column
    # (for example the target 'price') would make test[numeric_full_order] raise
    # KeyError when the numeric block is built. is_numeric_dtype also accepts
    # int32/float32, which the previous `dtype in [int, float]` test missed.
    extras = [
        c for c in train.columns
        if c in test.columns
        and c not in numeric_full_order
        and pd.api.types.is_numeric_dtype(train[c])
        and pd.api.types.is_numeric_dtype(test[c])
    ]
    for e in extras:
        if len(numeric_full_order) >= expected_numeric_cnt: break
        numeric_full_order.append(e)
    print("After adding extras attempt, numeric columns:", numeric_full_order)

# Build TF-IDF matrices using exact tfidf_fast + tf_char.
# tf_fast / tf_char were already unpickled earlier in this cell, so they are
# reused here instead of being loaded from disk a second time.
X_fast = tf_fast.transform(test['catalog_clean'])
X_char = tf_char.transform(test['catalog_clean'])
print("tf_fast dim:", X_fast.shape[1], "tf_char dim:", X_char.shape[1])

# numeric array (NaN -> -1); None when there are no numeric columns to append
X_num = test[numeric_full_order].fillna(-1).astype(float).values if len(numeric_full_order)>0 else None

# compose final X_test in exact stacking order: fast + char + numeric_full_order
feature_blocks = [csr_matrix(X_fast), csr_matrix(X_char)]
if X_num is not None:
    feature_blocks.append(csr_matrix(X_num))
# format='csr' returns CSR directly instead of hstack's default COO, so each
# fold model below receives a ready-to-use matrix with no per-call conversion.
X_test = hstack(feature_blocks, format='csr')
print("Built X_test shape:", X_test.shape)

# verify feature count matches model
if X_test.shape[1] != model_num_feat:
    raise RuntimeError(f"Feature count mismatch after adding numerics: X_test.columns = {X_test.shape[1]}, model expects {model_num_feat}. Adjust numeric order or include the missing columns.")

# Score X_test with every fold model whose feature count matches; report the rest.
preds = []
for p in MODEL_PATHS:
    m = lgb.Booster(model_file=p)
    if m.num_feature() == X_test.shape[1]:
        preds.append(m.predict(X_test))
    else:
        print("Skipping model (shape mismatch):", p, "expects", m.num_feature())
if not preds:
    raise RuntimeError("No matching fold models to predict with (shape mismatch).")

# Average the folds, invert with expm1, then floor the price at 0.01.
pred = np.mean(preds, axis=0)
pred_price = np.clip(np.expm1(pred), 0.01, None)

# Write the two-column submission file.
out_path = os.path.join(OUT_DIR, 'test_out_fixed2.csv')
submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_price})
submission.to_csv(out_path, index=False)
print("Saved submission:", out_path)
print("Rows:", len(pred_price))


Loaded train features shape: (75000, 20)
Example train columns: ['sample_id', 'catalog_content', 'image_link', 'price', 'catalog_clean', 'text_len', 'num_tokens', 'num_digits', 'has_image', 'ipq', 'unit_amount', 'unit_type', 'brand_guess', 'brand_freq', 'brand_mean_oof', 'tokens', 'token_mean_topk_avg_oof', 'units_total', 'price_per_unit', 'brand_ppu_oof']
Numeric features present in train (in guessed priority order): ['ipq', 'text_len', 'num_tokens', 'num_digits', 'has_image', 'brand_freq', 'brand_mean_oof', 'token_mean_topk_avg_oof', 'brand_ppu_oof', 'price_per_unit']
Model expects total features: 15009
tf_fast_dim: 10000 tf_char_dim: 5000
Model expects numeric block length = 9
Numeric columns to append (final): ['ipq', 'text_len', 'num_tokens', 'num_digits', 'has_image', 'brand_freq', 'brand_mean_oof', 'token_mean_topk_avg_oof', 'brand_ppu_oof']
Count numeric columns available: 9 Expected numeric count by model: 9
tf_fast dim: 10000 tf_char dim: 5000
Built X_test shape: (75000, 1500

In [None]:
# Sanity check: print each fold model's expected feature count next to X_test's width.
n_test_columns = X_test.shape[1]
for path in MODEL_PATHS:
    fold_model = lgb.Booster(model_file=path)
    print(fold_model.num_feature(), n_test_columns)


15009 15009
15009 15009
15009 15009
15009 15009
15009 15009
