In [1]:
pip install tf-keras


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# ============================
# RAG + Small LLM (Flan-T5 Small) — Zero-deps friendly
# ============================

import sys, subprocess, importlib, os, re, math, random, warnings
warnings.filterwarnings("ignore")

# --- installer ---
def ensure(pkg_spec, import_name=None):
    name = (import_name or pkg_spec.split("==")[0].split(">=")[0].split("[")[0]).strip()
    try:
        importlib.import_module(name)
    except Exception:
        print(f"Installing {pkg_spec} …")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg_spec])

# Core scientific stack
ensure("numpy>=1.22", "numpy")
ensure("pandas>=1.5.0", "pandas")
ensure("scikit-learn>=1.1.0", "sklearn")
ensure("tqdm>=4.66.0", "tqdm")

# Torch (CPU by default; will use GPU if available)
try:
    import torch
except Exception:
    try:
        # Generic install (pip resolves best wheel)
        ensure("torch>=2.0.0", "torch")
    except Exception:
        # CPU fallback index
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                               "-i", "https://download.pytorch.org/whl/cpu", "torch"])
import torch

# NLP + embeddings
ensure("transformers>=4.37.0", "transformers")
ensure("sentencepiece>=0.1.99", "sentencepiece")
ensure("accelerate>=0.27.0", "accelerate")
ensure("sentence-transformers>=2.2.2", "sentence_transformers")

# FAISS (optional). If it fails, we'll fallback to sklearn NearestNeighbors.
USE_FAISS = True
try:
    import faiss # type: ignore
except Exception:
    try:
        ensure("faiss-cpu>=1.7.4", "faiss")
        import faiss # type: ignore
    except Exception:
        print("faiss-cpu not available; falling back to sklearn NearestNeighbors.")
        USE_FAISS = False

# ---- imports after install ----
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# ---------------- Config ----------------
SEED = 42
random.seed(SEED); np.random.seed(SEED)
MODEL_NAME = "google/flan-t5-small" # you can switch to "google/t5-v1_1-small" or "google/byt5-small"
TOPK = 13 # CHANGED FROM 5 TO 13
SNIP = 200      # chars per neighbor
VAL_SIZE = 0.20
MAX_PROMPT_TOKENS = 512
MAX_NEW_TOKENS = 12

# ---------------- Helpers ----------------
_WS = re.compile(r"\s+")
def clean_text(s):
    s = "" if not isinstance(s, str) else s.replace("\x00", " ")
    return _WS.sub(" ", s).strip()

def parse_price_str(s, default=0.0):
    s = "" if s is None else str(s)
    s = s.replace(",", "").strip()
    m = re.search(r"[-+]?\d*\.?\d+", s)
    return float(m.group(0)) if m else default

def smape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    return 100.0 * np.mean(np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + eps))

def smape_calibrate(y_true, y_pred, lo=0.7, hi=1.3, n=121):
    xs = np.linspace(lo, hi, int(n)); best_s, best_v = 1.0, 1e9
    for s in xs:
        v = smape(y_true, y_pred * s)
        if v < best_v: best_s, best_v = float(s), float(v)
    return best_s, best_v

def clamp_and_clip(pred, lo_clip, hi_clip):
    pred = np.asarray(pred, float)
    pred[pred < 0.0] = 0.0
    return np.clip(pred, lo_clip, hi_clip)

def rmse_compat(y_true, y_pred):
    try:
        return float(mean_squared_error(y_true, y_pred, squared=False))
    except TypeError:
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# ---------------- Load data ----------------
df = pd.read_csv("train.csv")
assert {"catalog_content", "price"}.issubset(df.columns), "train.csv must contain catalog_content and price."

df["catalog_content"] = df["catalog_content"].map(clean_text)
df["price"] = pd.to_numeric(df["price"], errors="coerce").fillna(0.0).astype(float)
df["input_text"] = df["catalog_content"].astype(str)

# Stratified 80/20 split by price quantiles
q_bins = min(20, max(2, int(df["price"].nunique() // 5)))
bins = pd.qcut(df["price"], q=q_bins, duplicates="drop", labels=False)
train_df, valid_df = train_test_split(df, test_size=VAL_SIZE, random_state=SEED, stratify=bins)
print(f"Train: {len(train_df)} | Valid: {len(valid_df)}")

# Robust clipping thresholds from train only
lo_clip = float(np.quantile(train_df["price"].values, 0.001))
hi_clip = float(np.quantile(train_df["price"].values, 0.999))
print(f"Train price clip range: [{lo_clip:.3f}, {hi_clip:.3f}]")

# ---------------- Build retriever ----------------
print("Encoding train embeddings…")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
train_texts = train_df["input_text"].tolist()
train_prices = train_df["price"].values
train_emb = embedder.encode(train_texts, convert_to_numpy=True, show_progress_bar=True, batch_size=64, normalize_embeddings=True)

if USE_FAISS:
    dim = train_emb.shape[1]
    index = faiss.IndexFlatIP(dim) # cosine via inner product on normalized embeddings
    index.add(train_emb)
else:
    # Fit a cosine NN index with sklearn (brute force)
    nn = NearestNeighbors(n_neighbors=min(13, len(train_texts)), metric="cosine", algorithm="brute") # CHANGED FROM TOPK TO 13
    nn.fit(train_emb)

# ---------------- Load small LLM ----------------
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# ---------------- RAG prediction ----------------
def rag_predict_prices(texts, k=13): # CHANGED FROM TOPK TO 13
    preds = []
    med_train = float(np.median(train_prices))
    for txt in tqdm(texts, desc="🔍 RAG + Small LLM Predicting"):
        # retrieve indices of top-k similar train items
        q_emb = embedder.encode([txt], convert_to_numpy=True, normalize_embeddings=True)
        if USE_FAISS:
            D, I = index.search(q_emb, k)
            retrieved = train_df.iloc[I[0]]
        else:
            # sklearn cosine distance; lower is closer
            D, I = nn.kneighbors(q_emb, n_neighbors=k, return_distance=True)
            retrieved = train_df.iloc[I[0]]

        # build examples text (avoid f-string backslash issue)
        lines = []
        for _, r in retrieved.iterrows():
            snippet = str(r["input_text"])[:SNIP].replace("\n", " ")
            lines.append(f"Example: {snippet} ... => price={float(r['price']):.2f}")
        examples = "\n".join(lines)

        prompt = (
            "You are a pricing assistant. Using the examples of similar products and their prices, "
            "predict the price for the new product.\n\n"
            f"Similar examples:\n{examples}\n\n"
            f"New product:\n{txt[:512]}\n\n"
            "Answer with only a number in rupees, with two decimals (e.g., 129.99)."
        )

        inp = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_PROMPT_TOKENS).to(device)
        with torch.no_grad():
            out_ids = model.generate(**inp, max_new_tokens=MAX_NEW_TOKENS, do_sample=False, num_beams=1)
        pred_text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
        preds.append(parse_price_str(pred_text, default=med_train))
    return np.array(preds, dtype=float)

# ---------------- Evaluate on validation ----------------
y_true = valid_df["price"].values.astype(float)
y_pred = rag_predict_prices(valid_df["input_text"].tolist(), k=13) # CHANGED FROM TOPK TO 13

# post-process + calibration
y_pred = clamp_and_clip(y_pred, lo_clip, hi_clip)
scale, smape_scaled = smape_calibrate(y_true, y_pred, lo=0.7, hi=1.3, n=121)
y_pred_scaled = y_pred * scale

mae  = mean_absolute_error(y_true, y_pred_scaled)
rmse = rmse_compat(y_true, y_pred_scaled)

print("\n RAG + Small LLM (model: %s, k=%d)" % (MODEL_NAME, 13)) # CHANGED FROM TOPK TO 13
print(f"SMAPE (raw)    : {smape(y_true, y_pred):.3f}")
print(f"SMAPE (scaled) : {smape(y_true, y_pred_scaled):.3f}  (scale={scale:.4f})")
print(f"MAE            : {mae:.3f}")
print(f"RMSE           : {rmse:.3f}")

2025-10-30 00:48:01.344221: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-30 00:48:01.396965: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Train: 60000 | Valid: 15000
Train price clip range: [0.640, 329.993]
Encoding train embeddings…


Batches:   0%|          | 0/938 [00:00<?, ?it/s]

🔍 RAG + Small LLM Predicting: 100%|██████████████████████████████████████████████| 15000/15000 [22:15<00:00, 11.23it/s]


 RAG + Small LLM (model: google/flan-t5-small, k=13)
SMAPE (raw)    : 43.196
SMAPE (scaled) : 43.196  (scale=1.0000)
MAE            : 21.186
RMSE           : 43.799



