In [1]:
pip install lightgbm tqdm scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install -U lightgbm

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import ast, re, pickle
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from pathlib import Path

#  Load LeetCode dataset
df = pd.read_csv("../Data_Pipeline/preprocessed_data.csv")

def clean_title(text):
    text = str(text).lower().strip()
    text = re.sub(r'^\d+\.\s*', '', text)
    text = re.sub(r'[^a-z0-9\s\-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def parse_similar(x):
    try:
        if isinstance(x, str):
            lst = ast.literal_eval(x)
            return [clean_title(i) for i in lst]
        return []
    except Exception:
        return []

df['clean_title'] = df['title'].apply(clean_title)
df['ground_truth'] = df['similar_questions'].apply(parse_similar)

# ---- Load precomputed embeddings (from your previous hybrid model) ----
cache_path = Path("sbert_embeddings.pkl")
if cache_path.exists():
    with open(cache_path, "rb") as f:
        cache = pickle.load(f)
    embeddings = np.array(cache["embeddings"], dtype=np.float32)
else:
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(df['title'].tolist(), show_progress_bar=True, normalize_embeddings=True)
    with open(cache_path, "wb") as f:
        pickle.dump({"model_name": "all-MiniLM-L6-v2", "embeddings": embeddings}, f)

# Recreate helper similarities (tag, difficulty, popularity)
def to_tag_list(s):
    if isinstance(s, str) and s.startswith('['):
        try:
            vals = ast.literal_eval(s)
            return [str(v).lower().strip() for v in vals]
        except Exception:
            pass
    return [t.strip().lower() for t in str(s).split(',') if t.strip()]

df['tag_list'] = df['topic_tags'].apply(to_tag_list)
df['difficulty'] = df['difficulty'].fillna('Medium')

N = len(df)

# Tag Jaccard matrix
tag_sims = np.zeros((N, N), dtype=np.float32)
for i in tqdm(range(N), desc="Tag similarity"):
    tags_i = set(df.iloc[i]['tag_list'])
    for j in range(i+1, N):
        tags_j = set(df.iloc[j]['tag_list'])
        if not tags_i or not tags_j:
            continue
        inter = len(tags_i & tags_j)
        uni = len(tags_i | tags_j)
        if uni:
            val = inter / uni
            tag_sims[i,j] = tag_sims[j,i] = val

# Difficulty matrix
ladder = {"easy":0, "medium":1, "hard":2}
diff_vals = df['difficulty'].str.lower().map(ladder).fillna(1).to_numpy(dtype=np.int8)
diff_sims = np.ones((N, N), dtype=np.float32)
for i in range(N):
    for j in range(i+1, N):
        d = abs(diff_vals[i]-diff_vals[j])
        s = 1.0 if d==0 else 0.7 if d==1 else 0.4
        diff_sims[i,j] = diff_sims[j,i] = s

# Popularity normalization
def minmax(x):
    x = x.fillna(0)
    rng = x.max() - x.min()
    return (x - x.min())/rng if rng != 0 else x*0

acc = minmax(df['acceptance']) if 'acceptance' in df else 0
likes = minmax(df['likes']) if 'likes' in df else 0
subs  = minmax(df['submission']) if 'submission' in df else 0
popularity_score = (0.3*acc + 0.5*likes + 0.2*subs).fillna(0).to_numpy(dtype=np.float32)

print("Setup complete: df, embeddings, tag_sims, diff_sims, popularity_score loaded.")

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 116/116 [00:11<00:00, 10.21it/s]
Tag similarity: 100%|██████████| 3706/3706 [06:01<00:00, 10.25it/s] 


Setup complete: df, embeddings, tag_sims, diff_sims, popularity_score loaded.


In [2]:
# FAST LIGHTGBM PAIRWISE RECOMMENDER
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# TRAINING DATA GENERATION
print("Generating positive & negative pairs (vectorized)...")

positive_rows = []
negative_rows = []

# Generate positive and negative pairs more efficiently
for i in tqdm(range(len(df))):
    gt = df.loc[i, 'ground_truth']
    if not gt:
        continue
    # positive pairs
    pos_idx = []
    for title in gt:
        j_match = df.index[df['clean_title'] == title].tolist()
        if j_match:
            pos_idx.append(j_match[0])
    if not pos_idx:
        continue

    # compute positives
    pos_emb = np.dot(embeddings[i], embeddings[pos_idx].T)
    pos_tag = tag_sims[i, pos_idx]
    pos_diff = diff_sims[i, pos_idx]
    pos_pop = np.abs(popularity_score[i] - popularity_score[pos_idx])
    pos_feats = np.stack([pos_emb, pos_tag, pos_diff, pos_pop], axis=1)
    positive_rows.append(pos_feats)

    # negative samples (5x random)
    rand_idx = np.random.choice(len(df), size=min(5*len(pos_idx), len(df)), replace=False)
    rand_idx = rand_idx[rand_idx != i]
    neg_emb = np.dot(embeddings[i], embeddings[rand_idx].T)
    neg_tag = tag_sims[i, rand_idx]
    neg_diff = diff_sims[i, rand_idx]
    neg_pop = np.abs(popularity_score[i] - popularity_score[rand_idx])
    neg_feats = np.stack([neg_emb, neg_tag, neg_diff, neg_pop], axis=1)
    negative_rows.append(neg_feats)

X_pos = np.vstack(positive_rows)
X_neg = np.vstack(negative_rows)
y_pos = np.ones(len(X_pos))
y_neg = np.zeros(len(X_neg))

X = np.vstack([X_pos, X_neg])
y = np.concatenate([y_pos, y_neg])

print(f"Training samples: {len(X)}, Positives: {int(sum(y))}, Negatives: {len(y)-int(sum(y))}")

#  TRAIN LIGHTGBM
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "verbosity": -1
}

model = lgb.train(
    params,
    train_data,
    num_boost_round=200,
    valid_sets=[train_data, val_data]
)


print("\nFeature importances:")
for name, importance in zip(["sim_emb", "sim_tag", "sim_diff", "pop_diff"], model.feature_importance()):
    print(f"{name:10s} -> {importance}")

# RECOMMEND FUNCTION
def rec_lightgbm_fast(i, k=10):
    emb_sims = np.dot(embeddings[i], embeddings.T)
    tag_vals = tag_sims[i]
    diff_vals = diff_sims[i]
    pop_diffs = np.abs(popularity_score[i] - popularity_score)
    feats = np.stack([emb_sims, tag_vals, diff_vals, pop_diffs], axis=1)
    scores = model.predict(feats)
    scores[i] = -1e9  # exclude self
    top_idx = np.argsort(scores)[-k:][::-1]
    return [df.iloc[j]['clean_title'] for j in top_idx]

#EVALUATION
def precision_recall_ndcg(k=10, limit=300):
    P,R,N = [],[],[]
    for i in tqdm(range(min(limit, len(df)))):
        gt = df.loc[i, 'ground_truth']
        if not gt:
            continue
        preds = rec_lightgbm_fast(i, k)
        hits = len(set(preds) & set(gt))
        p = hits / k
        r = hits / len(gt)
        rel = [1 if x in gt else 0 for x in preds[:k]]
        dcg = sum(r_ / np.log2(idx+2) for idx, r_ in enumerate(rel))
        idcg = sum(sorted(rel, reverse=True)[i]/np.log2(i+2) for i in range(len(rel)))
        ndcg = dcg/idcg if idcg>0 else 0
        P.append(p); R.append(r); N.append(ndcg)
    return np.mean(P), np.mean(R), np.mean(N)

print("\nEvaluating LightGBM model on 300 problems...")
P, R, N = precision_recall_ndcg(k=10, limit=300)
print(f"\nLightGBM → Precision@10={P:.4f}, Recall@10={R:.4f}, NDCG@10={N:.4f}")


Generating positive & negative pairs (vectorized)...


100%|██████████| 3706/3706 [00:03<00:00, 940.50it/s] 


Training samples: 30720, Positives: 5121, Negatives: 25599

Feature importances:
sim_emb    -> 2373
sim_tag    -> 1027
sim_diff   -> 371
pop_diff   -> 2229

Evaluating LightGBM model on 300 problems...


100%|██████████| 300/300 [00:04<00:00, 72.53it/s]


LightGBM → Precision@10=0.1407, Recall@10=0.4554, NDCG@10=0.6041





In [4]:
import pickle
pickle.dump(model, open("../models/lightgbm_model.pkl", "wb"))
print(" Model saved successfully ")


 Model saved successfully 
