In [13]:
import os, re, ast, pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle


DATA_PATH = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\data\processed\preprocessed_data.csv")
EMB_PATH  = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\models\sbert_recommender.pkl")
MODEL_SAVE = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\models\lambdarank_model.pkl")

def clean_title(t):
    t = str(t).lower().strip()
    t = re.sub(r'^\d+\.\s*', '', t)
    t = re.sub(r'[^a-z0-9\s\-]', '', t)
    t = re.sub(r'\s+', ' ', t)
    return t

def parse_similar(x):
    try:
        if isinstance(x, str):
            return [clean_title(i) for i in ast.literal_eval(x)]
    except Exception:
        pass
    return []

def to_tag_list(s):
    if isinstance(s, str) and s.startswith('['):
        try:
            vals = ast.literal_eval(s)
            return [v.lower().strip() for v in vals]
        except Exception:
            pass
    return [t.strip().lower() for t in str(s).split(',') if t.strip()]

def minmax(x):
    x = x.fillna(0)
    rng = x.max() - x.min()
    return (x - x.min()) / rng if rng != 0 else x * 0

df = pd.read_csv(DATA_PATH)
df['clean_title'] = df['title'].apply(clean_title)
df['ground_truth'] = df['similar_questions'].apply(parse_similar)
df['tag_list'] = df['topic_tags'].apply(to_tag_list)
df['difficulty'] = df['difficulty'].fillna('Medium')

if EMB_PATH.exists():
    with open(EMB_PATH, "rb") as f:
        cache = pickle.load(f)
    embeddings = np.array(cache["embeddings"], dtype=np.float32)
else:
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(df['title'].tolist(), show_progress_bar=True, normalize_embeddings=True)
    with open(EMB_PATH, "wb") as f:
        pickle.dump({"model_name": "all-MiniLM-L6-v2", "embeddings": embeddings}, f)

N = len(df)

ladder = {"easy": 0, "medium": 1, "hard": 2}
diff_vals = df['difficulty'].str.lower().map(ladder).fillna(1).to_numpy(dtype=np.int8)

acc = minmax(df['acceptance']) if 'acceptance' in df else 0
likes = minmax(df['likes']) if 'likes' in df else 0
subs = minmax(df['submission']) if 'submission' in df else 0
popularity = (0.3 * acc + 0.5 * likes + 0.2 * subs).fillna(0).to_numpy(dtype=np.float32)

print("Building tag similarity matrix...")
tag_sims = np.zeros((N, N), dtype=np.float32)
for i in tqdm(range(N)):
    tags_i = set(df.iloc[i]['tag_list'])
    for j in range(i + 1, N):
        tags_j = set(df.iloc[j]['tag_list'])
        if not tags_i or not tags_j:
            continue
        inter = len(tags_i & tags_j)
        uni = len(tags_i | tags_j)
        if uni:
            val = inter / uni
            tag_sims[i, j] = tag_sims[j, i] = val

print("Building difficulty similarity matrix...")
diff_sims = np.ones((N, N), dtype=np.float32)
for i in range(N):
    for j in range(i + 1, N):
        d = abs(diff_vals[i] - diff_vals[j])
        diff_sims[i, j] = diff_sims[j, i] = 1.0 if d == 0 else 0.7 if d == 1 else 0.4


query_ids = np.arange(N)
query_ids = shuffle(query_ids, random_state=42)
train_queries, val_queries = train_test_split(query_ids, test_size=0.2, random_state=42)

print("Generating grouped feature data...")
features, labels, group_sizes = [], [], []

for i in tqdm(train_queries, desc="Building training pairs"):
    gt = df.loc[i, 'ground_truth']
    if not gt:
        continue
    pos_idx = []
    for title in gt:
        j_match = df.index[df['clean_title'] == title].tolist()
        if j_match:
            pos_idx.append(j_match[0])
    if not pos_idx:
        continue

    # Positive pairs
    for j in pos_idx:
        emb_sim = np.dot(embeddings[i], embeddings[j])
        tag_sim = tag_sims[i, j]
        diff_sim = diff_sims[i, j]
        pop_diff = abs(popularity[i] - popularity[j])
        features.append([emb_sim, tag_sim, diff_sim, pop_diff])
        labels.append(1)

    # Negative pairs
    rand_idx = np.random.choice(N, size=min(5 * len(pos_idx), N), replace=False)
    rand_idx = rand_idx[rand_idx != i]
    for j in rand_idx:
        emb_sim = np.dot(embeddings[i], embeddings[j])
        tag_sim = tag_sims[i, j]
        diff_sim = diff_sims[i, j]
        pop_diff = abs(popularity[i] - popularity[j])
        features.append([emb_sim, tag_sim, diff_sim, pop_diff])
        labels.append(0)

    group_sizes.append(len(pos_idx) + len(rand_idx))

X = np.array(features, dtype=np.float32)
y = np.array(labels, dtype=np.float32)
print(f"Training samples: {len(X)}, Groups: {len(group_sizes)}")

train_data = lgb.Dataset(X, label=y, group=group_sizes)

params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting_type": "gbdt",
    "num_leaves": 63,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "ndcg_eval_at": [10],
    "verbosity": -1
}

model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data],
    valid_names=['train'],
    callbacks=[
        lgb.log_evaluation(50)
    ]
)

print("\nFeature importances:")
for n, imp in zip(["emb_sim", "tag_sim", "diff_sim", "pop_diff"], model.feature_importance()):
    print(f"{n:10s} -> {imp}")

with open(MODEL_SAVE, "wb") as f:
    pickle.dump(model, f)
print("LambdaRank model trained and saved successfully.")


Building tag similarity matrix...


100%|██████████| 3735/3735 [05:30<00:00, 11.31it/s] 


Building difficulty similarity matrix...
Generating grouped feature data...


Building training pairs: 100%|██████████| 2988/2988 [00:01<00:00, 1860.78it/s]


Training samples: 24195, Groups: 1757
[50]	train's ndcg@10: 0.962555
[100]	train's ndcg@10: 0.973675
[150]	train's ndcg@10: 0.98291
[200]	train's ndcg@10: 0.989443
[250]	train's ndcg@10: 0.992758
[300]	train's ndcg@10: 0.995212
[350]	train's ndcg@10: 0.996645
[400]	train's ndcg@10: 0.997513
[450]	train's ndcg@10: 0.998284
[500]	train's ndcg@10: 0.998787

Feature importances:
emb_sim    -> 12136
tag_sim    -> 5354
diff_sim   -> 1508
pop_diff   -> 12002
LambdaRank model trained and saved successfully.
