In [3]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

DATA_PATH = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\data\processed\preprocessed_data.csv")
EMB_PATH  = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\models\sbert_recommender.pkl")
MODEL_PATH = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\models\lambdarank_model.pkl")

with open(EMB_PATH, "rb") as f:
    cache = pickle.load(f)
embeddings = np.array(cache["embeddings"], dtype=np.float32)

with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

df = pd.read_csv(DATA_PATH)

# Basic cleaning for consistency
import ast, re
def clean_title(t):
    t = str(t).lower().strip()
    t = re.sub(r'^\d+\.\s*', '', t)
    t = re.sub(r'[^a-z0-9\s\-]', '', t)
    t = re.sub(r'\s+', ' ', t)
    return t

def parse_similar(x):
    try:
        if isinstance(x, str):
            return [clean_title(i) for i in ast.literal_eval(x)]
    except Exception:
        pass
    return []

df['clean_title'] = df['title'].apply(clean_title)
df['ground_truth'] = df['similar_questions'].apply(parse_similar)
df['difficulty'] = df['difficulty'].fillna('Medium')
df['topic_tags'] = df['topic_tags'].fillna('').astype(str)

# Difficulty & popularity setup
ladder = {"easy":0,"medium":1,"hard":2}
diff_vals = df['difficulty'].str.lower().map(ladder).fillna(1).to_numpy(dtype=np.int8)
def minmax(x):
    x = x.fillna(0)
    rng = x.max() - x.min()
    return (x - x.min()) / rng if rng != 0 else x * 0

acc = minmax(df['acceptance'])
likes = minmax(df['likes'])
subs = minmax(df['submission'])
popularity = (0.3*acc + 0.5*likes + 0.2*subs).fillna(0).to_numpy(dtype=np.float32)

# Helper: tag list + Jaccard
import ast
def to_tag_list(s):
    if isinstance(s, str) and s.startswith('['):
        try:
            vals = ast.literal_eval(s)
            return [v.lower().strip() for v in vals]
        except Exception:
            pass
    return [t.strip().lower() for t in str(s).split(',') if t.strip()]

df['tag_list'] = df['topic_tags'].apply(to_tag_list)
tag_sets = [set(t) for t in df['tag_list']]

def tag_sim(i, j):
    a, b = tag_sets[i], tag_sets[j]
    if not a or not b: return 0.0
    return len(a & b) / len(a | b)

def rec_lambdarank(i, k=10):
    sims = embeddings @ embeddings[i]
    diff_s = np.array([1.0 if abs(diff_vals[i]-diff_vals[j])==0 else 0.7 if abs(diff_vals[i]-diff_vals[j])==1 else 0.4 for j in range(len(df))])
    pop_diffs = np.abs(popularity[i] - popularity)
    tag_s = np.array([tag_sim(i, j) for j in range(len(df))])
    feats = np.stack([sims, tag_s, diff_s, pop_diffs], axis=1)
    preds = model.predict(feats)
    preds[i] = -1e9
    top_idx = np.argsort(preds)[-k:][::-1]
    return [df.iloc[j]['clean_title'] for j in top_idx]

def precision_recall_ndcg(k=10, limit=300):
    P, R, N = [], [], []
    for i in tqdm(range(min(limit, len(df)))):
        gt = df.loc[i, 'ground_truth']
        if not gt:
            continue
        preds = rec_lambdarank(i, k)
        hits = len(set(preds) & set(gt))
        p = hits / k
        r = hits / len(gt)
        rel = [1 if x in gt else 0 for x in preds[:k]]
        dcg = sum(r_ / np.log2(idx+2) for idx, r_ in enumerate(rel))
        idcg = sum(sorted(rel, reverse=True)[i]/np.log2(i+2) for i in range(len(rel)))
        ndcg = dcg/idcg if idcg>0 else 0
        P.append(p); R.append(r); N.append(ndcg)
    return np.mean(P), np.mean(R), np.mean(N)

print("\nEvaluating LambdaRank on 300 random problems...")
P, R, N = precision_recall_ndcg(k=10, limit=300)
print(f"\nLambdaRank → Precision@10={P:.4f}, Recall@10={R:.4f}, NDCG@10={N:.4f}")



Evaluating LambdaRank on 300 random problems...


100%|██████████| 300/300 [00:08<00:00, 34.30it/s]


LambdaRank → Precision@10=0.1467, Recall@10=0.4581, NDCG@10=0.5632



