In [1]:
import os, pickle, ast, re
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

DATA_PATH  = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\data\processed\preprocessed_data.csv")
CACHE_PATH = Path(r"C:\Users\yashw\PycharmProjects\PythonProject4\models\sbert_embeddings.pkl")
MODEL_NAME = "all-MiniLM-L6-v2"

def clean_title(text):
    text = str(text).lower().strip()
    text = re.sub(r'^\d+\.\s*', '', text)
    text = re.sub(r'[^a-z0-9\s\-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def parse_similar(x):
    try:
        if isinstance(x, str):
            lst = ast.literal_eval(x)
            return [clean_title(i) for i in lst]
        return []
    except Exception:
        return []

def to_tag_list(s):
    if isinstance(s, str) and s.startswith('['):
        try:
            vals = ast.literal_eval(s)
            return [str(v).lower().strip() for v in vals]
        except Exception:
            pass
    return [t.strip().lower() for t in str(s).split(',') if t.strip()]

def minmax(x):
    x = x.fillna(0)
    rng = x.max() - x.min()
    return (x - x.min()) / rng if rng != 0 else x * 0

df = pd.read_csv(DATA_PATH)
df['clean_title'] = df['title'].apply(clean_title)
df['topic_tags']  = df['topic_tags'].fillna('').astype(str)
df['difficulty']  = df['difficulty'].fillna('').astype(str)
df['tag_list']    = df['topic_tags'].apply(to_tag_list)

# combine text for embedding
df['combined_text'] = (
    df['clean_title'] + " | diff: " + df['difficulty'].str.lower() +
    " | tags: " + df['topic_tags'].str.lower()
)

if CACHE_PATH.exists():
    with open(CACHE_PATH, "rb") as f:
        cache = pickle.load(f)
    if cache.get("model_name") == MODEL_NAME and len(cache.get("embeddings", [])) == len(df):
        print("Loaded cached SBERT embeddings.")
        embeddings = cache["embeddings"]
    else:
        model = SentenceTransformer(MODEL_NAME)
        embeddings = model.encode(df['combined_text'].tolist(), batch_size=64, show_progress_bar=True, normalize_embeddings=True)
        with open(CACHE_PATH, "wb") as f:
            pickle.dump({"model_name": MODEL_NAME, "embeddings": embeddings}, f)
else:
    model = SentenceTransformer(MODEL_NAME)
    embeddings = model.encode(df['combined_text'].tolist(), batch_size=64, show_progress_bar=True, normalize_embeddings=True)
    with open(CACHE_PATH, "wb") as f:
        pickle.dump({"model_name": MODEL_NAME, "embeddings": embeddings}, f)

embeddings = np.array(embeddings, dtype=np.float32)
N = len(df)

print("Precomputing tag & difficulty similarities...")

tag_sims = np.zeros((N, N), dtype=np.float32)
for i in tqdm(range(N), desc="Tag Jaccard"):
    tags_i = set(df.iloc[i]['tag_list'])
    if not tags_i: continue
    for j in range(i + 1, N):
        tags_j = set(df.iloc[j]['tag_list'])
        if not tags_j: continue
        inter = len(tags_i & tags_j)
        uni = len(tags_i | tags_j)
        if uni:
            val = inter / uni
            tag_sims[i, j] = tag_sims[j, i] = val

ladder = {"easy": 0, "medium": 1, "hard": 2}
diff_vals = df['difficulty'].str.lower().map(ladder).fillna(1).to_numpy(dtype=np.int8)
diff_sims = np.ones((N, N), dtype=np.float32)
for i in range(N):
    for j in range(i + 1, N):
        d = abs(diff_vals[i] - diff_vals[j])
        s = 1.0 if d == 0 else 0.7 if d == 1 else 0.4
        diff_sims[i, j] = diff_sims[j, i] = s

# popularity
acc = minmax(df['acceptance']) if 'acceptance' in df else 0
likes = minmax(df['likes']) if 'likes' in df else 0
subs = minmax(df['submission']) if 'submission' in df else 0
popularity_score = (0.3 * acc + 0.5 * likes + 0.2 * subs).fillna(0).to_numpy(dtype=np.float32)

def hybrid_scores(i, w_sim=0.70, w_tag=0.15, w_diff=0.10, w_pop=0.05):
    sims = embeddings @ embeddings[i]
    score = (
        w_sim * sims +
        w_tag * tag_sims[i] +
        w_diff * diff_sims[i] +
        w_pop * popularity_score
    )
    mn, mx = score.min(), score.max()
    if mx > mn:
        score = (score - mn) / (mx - mn)
    return score

def rank_by_scores(scores, k=10, exclude=None):
    if exclude is not None: scores[exclude] = -1e9
    idxs = np.argpartition(scores, -k)[-k:]
    idxs = idxs[np.argsort(scores[idxs])][::-1]
    return idxs

def recommend(problem_idx, k=10, weights=(0.70, 0.15, 0.10, 0.05)):
    s = hybrid_scores(problem_idx, *weights)
    idxs = rank_by_scores(s.copy(), k=k, exclude=problem_idx)
    return df.iloc[idxs][['frontend_id','title','difficulty','topic_tags','problem_URL']].reset_index(drop=True)

# Save recommender as pickle for evaluation script
with open("../../models/sbert_recommender.pkl", "wb") as f:
    pickle.dump({
        "df": df,
        "embeddings": embeddings,
        "tag_sims": tag_sims,
        "diff_sims": diff_sims,
        "popularity_score": popularity_score
    }, f)

print("SBERT Recommender built and cached successfully.")


  from .autonotebook import tqdm as notebook_tqdm


Loaded cached SBERT embeddings.
Precomputing tag & difficulty similarities...


Tag Jaccard: 100%|██████████| 3735/3735 [06:30<00:00,  9.56it/s] 


SBERT Recommender built and cached successfully.
