In [17]:
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Popularity blend ------------------------------------------------------
# Weights applied by PopularityCalculator to the min-max normalised
# acceptance, engagement and submission series (they sum to 1.0).
ACCEPTANCE_WEIGHT, ENGAGEMENT_WEIGHT, SUBMISSION_WEIGHT = 0.3, 0.5, 0.2

# --- Final ranking blend ---------------------------------------------------
# Weights applied by ProblemRecommender when mixing text similarity with the
# popularity score (they sum to 1.0).
SIMILARITY_WEIGHT, POPULARITY_WEIGHT = 0.8, 0.2

# --- Paths -----------------------------------------------------------------
# Absolute form of the directory the kernel is currently running in.
current_directory = Path(".").resolve()

# Pre-processed problem table read by recommender_system().
# NOTE(review): machine-specific absolute path; it only resolves on the
# original author's workstation.
csv_file_path = Path(r"C:/Users/yashw/PycharmProjects/PythonProject4/data/processed/preprocessed_data.csv")

class TextProcessor:
    """Turn a collection of text documents into TF-IDF features.

    Attributes:
        text_data: The documents to vectorize (a pandas Series in this
            notebook). Replaced by its NaN-free version once
            ``preprocess_text_data`` has run.
        vectorizer: ``TfidfVectorizer`` configured to drop English stop words.
    """

    def __init__(self, text_data):
        """Store the raw documents and create an unfitted vectorizer."""
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.text_data = text_data

    def preprocess_text_data(self):
        """Blank out missing documents, then fit and apply the vectorizer.

        Returns:
            Whatever ``self.vectorizer.fit_transform`` produces for the
            cleaned documents (the TF-IDF matrix, one row per document).
        """
        # Missing entries become empty strings so the vectorizer never sees NaN.
        cleaned_docs = self.text_data.fillna('')
        self.text_data = cleaned_docs
        tfidf_matrix = self.vectorizer.fit_transform(cleaned_docs)
        return tfidf_matrix

class PopularityCalculator:
    """Combine three per-problem signals into one popularity score.

    Each signal is min-max scaled and then weighted by the module-level
    ``ACCEPTANCE_WEIGHT``, ``ENGAGEMENT_WEIGHT`` and ``SUBMISSION_WEIGHT``.
    """

    def __init__(self, acceptance, engagement, submission):
        """Keep references to the three input series."""
        self.acceptance, self.engagement, self.submission = (
            acceptance, engagement, submission
        )

    @staticmethod
    def normalize(series):
        """Min-max scale ``series``.

        A series whose values are all equal has zero spread; the divisor is
        then replaced by 1 so every element maps to 0 instead of dividing
        by zero.
        """
        lowest = series.min()
        spread = series.max() - lowest
        if spread == 0:
            spread = 1
        return (series - lowest) / spread

    def calculate_popularity_score(self):
        """Return the weighted sum of the three normalised signals.

        Missing values in each input are treated as 0 before scaling.
        """
        acceptance_part = self.normalize(self.acceptance.fillna(0)) * ACCEPTANCE_WEIGHT
        engagement_part = self.normalize(self.engagement.fillna(0)) * ENGAGEMENT_WEIGHT
        submission_part = self.normalize(self.submission.fillna(0)) * SUBMISSION_WEIGHT
        return acceptance_part + engagement_part + submission_part

class ProblemRecommender:
    """Rank problems by a blend of text similarity and popularity."""

    @staticmethod
    def recommend_similar_problems(df, problem_id, X_processed, n=10):
        """Return the ``n`` best-scoring problems relative to ``problem_id``.

        Args:
            df: Problem table. Must contain ``frontend_id`` and
                ``popularity_score`` columns; row ``i`` of ``df`` must
                correspond to row ``i`` of ``X_processed``.
            problem_id: Value looked up in ``df['frontend_id']``.
            X_processed: Feature matrix (one row per problem) passed to
                ``cosine_similarity``.
            n: Maximum number of rows to return.

        Returns:
            A copy of the top rows with added ``similarity_score`` and
            ``final_score`` columns, ordered by ``final_score`` descending.
            An empty ``DataFrame`` (after printing a message) when
            ``problem_id`` is not present. ``df`` itself is never modified.
        """
        # Work with *positions*, not index labels: X_processed and the
        # similarity vector are positional, so labels are only correct when
        # df happens to carry a default RangeIndex.
        is_query = (df['frontend_id'] == problem_id).to_numpy()
        match_positions = is_query.nonzero()[0]
        if len(match_positions) == 0:
            print(f"Problem ID {problem_id} not found.")
            return pd.DataFrame()
        query_pos = int(match_positions[0])

        # Slice (rather than index) so the query stays a 2-D single-row matrix.
        sim_scores = cosine_similarity(
            X_processed[query_pos:query_pos + 1], X_processed
        ).flatten()

        scored = df.copy()
        scored['similarity_score'] = sim_scores

        # Blend similarity + popularity for final ranking
        scored['final_score'] = (
            SIMILARITY_WEIGHT * scored['similarity_score'] +
            POPULARITY_WEIGHT * scored['popularity_score']
        )

        # Remove the queried problem from the candidates outright. Merely
        # zeroing its similarity is not enough: its popularity share of the
        # final score could still place it among the top n. Every row that
        # carries the queried id is dropped.
        candidates = scored.loc[~is_query]
        return candidates.nlargest(n, 'final_score')

def recommender_system(problem_id=1, n=10, data_path=None):
    """Load the problem table and recommend problems similar to one of them.

    The CSV must provide the columns ``frontend_id``, ``title``,
    ``difficulty``, ``topic_tags``, ``problem_URL``, ``acceptance``,
    ``likes`` and ``submission``.

    Args:
        problem_id: ``frontend_id`` of the problem to find neighbours for.
        n: Maximum number of recommendations to return.
        data_path: Optional CSV location. Defaults to the module-level
            ``csv_file_path`` when omitted.

    Returns:
        DataFrame with the columns ``frontend_id``, ``title``,
        ``difficulty``, ``topic_tags``, ``problem_URL``,
        ``similarity_score``, ``popularity_score`` and ``final_score``,
        sorted by ``final_score`` descending with a fresh 0..k-1 index.
        The frame is empty (but still has these columns) when
        ``problem_id`` is unknown or nothing can be recommended.
    """
    output_columns = [
        'frontend_id', 'title', 'difficulty', 'topic_tags',
        'problem_URL', 'similarity_score', 'popularity_score', 'final_score',
    ]

    df = pd.read_csv(csv_file_path if data_path is None else data_path)

    # Combine title, tags, and difficulty. Missing values are blanked first;
    # otherwise astype(str) would turn them into the literal token "nan",
    # which the vectorizer would treat as a real word shared between rows.
    df['topic_tags'] = df['topic_tags'].fillna('').astype(str)
    df['combined_text'] = (
        df['title'].fillna('').astype(str) + " " +
        df['difficulty'].fillna('').astype(str) + " " +
        df['topic_tags']
    )

    # Vectorize text
    text_processor = TextProcessor(df['combined_text'])
    X_processed = text_processor.preprocess_text_data()

    # Compute popularity score ("likes" is the engagement signal)
    popularity_calculator = PopularityCalculator(
        df['acceptance'], df['likes'], df['submission']
    )
    df['popularity_score'] = popularity_calculator.calculate_popularity_score()

    # Get recommendations
    recommendations = ProblemRecommender.recommend_similar_problems(
        df, problem_id, X_processed, n
    )

    # An unknown problem_id yields a column-less empty frame; selecting the
    # output columns from it would raise KeyError, so return a well-formed
    # empty result instead.
    if recommendations.empty:
        return pd.DataFrame(columns=output_columns)

    return (
        recommendations[output_columns]
        .sort_values(by='final_score', ascending=False)
        .reset_index(drop=True)
    )


In [18]:
# Fetch up to ten recommendations for problem 42 and show them as plain text.
results = recommender_system(
    n=10,
    problem_id=42,
)
print(results)

   frontend_id                                 title difficulty  \
0         3061         Calculate Trapping Rain Water       Hard   
1          407                Trapping Rain Water II       Hard   
2         1063             Number of Valid Subarrays       Hard   
3           85                     Maximal Rectangle       Hard   
4         2355  Maximum Number of Books You Can Take       Hard   
5          321                 Create Maximum Number       Hard   
6         1793      Maximum Score of a Good Subarray       Hard   
7         3205         Maximum Array Hopping Score I     Medium   
8          496                Next Greater Element I       Easy   
9         1504       Count Submatrices With All Ones     Medium   

                                          topic_tags  \
0                                       ['Database']   
1  ['Array', 'Breadth-First Search', 'Heap (Prior...   
2              ['Array', 'Stack', 'Monotonic Stack']   
3  ['Array', 'Dynamic Programming', 'S