In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path

In [3]:
# Input location: the CSV file name is resolved against the current working directory.
current_directory = Path().resolve()  # absolute form of the working directory
csv_file_path = current_directory.joinpath("preprocessed_data.csv")

# Tunable constants
# Weights applied to the normalized acceptance / engagement / submission
# components when they are summed into a popularity score.
ACCEPTANCE_WEIGHT = 0.3
ENGAGEMENT_WEIGHT = 0.5
SUBMISSION_WEIGHT = 0.2
# NOTE(review): not referenced in the visible part of this notebook - confirm it is still needed.
SMOOTHING_FACTOR = 100_000

# Text Processing
class TextProcessor:
    """Converts a collection of text documents into TF-IDF features."""

    def __init__(self, text_data):
        """Keep the documents and build the vectorizer.

        Args:
            text_data: Container of documents; it must provide ``fillna``
                (the notebook passes a pandas Series).
        """
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.text_data = text_data

    def preprocess_text_data(self):
        """Replace missing documents with '' and fit/transform the vectorizer.

        The cleaned documents are stored back on ``self.text_data``.

        Returns:
            The result of the vectorizer's ``fit_transform`` on the cleaned
            documents.
        """
        cleaned_documents = self.text_data.fillna('')
        self.text_data = cleaned_documents
        tfidf_matrix = self.vectorizer.fit_transform(cleaned_documents)
        return tfidf_matrix

# Popularity Calculation
class PopularityCalculator:
    """Combines three numeric signals into one weighted popularity score."""

    def __init__(self, acceptance, engagement, submission):
        """Store the three input series.

        Args:
            acceptance: Numeric pandas Series weighted by ``ACCEPTANCE_WEIGHT``.
            engagement: Numeric pandas Series weighted by ``ENGAGEMENT_WEIGHT``.
            submission: Numeric pandas Series weighted by ``SUBMISSION_WEIGHT``.
        """
        self.acceptance = acceptance
        self.engagement = engagement
        self.submission = submission

    @staticmethod
    def normalize(series):
        """Min-max scale ``series`` into the range [0, 1].

        When the range is zero or undefined (constant, empty, or all-missing
        input) there is nothing to scale; dividing would yield NaN for every
        entry, so a series of 0.0 with the same index is returned instead.

        Args:
            series: Numeric pandas Series.

        Returns:
            A pandas Series aligned with ``series``.
        """
        lowest = series.min()
        value_range = series.max() - lowest
        # Guard against 0/0 (constant column) and NaN ranges (empty input).
        if pd.isna(value_range) or value_range == 0:
            return pd.Series(0.0, index=series.index)
        return (series - lowest) / value_range

    def calculate_popularity_score(self):
        """Return the weighted sum of the three normalized signals.

        Missing values in each signal are treated as 0 before normalization.
        The weights are the module-level ``*_WEIGHT`` constants.
        """
        weighted_signals = (
            (self.acceptance, ACCEPTANCE_WEIGHT),
            (self.engagement, ENGAGEMENT_WEIGHT),
            (self.submission, SUBMISSION_WEIGHT),
        )
        acceptance_part, engagement_part, submission_part = (
            self.normalize(signal.fillna(0)) * weight
            for signal, weight in weighted_signals
        )
        return acceptance_part + engagement_part + submission_part

# Recommendation Engine
class ProblemRecommender:
    """Static helpers that pick problems by content similarity or popularity."""

    @staticmethod
    def recommend_similar_problems(df, problem_id, X_processed, n=10):
        """Return up to ``n`` rows of ``df`` most similar to ``problem_id``.

        Args:
            df: DataFrame with an ``id`` column; row ``i`` of ``df`` must
                correspond to row ``i`` of ``X_processed``.
            problem_id: Value looked up in ``df['id']``; the first match is
                used as the query.
            X_processed: Row-indexable feature matrix accepted by
                ``cosine_similarity``.
            n: Maximum number of rows to return.

        Returns:
            Rows of ``df`` ordered from most to least similar, never including
            the query row itself. An empty frame (with ``df``'s columns) is
            returned when the id is unknown or ``n`` is not positive.
        """
        # Work with row *positions*, not index labels: X_processed and
        # df.iloc are positional, so labels are only correct for a default
        # RangeIndex.
        match_positions = (df['id'] == problem_id).to_numpy().nonzero()[0]
        if len(match_positions) == 0:
            print(f" Problem ID {problem_id} not found.")
            return df.iloc[0:0]
        if n <= 0:
            # A slice like [-0:] would select everything, so handle it here.
            return df.iloc[0:0]

        query_pos = int(match_positions[0])
        # A one-row slice keeps the query two-dimensional.
        sim_scores = cosine_similarity(
            X_processed[query_pos:query_pos + 1], X_processed
        ).flatten()

        ranked_positions = sim_scores.argsort()[::-1]
        # Drop the query explicitly; zeroing its score would still let it
        # appear when fewer than n other rows have positive similarity.
        top_positions = ranked_positions[ranked_positions != query_pos][:n]
        return df.iloc[top_positions]

    @staticmethod
    def recommend_top_problems(df, n=10):
        """Return the ``n`` rows of ``df`` with the highest ``popularity_score``."""
        return df.sort_values(by='popularity_score', ascending=False).head(n)

# Main function
def recommender_system(problem_id=1):
    """Recommend problems similar to ``problem_id``, ordered by popularity.

    Reads the CSV at ``csv_file_path``, builds TF-IDF features from each
    problem's title, difficulty and topic tags, selects the most similar
    problems, and sorts that selection by popularity score.

    Args:
        problem_id: Value of the ``id`` column identifying the query problem.

    Returns:
        DataFrame with the columns ``id``, ``title``, ``difficulty``,
        ``topic_tags`` and ``problem_URL``. It is empty (but has these
        columns) when no similar problems are found, e.g. for an unknown id.
    """
    output_columns = ['id', 'title', 'difficulty', 'topic_tags', 'problem_URL']
    df = pd.read_csv(csv_file_path)

    # --- Prepare textual content ---
    # Combine title, difficulty, and tags into a single string.
    # Missing values are blanked *before* casting: astype(str) alone would
    # turn NaN into the literal token "nan" and feed it to the vectorizer.
    df['topic_tags'] = df['topic_tags'].fillna('').astype(str)
    df['combined_text'] = (
        df['title'].fillna('').astype(str) + " " +
        df['difficulty'].fillna('').astype(str) + " " +
        df['topic_tags']
    )

    # --- Text Vectorization ---
    text_processor = TextProcessor(df['combined_text'])
    X_processed = text_processor.preprocess_text_data()

    # --- Popularity Calculation ---
    popularity_calculator = PopularityCalculator(
        df['acceptance'], df['likes'], df['submission']
    )
    df['popularity_score'] = popularity_calculator.calculate_popularity_score()

    # --- Recommendations ---
    content_recommendations = ProblemRecommender.recommend_similar_problems(
        df, problem_id, X_processed
    )
    if content_recommendations.empty:
        # Nothing to rank; sorting a column-less empty frame by
        # 'popularity_score' would raise KeyError.
        return pd.DataFrame(columns=output_columns)

    popularity_recommendations = ProblemRecommender.recommend_top_problems(
        content_recommendations
    )

    return popularity_recommendations[output_columns]


In [4]:
# Demo: fetch recommendations for problem 1 and print them under a heading.
if __name__ == "__main__":
    results = recommender_system(problem_id=1)
    print("Recommended Problems:", results, sep="\n")


Recommended Problems:
        id                                            title difficulty  \
559    560                       560. Subarray Sum Equals K     Medium   
1747  1748                     1748. Sum of Unique Elements       Easy   
929    930                   930. Binary Subarrays With Sum     Medium   
2394  2395              2395. Find Subarrays With Equal Sum       Easy   
2814  2815                   2815. Max Pair Sum in an Array       Easy   
547    548                  548. Split Array with Equal Sum       Hard   
1589  1590                    1590. Make Sum Divisible by P     Medium   
3653  3654  3654. Minimum Sum After Divisible Sum Deletions     Medium   
2614  2615                           2615. Sum of Distances     Medium   
3025  3026                  3026. Maximum Good Subarray Sum     Medium   

                                             topic_tags  \
559               ['Array', 'Hash Table', 'Prefix Sum']   
1747                ['Array', 'Hash Table', '