In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m112.6/154.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
# Raw inputs, both read from the working directory:
#   metadata.csv          -> one frame of per-story attributes
#   user_interaction.csv  -> one frame of user/story reading events
metadata = pd.read_csv("metadata.csv")
interactions = pd.read_csv("user_interaction.csv")

In [3]:
# Display the column labels of the interaction frame.
interactions.columns

Index(['user_id', 'pratilipi_id', 'read_percent', 'updated_at'], dtype='object')

In [4]:
# Display the column labels of the metadata frame.
metadata.columns

Index(['author_id', 'pratilipi_id', 'category_name', 'reading_time',
       'updated_at', 'published_at'],
      dtype='object')

In [5]:
# Attach story metadata to every interaction row (inner join on the story id).
# Both inputs carry an `updated_at` column, so pandas keeps them apart with its
# default suffixes: `updated_at_x` (interactions) and `updated_at_y` (metadata).
merged_df = pd.merge(interactions, metadata, on='pratilipi_id', how='inner')

In [6]:
# Number of rows produced by the merge.
len(merged_df)

3974678

In [7]:
# Keep only the first 30,000 merged rows (presumably to keep the experiment small).
# `.copy()` detaches the sample from the full frame so that later column
# assignments on it do not raise SettingWithCopyWarning.
merged_df = merged_df.iloc[:30000].copy()

In [8]:
# Encode raw ids as dense integers 0..k-1, numbered in order of first appearance.
# The dicts map original id -> encoded id and are reused by the recommenders below.
# Caution: run this cell once per freshly built merged_df; re-running it on the
# already-encoded frame would rebuild the mappings from the codes and lose the
# original ids.
user_mapping = {user: code for code, user in enumerate(merged_df['user_id'].unique())}
pratilipi_mapping = {story: code for code, story in enumerate(merged_df['pratilipi_id'].unique())}

# `.assign` builds a new frame instead of writing into a slice of the merged frame,
# which avoids pandas' chained-assignment (SettingWithCopy) warning.
merged_df = merged_df.assign(
    user_id=merged_df['user_id'].map(user_mapping),
    pratilipi_id=merged_df['pratilipi_id'].map(pratilipi_mapping),
)

In [9]:
# Use read_percent as the "rating", declared to Surprise as lying in [0, 100].
reader = Reader(rating_scale=(0, 100))
data = Dataset.load_from_df(merged_df[['user_id', 'pratilipi_id', 'read_percent']], reader)

# Hold out 25% of the rows for testing. The fixed seed makes the split identical
# on every Restart & Run All.
# NOTE(review): if the metadata merge produced several rows per (user, story) pair,
# the same pair can land in both train and test — confirm and de-duplicate if so.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [10]:
# Matrix-factorisation model trained on the training split. The factors are
# initialised randomly, so seed the model to make predictions reproducible.
svd = SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c503413c1d0>

In [11]:
def recommend_pratilipis(user_id, n=5):
    """Rank every story in ``merged_df`` for one user with the fitted SVD model.

    Parameters
    ----------
    user_id
        User id as stored in ``merged_df['user_id']`` (the encoded id).
    n : int, default 5
        Number of top-scoring stories to return.

    Returns
    -------
    list of int
        Story ids (as stored in ``merged_df['pratilipi_id']``) ordered by
        predicted score, highest first. Stories the user has already read are
        not filtered out.
    """
    scored = []
    for candidate in merged_df['pratilipi_id'].unique():
        scored.append(svd.predict(user_id, candidate))

    # Stable descending sort: ties keep the catalogue's order of appearance.
    scored.sort(key=lambda prediction: prediction.est, reverse=True)
    return [int(prediction.iid) for prediction in scored[:n]]

In [None]:
# Content features: one row per story of the sampled catalogue, ordered exactly
# like pratilipi_mapping, so that row/column i of similarity_matrix describes the
# story whose encoded id is i. (Building the matrix from every metadata row would
# both misalign it with the encoded ids and produce a dense N x N matrix over all
# metadata rows.)
catalogue_ids = list(pratilipi_mapping)  # original ids; list position == encoded id
catalogue_meta = metadata[metadata['pratilipi_id'].isin(catalogue_ids)]

pratilipi_features = (
    pd.get_dummies(catalogue_meta[['category_name']])
    .groupby(catalogue_meta['pratilipi_id'])
    .max()                      # a story listed under several categories keeps all of them
    .astype(float)
    .reindex(catalogue_ids, fill_value=0.0)  # mapping order; all-zero row if no metadata
)
pratilipi_matrix = csr_matrix(pratilipi_features.values)
similarity_matrix = cosine_similarity(pratilipi_matrix)

In [None]:
def recommend_similar(pratilipi_id, n=5):
    """Return up to ``n`` stories most similar to ``pratilipi_id``.

    Row ``i`` of ``similarity_matrix`` is assumed to describe the story whose
    encoded id (value in ``pratilipi_mapping``) is ``i``.

    Parameters
    ----------
    pratilipi_id
        Original story id, i.e. a key of ``pratilipi_mapping``.
    n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Original story ids, most similar first, never containing
        ``pratilipi_id`` itself. Empty when the id is unknown or ``n <= 0``.
    """
    if n <= 0 or pratilipi_id not in pratilipi_mapping:
        return []  # unknown story or nothing requested

    idx = pratilipi_mapping[pratilipi_id]
    index_to_id = list(pratilipi_mapping)  # built once; position == encoded id

    scores = np.asarray(similarity_matrix[idx]).ravel()
    # Descending order; the stable sort breaks ties by ascending encoded id.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query story explicitly: with identical feature vectors several
    # stories tie at similarity 1.0, so it is not guaranteed to sort first.
    neighbours = [candidate for candidate in ranked if candidate != idx]
    return [index_to_id[candidate] for candidate in neighbours[:n]]

In [None]:
def hybrid_recommendation(user_id, n=5):
    """Blend collaborative-filtering and content-based recommendations.

    The top ``ceil(n / 2)`` stories from ``recommend_pratilipis`` are taken first;
    each of them then contributes its single nearest neighbour from
    ``recommend_similar``.

    Parameters
    ----------
    user_id
        Encoded user id, as expected by ``recommend_pratilipis``.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of int
        Encoded story ids (values of ``pratilipi_mapping``) without duplicates:
        collaborative picks first, then content-based picks, at most ``n``.
    """
    if n <= 0:
        return []

    # Round up so that an odd n can still be filled (n // 2 capped n=5 at 4 items).
    collab_recs = recommend_pratilipis(user_id, (n + 1) // 2)

    # recommend_pratilipis works with encoded ids while recommend_similar takes and
    # returns original ids, so translate in both directions.
    index_to_id = list(pratilipi_mapping)  # position == encoded id
    content_recs = []
    for pid in collab_recs:
        for similar_id in recommend_similar(index_to_id[pid], 1):
            content_recs.append(pratilipi_mapping[similar_id])

    # Order-preserving de-duplication: truncating a set would drop arbitrary items.
    return list(dict.fromkeys(collab_recs + content_recs))[:n]

In [None]:
# Inverse lookup: encoded story id -> original story id.
map_to_pratilipi = dict(zip(pratilipi_mapping.values(), pratilipi_mapping.keys()))

In [None]:
# print(hybrid_recommendation(3))

In [None]:
# Recommend for the user whose encoded id is 3 and translate the encoded story ids
# back to the original ids before printing.
result = [map_to_pratilipi[encoded] for encoded in hybrid_recommendation(3)]
print(result)