In [1]:
# Install necessary libraries
!pip install pandas numpy scikit-learn matplotlib seaborn




In [2]:
import pandas as pd

# Load the datasets from Google Drive
user_interaction = pd.read_csv("/content/drive/MyDrive/dataset/user_interaction.csv")
meta_data = pd.read_csv("/content/drive/MyDrive/dataset/metadata.csv")


In [3]:
# Inspecting the first few rows of both datasets
print(user_interaction.head())
print(meta_data.head())

# Summary statistics
print(user_interaction.describe())
print(meta_data.describe())

# Check for missing values
print(user_interaction.isnull().sum())
print(meta_data.isnull().sum())


            user_id      pratilipi_id  read_percent               updated_at
0  5506791961876448  1377786228262109         100.0  2022-03-22 10:29:57.291
1  5506791971543560  1377786223038206          40.0  2022-03-19 13:49:25.660
2  5506791996468218  1377786227025240         100.0  2022-03-21 17:28:47.288
3  5506791978752866  1377786222398208          65.0  2022-03-21 07:39:25.183
4  5506791978962946  1377786228157051         100.0  2022-03-22 17:32:44.777
          author_id      pratilipi_id category_name  reading_time  \
0 -3418949279741297  1025741862639304   translation             0   
1 -2270332351871840  1377786215601277   translation           171   
2 -2270332352037261  1377786215601962   translation            92   
3 -2270332352521845  1377786215640994   translation             0   
4 -2270332349665658  1377786215931338   translation            47   

            updated_at         published_at  
0  2020-08-19 15:26:13  2016-09-30 10:37:04  
1  2021-01-21 16:27:07  2018-06

In [4]:
# Convert 'updated_at' to datetime
user_interaction['updated_at'] = pd.to_datetime(user_interaction['updated_at'])
meta_data['updated_at'] = pd.to_datetime(meta_data['updated_at'])
meta_data['published_at'] = pd.to_datetime(meta_data['published_at'], errors='coerce')

# Drop rows where 'published_at' is NaN
meta_data = meta_data.dropna(subset=['published_at'])

# Check for null values again after cleaning
print(user_interaction.isnull().sum())
print(meta_data.isnull().sum())


user_id         0
pratilipi_id    0
read_percent    0
updated_at      0
dtype: int64
author_id        0
pratilipi_id     0
category_name    0
reading_time     0
updated_at       0
published_at     0
dtype: int64


In [5]:
# Remove outliers in read_percent (greater than 100)
user_interaction = user_interaction[user_interaction['read_percent'] <= 100]


In [6]:
# Create user features (e.g., total stories read, average read percentage)
user_features = user_interaction.groupby('user_id').agg(
    total_stories_read=('pratilipi_id', 'nunique'),
    avg_read_percentage=('read_percent', 'mean')
).reset_index()

# Merge with user_interaction data to get user-level features
user_interaction = user_interaction.merge(user_features, on='user_id', how='left')


In [7]:
# Extract multiple categories per pratilipi (in case of multiple categories)
meta_data['category_list'] = meta_data['category_name'].str.split(',')

# Merge user interactions with meta data to get pratilipi category and reading time
interaction_with_meta = user_interaction.merge(meta_data[['pratilipi_id', 'category_name', 'reading_time']],
                                               on='pratilipi_id', how='left')


In [8]:
# Sort by updated_at and split the data
user_interaction = user_interaction.sort_values(by='updated_at')

# Train-Test Split (75% train, 25% test)
train_data = user_interaction.iloc[:int(0.75 * len(user_interaction))]
test_data = user_interaction.iloc[int(0.75 * len(user_interaction)):]


In [10]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505174 sha256=9c3502d07aecf07f2147ed4d7f5653e427ed499b87d185e53e753eac2d1aa3c1
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e28991

In [11]:
#Model Building
#Set up and train the collaborative filtering model (SVD).

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Prepare the data for the Surprise library
reader = Reader(rating_scale=(0, 100))
data = Dataset.load_from_df(train_data[['user_id', 'pratilipi_id', 'read_percent']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# Train the SVD model
model = SVD()
model.fit(trainset)

# Evaluate the model on the test data
predictions = model.test(testset)

# Calculate RMSE (Root Mean Squared Error)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")


RMSE: 22.3720
RMSE: 22.372039837988925


In [12]:
#Predictions and Recommendations:
#Generate top 5 pratilipis for each user.


from collections import defaultdict

# Get the top 5 recommendations for each user
top_n = defaultdict(list)

for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))

# Sort and get the top 5 recommendations for each user
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:5]

# Display top 5 recommendations for the first 5 users
for uid, recommended_items in list(top_n.items())[:5]:
    print(f"User {uid}: {recommended_items}")


User 5506791966280495: [(1377786219672726, 100)]
User 5506791968624247: [(1377786223936248, 100), (1377786222606858, 100), (1377786224655297, 100), (1377786228039628, 100), (1377786222825840, 100)]
User 5506791987341146: [(1377786227792278, 100), (1377786227820041, 100), (1377786221889554, 100), (1377786222685592, 100), (1377786222053149, 100)]
User 5506791973485874: [(1377786221237656, 100), (1377786221321675, 100)]
User 5506791983111623: [(1377786220907572, 100), (1377786221024320, 100)]
