In [22]:
import pandas as _pd
import numpy as _np


from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import SVD



In [23]:
_authorData = _pd.read_csv('author_data.csv')
_blogData = _pd.read_csv('blog_data.csv')
_mediumBlogData = _pd.read_csv('medium_blog_data.csv')

In [24]:
_authorData.head()
_blogData.head()
_mediumBlogData.head()

Unnamed: 0,blog_id,author_id,blog_title,blog_content,blog_link,blog_img,topic,scrape_time
0,1,4,Let’s Dominate The Launchpad Space Again,"Hello, fam! If you’ve been with us since 2021,...",https://medium.com/@seedifyfund/lets-dominate-...,https://miro.medium.com/fit/c/140/140/1*nByLJr...,ai,2023-02-27 07:37:48
1,3,4,Let’s Dominate The Launchpad Space Again,"Hello, fam! If you’ve been with us since 2021,...",https://medium.com/@seedifyfund/lets-dominate-...,https://miro.medium.com/fit/c/140/140/1*nByLJr...,ai,2023-02-27 07:41:47
2,4,7,Using ChatGPT for User Research,Applying AI to 4 common user research activiti...,https://medium.com/ux-planet/using-chatgpt-for...,https://miro.medium.com/fit/c/140/140/1*TZSGnN...,ai,2023-02-27 07:41:47
3,5,8,The Automated Stable-Diffusion Checkpoint Merg...,Checkpoint merging is powerful. The power of c...,https://medium.com/@media_97267/the-automated-...,https://miro.medium.com/fit/c/140/140/1*x3N_Hj...,ai,2023-02-27 07:41:47
4,6,9,The Art of Lazy Creativity: My Experience Co-W...,I was feeling particularly lazy one day and co...,https://medium.com/@digitalshedmedia/the-art-o...,https://miro.medium.com/fit/c/140/140/0*m2DdeT...,ai,2023-02-27 07:41:47


In [25]:
_authorData.shape
_blogData.shape

(200140, 3)

In [26]:
## Removing the unwanted values
_mediumBlogData.drop(["blog_title","blog_content","blog_img","blog_link","scrape_time"], axis=1, inplace=True)
_mediumBlogData.shape

_mediumBlogData.head()

Unnamed: 0,blog_id,author_id,topic
0,1,4,ai
1,3,4,ai
2,4,7,ai
3,5,8,ai
4,6,9,ai


In [27]:
_dataWithAuthor = _pd.merge(_mediumBlogData,_authorData, on='author_id')

print(_dataWithAuthor)


       blog_id  author_id            topic           author_name
0            1          4               ai          Seedify Fund
1            3          4               ai          Seedify Fund
2          242          4             web3          Seedify Fund
3          754          4       blockchain          Seedify Fund
4            4          7               ai           Nick Babich
...        ...        ...              ...                   ...
10462    10485       6864  web-development  Fresh Frontend Links
10463    10486       6865  web-development         Mukesh buwade
10464    10487       6866  web-development            Osei Owusu
10465    10489       6867  web-development        Yasas Sandeepa
10466    10492       6868  web-development     Aphinya Dechalert

[10467 rows x 4 columns]


In [28]:
_fullBlogData = _pd.merge(_dataWithAuthor,_blogData,on="blog_id")
_fullBlogData.shape
_fullBlogData.head()


Unnamed: 0,blog_id,author_id,topic,author_name,userId,ratings
0,1,4,ai,Seedify Fund,624,2.0
1,1,4,ai,Seedify Fund,1256,0.5
2,1,4,ai,Seedify Fund,2095,3.5
3,1,4,ai,Seedify Fund,2103,5.0
4,1,4,ai,Seedify Fund,2286,2.0


In [29]:
_fullBlogData["features"] = _fullBlogData["topic"] + ',' + _fullBlogData["author_name"] + ',' + _fullBlogData["ratings"].astype(str)
_fullBlogData.shape
_fullBlogData.head()

Unnamed: 0,blog_id,author_id,topic,author_name,userId,ratings,features
0,1,4,ai,Seedify Fund,624,2.0,"ai,Seedify Fund,2.0"
1,1,4,ai,Seedify Fund,1256,0.5,"ai,Seedify Fund,0.5"
2,1,4,ai,Seedify Fund,2095,3.5,"ai,Seedify Fund,3.5"
3,1,4,ai,Seedify Fund,2103,5.0,"ai,Seedify Fund,5.0"
4,1,4,ai,Seedify Fund,2286,2.0,"ai,Seedify Fund,2.0"


In [30]:
#Number of ratings per user
ratings_per_user = _fullBlogData.groupby('userId').size().reset_index(name='num_ratings')

print(ratings_per_user)

      userId  num_ratings
0         10           19
1         11           16
2         12           71
3         13           51
4         14          170
...      ...          ...
4996    5006            9
4997    5007           49
4998    5008           31
4999    5009          107
5000    5010           40

[5001 rows x 2 columns]


In [31]:
#Number of ratings blog per user
rating_counts = _fullBlogData.groupby(['userId', 'blog_id']).size().reset_index(name='ratings')

print(rating_counts)

        userId  blog_id  ratings
0           10      137        1
1           10      142        1
2           10      145        1
3           10      152        1
4           10      158        1
...        ...      ...      ...
200123    5010     9735        1
200124    5010     9736        1
200125    5010     9737        1
200126    5010     9741        1
200127    5010     9750        1

[200128 rows x 3 columns]


In [32]:
# Convert ratings to NumPy array
ratings = _fullBlogData['ratings'].values

# Convert userIds to NumPy array
user_ids = _fullBlogData['userId'].values

# Calculate number of ratings per user
unique_users, ratings_count = _np.unique(user_ids, return_counts=True)

# Calculate average number of ratings per user
average_ratings_per_user = _np.mean(ratings_count)




In [33]:
rating_scale = (0.5, 5.0)
reader = Reader(rating_scale=rating_scale)
data = Dataset.load_from_df(_fullBlogData[['userId', 'blog_id', 'ratings']], reader)




In [34]:
#Split data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2)


In [35]:
# Train the algorithm on the training set
algo = SVD()
algo.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2827f5ecb30>

In [36]:
# Example: Recommend items for a specific user (userId = 1)
userId = 12
items_to_predict = _fullBlogData['blog_id'].unique()
predictions = [algo.predict(userId, blog_id) for blog_id in items_to_predict]

In [37]:
#Sort and present recommendations

# Sort predictions by estimated rating in descending order
predictions.sort(key=lambda x: x.est, reverse=True)

# Get top-N recommendations
top_n = 10
top_predictions = predictions[:top_n]

# Display recommendations
for i, prediction in enumerate(top_predictions, 1):
    print(f"Rank {i}: Blog ID {prediction.iid} (Estimated rating: {prediction.est})")



Rank 1: Blog ID 8791 (Estimated rating: 4.73903107414081)
Rank 2: Blog ID 275 (Estimated rating: 4.725343853797849)
Rank 3: Blog ID 3965 (Estimated rating: 4.660631083675385)
Rank 4: Blog ID 6740 (Estimated rating: 4.626595907630862)
Rank 5: Blog ID 8569 (Estimated rating: 4.570424582640152)
Rank 6: Blog ID 5713 (Estimated rating: 4.534783741477136)
Rank 7: Blog ID 8214 (Estimated rating: 4.5206987528346705)
Rank 8: Blog ID 2807 (Estimated rating: 4.478466172141897)
Rank 9: Blog ID 9560 (Estimated rating: 4.453017522800084)
Rank 10: Blog ID 6550 (Estimated rating: 4.4354961518954426)


In [38]:
# recommend based on the topic

selected_topic = 'ai'

# Step 2: Generate predictions for items related to the selected topic
topic_related_items = _fullBlogData[_fullBlogData['topic'] == selected_topic]['blog_id'].unique()

predictions = []
for blog_id in topic_related_items:
    prediction = algo.predict(userId, blog_id)
    predictions.append((blog_id, prediction.est))

# Step 3: Sort predictions based on estimated ratings (est) in descending order
predictions.sort(key=lambda x: x[1], reverse=True)

# Display top recommended items

top_n = 5 

print(f"Top {top_n} Recommendations for User {userId} based on topic '{selected_topic}':")

for i, (blog_id, estimated_rating) in enumerate(predictions[:top_n], 1):
    blog_info = _fullBlogData[_fullBlogData['blog_id'] == blog_id].iloc[0]
    print(f"Rank {i}: Blog ID {blog_id} (Topic: {blog_info['topic']}, Author: {blog_info['author_name']}, Estimated Rating: {estimated_rating})")


Top 5 Recommendations for User 12 based on topic 'ai':
Rank 1: Blog ID 3145 (Topic: ai, Author: Christie C., Estimated Rating: 4.189758736029309)
Rank 2: Blog ID 1378 (Topic: ai, Author: steven arellano, Estimated Rating: 4.150173328135927)
Rank 3: Blog ID 18 (Topic: ai, Author: Matt Ryan Allen, Estimated Rating: 4.117055379996544)
Rank 4: Blog ID 3160 (Topic: ai, Author: Swapna M, Estimated Rating: 4.036207681033946)
Rank 5: Blog ID 3221 (Topic: ai, Author: Melissa Lim, Estimated Rating: 4.025205024964296)


In [39]:
# Example: Predicting user rating for a specific blog

userId3 = 3 
blog_id = 123  

prediction = algo.predict(userId3, blog_id)

# Display prediction result
print(f"Predicted Rating for User {userId3} on Blog {blog_id}: {prediction.est}")

Predicted Rating for User 3 on Blog 123: 2.920325435150373


In [40]:
# Example: Showing unrated blogs predicted to be rated by a user


# Get list of all blog IDs
all_blog_ids = _fullBlogData['blog_id'].unique()

# Get list of blog IDs rated by the user
rated_blog_ids = _fullBlogData[_fullBlogData['userId'] == userId]['blog_id'].unique()

# Filter out unrated blog IDs
unrated_blog_ids = list(set(all_blog_ids) - set(rated_blog_ids))

# Generate predictions for unrated blogs
predictions = []
for blog_id in unrated_blog_ids:
    prediction = algo.predict(userId, blog_id)
    predictions.append((blog_id, prediction.est))


predictions.sort(key=lambda x: x[1], reverse=True)

# Display top unrated blogs predicted to be rated by the user
top_n = 5  
print(f"Top {top_n} Unrated Blogs Predicted to be Rated by User {userId}:")
for i, (blog_id, estimated_rating) in enumerate(predictions[:top_n], 1):
    blog_info = _fullBlogData[_fullBlogData['blog_id'] == blog_id].iloc[0] 
    print(f"Rank {i}: Blog ID {blog_id} (Topic: {blog_info['topic']}, Author: {blog_info['author_name']}, Estimated Rating: {estimated_rating})")


Top 5 Unrated Blogs Predicted to be Rated by User 12:
Rank 1: Blog ID 275 (Topic: cybersecurity, Author: Ama Victor, Estimated Rating: 4.725343853797849)
Rank 2: Blog ID 3965 (Topic: information-security, Author: Trishank Karthik Kuppusamy, Estimated Rating: 4.660631083675385)
Rank 3: Blog ID 6740 (Topic: data-analysis, Author: Juan De Dios Santos, Estimated Rating: 4.626595907630862)
Rank 4: Blog ID 8569 (Topic: app-development, Author: Prakriti Jain, Estimated Rating: 4.570424582640152)
Rank 5: Blog ID 5713 (Topic: deep-learning, Author: Vinicius Queiroz, Estimated Rating: 4.534783741477136)


In [41]:
from sklearn.metrics.pairwise import cosine_similarity

# Example features (replace with actual features extracted from your data)
blog_features = _fullBlogData[['blog_id', 'topic', 'author_name']]

# Assume userId is defined as in the previous example
# userId = 1

# Get blog IDs rated by the user
rated_blog_ids = _fullBlogData[_fullBlogData['userId'] == userId]['blog_id'].unique()

# Generate predictions for rated blogs using collaborative filtering
predictions_collab = []
for blog_id in rated_blog_ids:
    prediction = algo.predict(userId, blog_id)
    predictions_collab.append((blog_id, prediction.est))

# Sort collaborative filtering predictions by estimated rating (highest to lowest)
predictions_collab.sort(key=lambda x: x[1], reverse=True)

# Content-Based Filtering: Calculate similarity based on 'topic' and 'author_name'
def calculate_similarity(blog1, blog2):
    features1 = blog1[['topic', 'author_name']].values.reshape(1, -1)
    features2 = blog2[['topic', 'author_name']].values.reshape(1, -1)
    return cosine_similarity(features1, features2)[0][0]

# Generate similarities for all blogs with respect to rated blogs
similarities = []
for blog_id in blog_features['blog_id'].unique():
    if blog_id not in rated_blog_ids:
        blog_info = blog_features[blog_features['blog_id'] == blog_id].iloc[0]
        similarity_sum = 0
        count = 0
        for rated_blog_id in rated_blog_ids:
            rated_blog_info = blog_features[blog_features['blog_id'] == rated_blog_id].iloc[0]
            similarity_sum += calculate_similarity(blog_info, rated_blog_info)
            count += 1
        if count > 0:
            average_similarity = similarity_sum / count
            similarities.append((blog_id, average_similarity))

# Sort blogs by content-based similarity (highest to lowest)
similarities.sort(key=lambda x: x[1], reverse=True)

# Display top recommended blogs combining both approaches
top_n = 5
print(f"Top {top_n} Combined Recommendations Based on Ratings and Content Similarity for User {userId}:")
for i, (blog_id, similarity) in enumerate(similarities[:top_n], 1):
    blog_info = _fullBlogData[_fullBlogData['blog_id'] == blog_id].iloc[0]
    print(f"Rank {i}: Blog ID {blog_id} (Topic: {blog_info['topic']}, Author: {blog_info['author_name']}, Similarity: {similarity})")


ValueError: could not convert string to float: 'ai'