# Post Recommendation System

Post Recommendation System can be implemented by two different ways.
1. Content Based Filtering
2. Collaborative Filtering

Here, I have implemented both of them one by one.

# A. Content Based Filtering Method

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
users=pd.read_csv('Mldata/users.csv')
posts=pd.read_csv('Mldata/posts.csv')
views=pd.read_csv('Mldata/views.csv')

## User Information

In [3]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 4 columns):
_id          118 non-null object
name         118 non-null object
gender       118 non-null object
academics    118 non-null object
dtypes: object(4)
memory usage: 3.8+ KB


In [4]:
users.head()

Unnamed: 0,_id,name,gender,academics
0,5d60098a653a331687083238,Nivesh Singh Chauhan,male,undergraduate
1,5d610ae1653a331687083239,Gaurav Sharma,male,graduate
2,5d618359fc5fcf3bdd9a0910,Akshay Mishra,male,undergraduate
3,5d6d2bb87fa40e1417a49315,Saksham Mathur,male,undergraduate
4,5d7c994d5720533e15c3b1e9,Varun Chowhan,male,undergraduate


## Post Information

In [5]:
posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 4 columns):
_id           493 non-null object
title         493 non-null object
category      465 non-null object
 post_type    493 non-null object
dtypes: object(4)
memory usage: 15.5+ KB


In [6]:
posts.head()

Unnamed: 0,_id,title,category,post_type
0,5d62abaa65218653a132c956,hello there,Plant Biotechnology,blog
1,5d6d39567fa40e1417a4931c,Ml and AI,Artificial Intelligence|Machine Learning|Infor...,blog
2,5d7d23315720533e15c3b1ee,What is an Operating System ?,Operating Systems,blog
3,5d7d405e5720533e15c3b1f3,Lord Shiva,Drawings,artwork
4,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog


## View Information

In [7]:
views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 3 columns):
user_id      1449 non-null object
post_id      1449 non-null object
timestamp    1449 non-null object
dtypes: object(3)
memory usage: 34.0+ KB


In [8]:
views.head()

Unnamed: 0,user_id,post_id,timestamp
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z
1,5ed3748576027d35905ccaab,5ed4cbadbd514d602c1531a6,2020-06-01T09:39:20.021Z
2,5ed0defa76027d35905cc2de,5eac305f10426255a7aa9dd3,2020-06-01T08:12:42.682Z
3,5ed0defa76027d35905cc2de,5ed1ff0276027d35905cc60d,2020-06-01T08:10:23.880Z
4,5ed0defa76027d35905cc2de,5ed3820f76027d35905ccac8,2020-06-01T08:08:54.124Z


## Model Building

In [9]:
users.drop(['name'], axis=1, inplace=True)
posts.drop(['title'], axis=1, inplace=True)
views.drop(['timestamp'], axis=1, inplace=True)

In [10]:
users_array=users.values
users_id_list=list(users['_id'])

In [11]:
posts['category'].fillna('', inplace=True)
posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 3 columns):
_id           493 non-null object
category      493 non-null object
 post_type    493 non-null object
dtypes: object(3)
memory usage: 11.6+ KB


In [12]:
posts_array=posts.values
posts_id_list=list(posts['_id'])

In [13]:
views_array=views.values

In [14]:
# category post_type academics gender
expanding_table=[]
clarifying_views_array=[]
for user_id, post_id in views_array:
    try:
        index_of_user=users_id_list.index(user_id)
        index_of_post=posts_id_list.index(post_id)
        clarifying_views_array.append([user_id, post_id])
        #Category
        categories=posts_array[index_of_post][1]
        all_categories=categories.split('|')
        expanding_table.append([all_categories, posts_array[index_of_post][2], users_array[index_of_user][2], users_array[index_of_user][1]])
    except:
        pass

In [15]:
expanding_table_df=pd.DataFrame(expanding_table, columns=['category', 'post_type', 'academics', 'gender'])
clarifying_views_df=pd.DataFrame(clarifying_views_array, columns=['user_id', 'post_id'])
index=pd.DataFrame(list(range(0, len(clarifying_views_df))), columns=['index'])
final_df=pd.concat([index, clarifying_views_df, expanding_table_df], axis=1)
final_df.head()

Unnamed: 0,index,user_id,post_id,category,post_type,academics,gender
0,0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,"[Visual Arts, Graphic Design, Artistic design,...",artwork,undergraduate,female
1,1,5ed3748576027d35905ccaab,5ed4cbadbd514d602c1531a6,"[Computer Technology, Computer Application, In...",blog,undergraduate,male
2,2,5ed0defa76027d35905cc2de,5eac305f10426255a7aa9dd3,[Photography],artwork,undergraduate,male
3,3,5ed0defa76027d35905cc2de,5ed1ff0276027d35905cc60d,"[Computer Technology, Cloud Computing]",blog,undergraduate,male
4,4,5ed0defa76027d35905cc2de,5ed3820f76027d35905ccac8,"[Computer Technology, Computer Application, In...",blog,undergraduate,male


In [16]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1447 entries, 0 to 1446
Data columns (total 7 columns):
index        1447 non-null int64
user_id      1447 non-null object
post_id      1447 non-null object
category     1447 non-null object
post_type    1447 non-null object
academics    1447 non-null object
gender       1447 non-null object
dtypes: int64(1), object(6)
memory usage: 79.2+ KB


## Features that are going to checked in this Model

In [17]:
features = ['user_id','post_id','category','post_type', 'academics', 'gender']

In [18]:
def combine_features(row):
    list_of_categories=row['category']
    if len(list_of_categories)==0:
        return row['user_id']+" "+row['post_id']+" "+row['post_type']+" "+row['academics']+" "+row['gender']
    elif len(list_of_categories)==1:
        return row['user_id']+" "+row['post_id']+" "+list_of_categories[0]+" "+row['post_type']+" "+row['academics']+" "+row['gender']
    else:
        space_separated_categories=" ".join(list_of_categories)
        return row['user_id']+" "+row['post_id']+" "+space_separated_categories+" "+row['post_type']+" "+row['academics']+" "+row['gender']

In [19]:
final_df["combined_features"] = final_df.apply(combine_features,axis=1)
final_df['combined_features'].head()

0    5df49b32cc709107827fb3c7 5ec821ddec493f4a26558...
1    5ed3748576027d35905ccaab 5ed4cbadbd514d602c153...
2    5ed0defa76027d35905cc2de 5eac305f10426255a7aa9...
3    5ed0defa76027d35905cc2de 5ed1ff0276027d35905cc...
4    5ed0defa76027d35905cc2de 5ed3820f76027d35905cc...
Name: combined_features, dtype: object

### Frequency Count

In [20]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(final_df["combined_features"])

### Cosine Similarity

In [21]:
cosine_sim = cosine_similarity(count_matrix)
pd.DataFrame(cosine_sim).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1437,1438,1439,1440,1441,1442,1443,1444,1445,1446
0,1.0,0.062622,0.19803,0.080845,0.062622,0.153393,0.09167,0.129641,0.16169,0.19803,...,0.19803,0.171499,0.366679,0.19803,0.19803,0.19803,0.19803,0.19803,0.076696,0.514496
1,0.062622,1.0,0.210819,0.602464,0.866667,0.163299,0.39036,0.621059,0.516398,0.316228,...,0.210819,0.182574,0.19518,0.210819,0.210819,0.210819,0.210819,0.210819,0.244949,0.182574
2,0.19803,0.210819,1.0,0.408248,0.316228,0.516398,0.46291,0.109109,0.136083,0.166667,...,0.666667,0.433013,0.46291,0.5,0.5,0.5,0.5,0.5,0.258199,0.433013
3,0.080845,0.602464,0.408248,1.0,0.68853,0.316228,0.503953,0.62361,0.444444,0.272166,...,0.272166,0.235702,0.251976,0.272166,0.272166,0.272166,0.272166,0.272166,0.316228,0.235702
4,0.062622,0.866667,0.316228,0.68853,1.0,0.244949,0.48795,0.621059,0.516398,0.316228,...,0.210819,0.182574,0.19518,0.210819,0.210819,0.210819,0.210819,0.210819,0.244949,0.182574


## Testing The Model With Sample Input

In [23]:
post_id_of_just_read_post='5ec821ddec493f4a2655889e'
number_of_top_recommended_posts=10

In [24]:
def get_post_id_from_index(index):
    return final_df[final_df.index == index]["post_id"].values[0]

def get_index_from_post_id(post_id):
    return final_df[final_df.post_id == post_id]["index"].values[0]

In [25]:
post_index = get_index_from_post_id(post_id_of_just_read_post)
similar_posts = list(enumerate(cosine_sim[post_index]))
sorted_similar_posts = sorted(similar_posts,key=lambda x:x[1],reverse=True)[1:]

i=0
print("Top {} similar post ids to ".format(number_of_top_recommended_posts)+post_id_of_just_read_post+" are:\n")
for element in sorted_similar_posts:
    recommended_post_id=get_post_id_from_index(element[0])
    if recommended_post_id!=post_id_of_just_read_post:
        print(get_post_id_from_index(element[0]))
        i=i+1
        if i>number_of_top_recommended_posts:
            break

Top 10 similar post ids to 5ec821ddec493f4a2655889e are:

5e2d516fc85ab714a7da66dd
5ecb72c0eaff6b0c3a58a48e
5e2d516fc85ab714a7da66dd
5dbc622a99cbb90e4339c7f6
5ec8204cec493f4a26558893
5ecb72c0eaff6b0c3a58a48e
5e2d4d63c85ab714a7da66db
5e2d4d63c85ab714a7da66db
5e2d516fc85ab714a7da66dd
5ecb72c0eaff6b0c3a58a48e
5ecb72c0eaff6b0c3a58a48e


# B. Collaborative Filtering Method

In [26]:
import pandas as pd
import seaborn as sns
from scipy import sparse

In [27]:
users=pd.read_csv('Mldata/users.csv')
posts=pd.read_csv('Mldata/posts.csv')
views=pd.read_csv('Mldata/views.csv')

## Model Building

In [28]:
total_users=users['_id']
total_users_list=list(total_users)

total_posts=posts['_id']
total_posts_list=list(total_posts)

### Creating Pivot Table

In [29]:
final_table=[]
for i in range(len(total_users)):
    tempArr=[0]*len(total_posts)
    final_table.append(tempArr)

views.drop(['timestamp'], axis=1, inplace=True)
views_array=views.values

In [30]:
for user_id, post_id in views_array:
    try:
        index_of_user=total_users_list.index(user_id)
        index_of_post=total_posts_list.index(post_id)
        final_table[index_of_user][index_of_post]=5
    except:
        #'5eb3ba7f10426255a7aaa09e', '5e8bfacca3258347b42f261b' this two post_ids are in the views.csv dataset 
        # but not in posts.csv's '_id' dataset!!! so I have skip those two rows of views.csv dataset
        pass

In [31]:
final_df=pd.DataFrame(final_table, index=total_users_list, columns=total_posts_list)
final_df.head()

Unnamed: 0,5d62abaa65218653a132c956,5d6d39567fa40e1417a4931c,5d7d23315720533e15c3b1ee,5d7d405e5720533e15c3b1f3,5d80dfbc6c53455f896e600e,5d80e7c16c53455f896e6014,5d80ecfd6c53455f896e601a,5d81323a6c53455f896e6044,5d9b3514979d5962253c2f90,5d9b950768671220a1b2b153,...,5ed23cf876027d35905cc790,5ed23d4276027d35905cc798,5ed23e4d76027d35905cc7b8,5ed2502b76027d35905cc7db,5ed3476576027d35905cca1d,5ed3791976027d35905ccab6,5ed3820f76027d35905ccac8,5ed3ef4cbd514d602c1530f2,5ed415c6bd514d602c15312d,5ed4cbadbd514d602c1531a6
5d60098a653a331687083238,5,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0
5d610ae1653a331687083239,0,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5d618359fc5fcf3bdd9a0910,0,0,0,0,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5d6d2bb87fa40e1417a49315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5d7c994d5720533e15c3b1e9,0,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118 entries, 5d60098a653a331687083238 to 5ed3e898bd514d602c1530de
Columns: 493 entries, 5d62abaa65218653a132c956 to 5ed4cbadbd514d602c1531a6
dtypes: int64(493)
memory usage: 455.4+ KB


### Pearson Correlation

In [33]:
corrMatrix = final_df.corr(method='pearson')
corrMatrix.head()

Unnamed: 0,5d62abaa65218653a132c956,5d6d39567fa40e1417a4931c,5d7d23315720533e15c3b1ee,5d7d405e5720533e15c3b1f3,5d80dfbc6c53455f896e600e,5d80e7c16c53455f896e6014,5d80ecfd6c53455f896e601a,5d81323a6c53455f896e6044,5d9b3514979d5962253c2f90,5d9b950768671220a1b2b153,...,5ed23cf876027d35905cc790,5ed23d4276027d35905cc798,5ed23e4d76027d35905cc7b8,5ed2502b76027d35905cc7db,5ed3476576027d35905cca1d,5ed3791976027d35905ccab6,5ed3820f76027d35905ccac8,5ed3ef4cbd514d602c1530f2,5ed415c6bd514d602c15312d,5ed4cbadbd514d602c1531a6
5d62abaa65218653a132c956,1.0,1.0,-0.012139,-0.014932,-0.012139,-0.012139,-0.008547,-0.008547,-0.008547,-0.008547,...,-0.008547,-0.008547,-0.008547,-0.017317,-0.012139,-0.012139,0.493548,-0.008547,-0.012139,-0.008547
5d6d39567fa40e1417a4931c,1.0,1.0,-0.012139,-0.014932,-0.012139,-0.012139,-0.008547,-0.008547,-0.008547,-0.008547,...,-0.008547,-0.008547,-0.008547,-0.017317,-0.012139,-0.012139,0.493548,-0.008547,-0.012139,-0.008547
5d7d23315720533e15c3b1ee,-0.012139,-0.012139,1.0,0.395881,-0.017241,-0.017241,-0.012139,-0.012139,-0.012139,-0.012139,...,-0.012139,-0.012139,-0.012139,-0.024596,-0.017241,-0.017241,-0.024596,-0.012139,-0.017241,-0.012139
5d7d405e5720533e15c3b1f3,-0.014932,-0.014932,0.395881,1.0,-0.021208,-0.021208,-0.014932,-0.014932,-0.014932,-0.014932,...,-0.014932,-0.014932,-0.014932,-0.030254,-0.021208,-0.021208,-0.030254,-0.014932,-0.021208,-0.014932
5d80dfbc6c53455f896e600e,-0.012139,-0.012139,-0.017241,-0.021208,1.0,0.491379,-0.012139,-0.012139,-0.012139,-0.012139,...,0.704078,-0.012139,0.704078,0.338194,0.491379,0.491379,0.338194,0.704078,0.491379,-0.012139


## Testing The Model With Sample Input

In [37]:
test_user = ["5d6d39567fa40e1417a4931c","5ed2502b76027d35905cc7db","5ed415c6bd514d602c15312d"]
number_of_total_recommended_posts=10

highest_rating=5
def get_similar(post_id):
    rating=5
    similar_score = corrMatrix[post_id]*(rating-highest_rating/2)
    similar_score = similar_score.sort_values(ascending=False)
    return similar_score

similar_scores = pd.DataFrame()
for already_read_post in test_user:
    similar_scores = similar_scores.append(get_similar(already_read_post),ignore_index = True)

similar_scores.head(len(test_user))

Unnamed: 0,5d62abaa65218653a132c956,5d6d39567fa40e1417a4931c,5d7d23315720533e15c3b1ee,5d7d405e5720533e15c3b1f3,5d80dfbc6c53455f896e600e,5d80e7c16c53455f896e6014,5d80ecfd6c53455f896e601a,5d81323a6c53455f896e6044,5d9b3514979d5962253c2f90,5d9b950768671220a1b2b153,...,5ed23cf876027d35905cc790,5ed23d4276027d35905cc798,5ed23e4d76027d35905cc7b8,5ed2502b76027d35905cc7db,5ed3476576027d35905cca1d,5ed3791976027d35905ccab6,5ed3820f76027d35905ccac8,5ed3ef4cbd514d602c1530f2,5ed415c6bd514d602c15312d,5ed4cbadbd514d602c1531a6
0,2.5,2.5,-0.030348,-0.03733,-0.030348,-0.030348,-0.021368,-0.021368,-0.021368,-0.021368,...,-0.021368,-0.021368,-0.021368,-0.043294,-0.030348,-0.030348,1.23387,-0.021368,-0.030348,-0.021368
1,-0.043294,-0.043294,-0.06149,-0.075636,0.845486,0.845486,-0.043294,-0.043294,-0.043294,-0.043294,...,1.23387,-0.043294,1.23387,2.5,0.845486,0.845486,0.559211,1.23387,0.845486,-0.043294
2,-0.030348,-0.030348,-0.043103,-0.05302,1.228448,1.228448,-0.030348,-0.030348,-0.030348,-0.030348,...,1.760196,1.760196,1.760196,0.845486,1.228448,1.228448,1.752461,1.760196,2.5,-0.030348


In [44]:
total_recommendations=similar_scores.sum().sort_values(ascending=False).head(number_of_total_recommended_posts)
total_recommendations

5ed3820f76027d35905ccac8    3.545542
5ed415c6bd514d602c15312d    3.315138
5ed2502b76027d35905cc7db    3.302192
5ed23a6376027d35905cc73b    2.972699
5ed23c8676027d35905cc780    2.972699
5ed2378276027d35905cc6b5    2.972699
5ed237d676027d35905cc6bd    2.972699
5ed238d376027d35905cc6e5    2.972699
5ed23b0776027d35905cc746    2.972699
5ed23b6276027d35905cc74f    2.972699
dtype: float64

In [45]:
total_index=total_recommendations.index
total_value=total_recommendations.values

In [46]:
final_recommedation_array=[]
count=0
for index in total_index:
    if index not in test_user:
        final_recommedation_array.append([index, total_value[count]])    
    count+=1
final_recommended_df=pd.DataFrame(final_recommedation_array, columns=['post_id', 'Recommendation_Score'])
final_recommended_df

Unnamed: 0,post_id,Recommendation_Score
0,5ed3820f76027d35905ccac8,3.545542
1,5ed23a6376027d35905cc73b,2.972699
2,5ed23c8676027d35905cc780,2.972699
3,5ed2378276027d35905cc6b5,2.972699
4,5ed237d676027d35905cc6bd,2.972699
5,5ed238d376027d35905cc6e5,2.972699
6,5ed23b0776027d35905cc746,2.972699
7,5ed23b6276027d35905cc74f,2.972699
