### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Loading Data

In [2]:
# User profiles, read with pandas' Python parsing engine.
# NOTE(review): the reason user.csv needs engine="python" is not recorded here — confirm it is still required.
user_info = pd.read_csv("user.csv", engine = "python")
# User/post interactions (columns user_id, post_id, timestamp in the preview below)
user_and_post_info = pd.read_csv("view.csv")
# Post metadata (id, title, category, post type)
post_info = pd.read_csv("post.csv")

### Examining Data

In [3]:
# Preview the first rows of the user table
user_info.head()

Unnamed: 0,_id,name,gender,academics
0,5d60098a653a331687083238,Nivesh Singh Chauhan,male,undergraduate
1,5d610ae1653a331687083239,Gaurav Sharma,male,graduate
2,5d618359fc5fcf3bdd9a0910,Akshay Mishra,male,undergraduate
3,5d6d2bb87fa40e1417a49315,Saksham Mathur,male,undergraduate
4,5d7c994d5720533e15c3b1e9,Varun Chowhan,male,undergraduate


In [4]:
# Preview the first rows of the user/post interaction table
user_and_post_info.head()

Unnamed: 0,user_id,post_id,timestamp
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z
1,5ed3748576027d35905ccaab,5ed4cbadbd514d602c1531a6,2020-06-01T09:39:20.021Z
2,5ed0defa76027d35905cc2de,5eac305f10426255a7aa9dd3,2020-06-01T08:12:42.682Z
3,5ed0defa76027d35905cc2de,5ed1ff0276027d35905cc60d,2020-06-01T08:10:23.880Z
4,5ed0defa76027d35905cc2de,5ed3820f76027d35905ccac8,2020-06-01T08:08:54.124Z


In [5]:
# Preview the first rows of the post table (note the trailing auto-named "Unnamed: N" columns)
post_info.head()

Unnamed: 0,_id,title,category,post_type,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,5d62abaa65218653a132c956,hello there,Plant Biotechnology,blog,,,,,,
1,5d6d39567fa40e1417a4931c,Ml and AI,Artificial Intelligence|Machine Learning|Infor...,blog,,,,,,
2,5d7d23315720533e15c3b1ee,What is an Operating System ?,Operating Systems,blog,,,,,,
3,5d7d405e5720533e15c3b1f3,Lord Shiva,Drawings,artwork,,,,,,
4,5d80dfbc6c53455f896e600e,How Competition law evolved?,Competition Laws,blog,,,,,,


In [6]:
# Removing the auto-named "Unnamed: 4" ... "Unnamed: 9" columns, which the preview shows as NaN.
# The frame is rebound instead of mutated in place, and errors="ignore" skips labels
# that are already gone, so re-running this cell no longer raises KeyError.
unnamed_cols = ['Unnamed: ' + str(x) for x in range(4, 10)]
post_info = post_info.drop(columns=unnamed_cols, errors="ignore")

In [7]:
# Summary statistics (count / unique / top / freq) for every user column
user_info.describe()

Unnamed: 0,_id,name,gender,academics
count,118,118,118,118
unique,118,118,3,3
top,5d60098a653a331687083238,Shreya maji,male,undergraduate
freq,1,1,72,68


In [8]:
# Summary statistics (count / unique / top / freq) for the interaction table
user_and_post_info.describe()

Unnamed: 0,user_id,post_id,timestamp
count,1449,1449,1449
unique,118,495,1449
top,5d60098a653a331687083238,5ec1fd0974f7660d73aa0fd5,2020-05-21T19:58:28.607Z
freq,230,18,1


In [9]:
# Users whose gender is recorded as anything other than 'male' or 'female'
has_known_gender = user_info['gender'].isin(['male', 'female'])
df1 = user_info.loc[~has_known_gender]
df1

Unnamed: 0,_id,name,gender,academics
67,5ea43f7310426255a7aa9b1d,Apoorva Rana,undefined,undefined
74,5ea890e410426255a7aa9c16,Gretchen,undefined,undefined


In [10]:
# Users whose academic level is recorded as anything other than 'graduate' or 'undergraduate'
has_known_academics = user_info['academics'].isin(['graduate', 'undergraduate'])
df2 = user_info.loc[~has_known_academics]
df2

Unnamed: 0,_id,name,gender,academics
67,5ea43f7310426255a7aa9b1d,Apoorva Rana,undefined,undefined
74,5ea890e410426255a7aa9c16,Gretchen,undefined,undefined


# Content-Based Filtering

### Combining Features (Bag of Words)

In [11]:
# Adding an extra 'index' column holding each post's row position (0 .. len-1).
# The lookup helpers below use it to address rows of the similarity matrix.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when the
# column already exists.
if 'index' not in post_info.columns:
    post_info.insert(0, 'index', range(0, len(post_info)))

In [12]:
# Converting post ids to the string data type so they can be compared with the
# string ids passed to the lookup helpers
post_info['_id'] = post_info['_id'].astype(str)

In [13]:
# Taking care of NaN values

# Every column except the helper 'index' column gets NaN replaced by an empty string,
# so the text features can be concatenated without errors.
# The helper column is excluded by name: it sits at position 0 (it was inserted there),
# so a positional pop(0) removes 'index', not '_id'.
post_info_cols = [col for col in post_info.columns if col != 'index']

post_info[post_info_cols] = post_info[post_info_cols].fillna("")

In [14]:
# function to make a single string by combining data in a row of post_info

def combine_features(row):
    """Join a post's title, category and post type into one space-separated string.

    Parameters
    ----------
    row : mapping-like (for example one row of ``post_info``)
        Must provide string values under the keys ``'title'``, ``'category'``
        and ``' post_type'``.

    Returns
    -------
    str or None
        The combined text, or ``None`` (after printing the offending row) when a
        key is missing or a value cannot be concatenated as a string.
    """
    try:
        # NOTE(review): the ' post_type' key carries a leading space. The recorded
        # output shows the concatenation succeeding, so the CSV header presumably
        # contains that space — confirm before "fixing" the key.
        return row['title'] + " " + row['category'] + " " + row[' post_type']
    except (KeyError, TypeError):
        # Best-effort: report the bad row and carry on. Only lookup and
        # concatenation failures are handled, so KeyboardInterrupt and other
        # unrelated errors are no longer swallowed.
        print("Error:", row)
        return None

In [15]:
# New column holding each post's title, category and post type joined into one string
# (combine_features is applied row by row, hence axis = 1)

post_info["combined_features"] = post_info.apply(combine_features, axis = 1)

# Display the new column as a sanity check
post_info["combined_features"]

0                   hello there Plant Biotechnology blog
1      Ml and AI Artificial Intelligence|Machine Lear...
2      What is an Operating System ? Operating System...
3                            Lord Shiva Drawings artwork
4      How Competition law evolved? Competition Laws ...
                             ...                        
488    CI/CD using GitHub Actions Computer Technology...
489    Configure Docker with Django; PostgreSQL; Pg-a...
490                       Recommendation Engine  project
491    Face Recognition using Transfer Learning. Scie...
492    Configure CI/CD Pipeline in GitLab and deploym...
Name: combined_features, Length: 493, dtype: object

### Filtering Similar Posts

In [16]:
# Turn every post's combined text into a vector of token counts (bag of words)
cv = CountVectorizer()

count_matrix = cv.fit_transform(post_info["combined_features"])

# Pairwise cosine similarity between all posts: cosine_sim[i][j] compares the posts
# at row positions i and j of post_info
cosine_sim = cosine_similarity(count_matrix)

In [17]:
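# cv.get_feature_names_out()  # uncomment to inspect the learned vocabulary (get_feature_names() was removed in scikit-learn 1.2)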

In [18]:
# Defining Utility Functions

def get_index_from_title(title):
    """Return the 'index' value of the first post whose title equals ``title``.

    Raises IndexError when no post carries that title.
    """
    title_matches = post_info['title'] == title
    return post_info.loc[title_matches, 'index'].values[0]

def get_title_from_index(index):
    """Return the title of the first post whose 'index' value equals ``index``.

    Raises IndexError when no post has that index.
    """
    index_matches = post_info['index'] == index
    return post_info.loc[index_matches, 'title'].values[0]

def get_post_id_from_user_id(userid):
    """Return the post id from the first interaction row recorded for ``userid``.

    Raises IndexError when the user has no rows in ``user_and_post_info``.
    """
    user_rows = user_and_post_info['user_id'] == userid
    return user_and_post_info.loc[user_rows, 'post_id'].values[0]

def get_title_from_post_id(postid):
    """Return the title of the first post whose '_id' equals ``postid``.

    Raises IndexError when no post has that id.
    """
    id_matches = post_info['_id'] == postid
    return post_info.loc[id_matches, 'title'].values[0]

In [19]:
# USER

# Hard-coded example user for whom recommendations are computed below
Id_of_user = '5e5dfbbefbc8805f69e02c91'
# First post listed for this user in the interaction table; it is treated as a post the user likes
Id_of_post_user_likes = get_post_id_from_user_id(Id_of_user)
# Title of that post, used as the seed for the similarity search
post_user_likes = get_title_from_post_id(Id_of_post_user_likes)

In [20]:
# Show the title of the seed post
print(post_user_likes)

Dog❤️


In [21]:
# Resolve the similarity-matrix row from the post id instead of the title: several
# posts can share a title, in which case a title lookup may return another post's row.
# NOTE(review): assumes '_id' values in post_info are unique — confirm.
post_index = post_info.loc[post_info['_id'] == Id_of_post_user_likes, 'index'].values[0]

In [22]:
# Pair every post's row position with its similarity score to the seed post
similar_posts = list(enumerate(cosine_sim[post_index]))

In [23]:
# Rank the (row position, score) pairs from most to least similar
sorted_similar_posts = sorted(
    similar_posts,
    key=lambda pair: pair[1],
    reverse=True,
)

In [24]:
# Top 20 similar posts

TOP_N = 20

# The seed post is printed first for reference. It is then excluded by row index
# rather than by assuming it sorts first: other posts with identical features tie
# with it at the top similarity score.
print(get_title_from_index(post_index))

other_posts = [idx for idx, _score in sorted_similar_posts if idx != post_index]
for idx in other_posts[:TOP_N]:
    print(get_title_from_index(idx))

Dog❤️
Spirituality
Travelling
screw2
Aesthetic
Quarantined
Bloom🌸
freeze
Bliss :)
Sunset
Peaceful
TOWER
PEACE
EYES
CANON
THOUGHTFUL
engrossed
Hermit
innocence
peace
trekking
