# Post Content Based Recommender

In [1]:
# import packages
%matplotlib inline
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


In [2]:
contact_settings = pd.read_csv('data/contact_settings.csv')
ext_feeds = pd.read_csv('data/extfeeds.csv')
following = pd.read_csv('data/following.csv')
maillists = pd.read_csv('data/maillists.csv')
migrations = pd.read_csv('data/migrations.csv')
notifications = pd.read_csv('data/migrations.csv')
posts = pd.read_csv('data/posts.csv')
thoughts = pd.read_csv('data/thoughts.csv')
user_settings = pd.read_csv('data/user_settings.csv')
users = pd.read_csv('data/users.csv')

In [3]:
print(contact_settings.shape)
print(ext_feeds.shape)
print(following.shape)
print(maillists.shape)
print(migrations.shape)
print(notifications.shape)
print(posts.shape)
print(thoughts.shape)
print(user_settings.shape)
print(users.shape)

(249, 6)
(45, 12)
(5296, 3)
(41, 4)
(26, 3)
(26, 3)
(876, 12)
(16, 5)
(2, 10)
(2293, 12)


In [4]:
print('=== Contact Settings ===')
print(contact_settings.head())
print('=== Ext Feeds ===')
print(ext_feeds.head())
print('=== Following ===')
print(following.head())
print('=== MailLists ===')
print(maillists.head())
print('=== Migrations ===')
print(migrations.head())
print('=== Notifications ===')
print(notifications.head())
print('=== Posts ===')
print(posts.head())
print('=== Thoughts ===')
print(thoughts.head())
print('=== User Settings ===')
print(user_settings.head())
print('=== Users ===')
print(users.head())

=== Contact Settings ===
   id  user_id                        email                   display_message  \
0   1       78  emmanuelezenwigbo@gmail.com  Web Developer | Digital Marketer   
1   2       90           dward009@gmail.com                           Call me   
2   3       53            apeji93@gmail.com                             Yossi   
3   4      133          ireabueke@gmail.com                               NaN   
4   5       92          bindas.fs@gmail.com          Contact me on that email   

   created_at  updated_at  
0         NaN         NaN  
1         NaN         NaN  
2         NaN         NaN  
3         NaN         NaN  
4         NaN         NaN  
=== Ext Feeds ===
     id  user_id           site  \
0  2839        2  Elijah Okokon   
1  2840        2  Elijah Okokon   
2  2841        2  Elijah Okokon   
3  2842        2  Elijah Okokon   
4  2843        2  Elijah Okokon   

                                          site_image                 title  \
0  /storage/2

# Similar Post Content Based Recommender

In [5]:
# clean up content
posts['content'] = posts['content'].str.replace(r'<[^>]*>', '')
posts['content'] = posts['content'].str.replace(r'\s', ' ')
posts['content'] = posts['content'].str.replace(r'\\', ' ')
posts['content'] = posts['content'].str.replace(r'\~', ' ')
posts['content'] = posts['content'].str.replace(r'\[.*?\]', '')
posts['content'] = posts['content'].str.replace(r'\(.*?\)', '')
posts['content'].head()

0    I learnt how to use the table tag as i have us...
1     I am on this journey with start.ng, and here ...
2    I have not been attending classes on the HNG c...
3    My journey on **StartNG** pre-internship progr...
4     A Summary on The “idongesit.html” CV, Its Str...
Name: content, dtype: object

In [6]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
posts['content'] = posts['content'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(posts['content'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(876, 4597)

In [7]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
#Construct a reverse map of indices and posts titles
indices = pd.Series(posts.index, index=posts['title']).drop_duplicates()

In [9]:
# Function that takes in post title as input and outputs most similar posts
def get_article_recommendations_for_user(title, cosine_sim=cosine_sim):
    # Get the index of the post that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all posts with that post
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the posts based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar posts
    sim_scores = sim_scores[1:11]

    # Get the post indices
    post_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar posts
    return posts['title'].iloc[post_indices]

In [10]:
get_article_recommendations_for_user('What i have learnt so far on HTML')

258                      WHAT I'VE LEARNT SO FAR
739                                       TASK 2
740                                       TASK 2
260                              HTML Experience
5                             My Journey on HTML
13                            What I done so far
284                           My First HTML Page
47     Task two of my Start.NG internship so far
276                                CV Using html
100                 What you have learned so far
Name: title, dtype: object

# FOLLOWER RECOMMENDER

In [11]:
users.columns

Index(['id', 'name', 'username', 'email', 'image', 'provider', 'provider_id',
       'password', 'remember_token', 'created_at', 'updated_at', 'short_bio'],
      dtype='object')

In [12]:
users.drop(labels=['image', 'provider', 'provider', 'provider_id', 'password', 'remember_token', 'created_at', 'updated_at'], axis=1, inplace=True )
users.head(10)

Unnamed: 0,id,name,username,email,short_bio
0,1,Eniayomi Oluwaseyi,eniayomi,oluwaseyieniayomi@gmail.com,Software Developer | DevOPs Engineer
1,2,Elijah Okokon,DMatrix,okoelijah@gmail.com,Web Developer
2,3,Jeffrey Ogah,jeff.ogah,jeff.ogah@gmail.com,Front End Developer | React Developer | Mentor...
3,4,Oluwaseyi Oluwapelumi,nathan,nathanoluwaseyi@gmail.com,| Software Developer | DevOps Engineer | @linu...
4,5,PoRH,lamar,paulchibiukeigweze@gmail.com,I Am lamar and you don't think am real?
5,6,Seyi Onifade,xyluz,hngtechapps@gmail.com,My Open Diary
6,7,Mark Essien,markessien,markessien@gmail.com,An engineer
7,8,Alex Moses,ahlesswywk,ahlesswywk@gmail.com,"Software Developer hotelsng, Ex. Frontend deve..."
8,9,Edmund Ekott,26th_edmund,edmund.timfon@gmail.com,Front-end Engineer 💻 | FIFA advocate 🎮 | Ninja 🥋
9,10,Stefan aGz,stefanpongrz,stefanpongrz@gmail.com,Here


In [13]:
users.shape

(2293, 5)

In [14]:
users['short_bio'].head(5)

0                 Software Developer | DevOPs Engineer
1                                        Web Developer
2    Front End Developer | React Developer | Mentor...
3    | Software Developer | DevOps Engineer | @linu...
4              I Am lamar and you don't think am real?
Name: short_bio, dtype: object

In [15]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
#Replace NaN with an empty string
users['short_bio'] = users['short_bio'].fillna('')
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(users['short_bio'])
#Output the shape of tfidf_matrix
# this produces the number of different words used by user in the short_bio
tfidf_matrix.shape

(2293, 2242)

In [16]:
# Compute the cosine similarity matrix
consine_sim = linear_kernel(tfidf_matrix, tfidf_matrix, True)
#Construct a reverse map of indices and user name
indices = pd.Series(users.index, index=users['id'])

In [17]:
def create_new_db(x):
    """
    this function takes the result from get_followers and transforms to a DB with other info
    paramx: recommendation Series
    """
    y = x.copy()
    z = x.copy()
    y.update(users['name'])
    z.update(users['short_bio'])
    xdf=pd.DataFrame(x)
    ydf=pd.DataFrame(y)
    zdf=pd.DataFrame(z)
    ydf.rename(columns={'id':'Name'}, inplace=True)
    xdf.rename(columns={'id':'User_Id'}, inplace=True)
    zdf.rename(columns={'id':'short_bio'}, inplace=True)
    frames = [xdf,ydf,zdf]
    new_df = pd.concat(frames, axis=1)
    new_df.set_index('User_Id', inplace=True)
    return new_df

In [18]:
# Function that takes in user name as input and outputs most similar users
def get_followers(id, consine_sim=consine_sim):
    # Get the index of the users that matches the user
    idx = indices[id]
    # Get the pairwsie similarity scores of all users with that user
    sim_scores = list(enumerate(consine_sim[idx]))
    # Sort the users based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar users
    sim_scores = sim_scores[1:11]
     # Get the user indices
    follow_indices = [i[0] for i in sim_scores]
      # Return the top 10 most similar user
    follower_id = users['id'].iloc[follow_indices]
    # Create a new data frame containing the user id and name 
    follower_rec = create_new_db(follower_id)
    return follower_rec

In [19]:
# Use this function to get details of the user with their user id
def check_user(id):
    name = users.loc[users['id'] == id, 'name']
    bio = users.loc[users['id'] == id, 'short_bio']
    return name.iloc[0], bio.iloc[0]

In [20]:
check_user(23)

('Oluwatomisin Lalude', 'Web Developer')

In [21]:
# Run Get_followers with user id as input
get_followers(23)

Unnamed: 0_level_0,Name,short_bio
User_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
23,Oluwatomisin Lalude,Web Developer
66,Kev Chike,Web Developer
78,Emmanuel Ezenwigbo,Web developer
338,Stephen Oluwafemi,Web Developer
362,Hafizah Muhyideen,I am a web developer
725,Tochukwu Nwosu,I am a Web Developer
857,benjamin chibuzor,Web Developer
1012,OMIDIORA EMMANUEL,Web Developer
1167,Ogunjuyigbe Oluwaseyi,Web developer
1254,Adrian Roca,Web Developer


In [22]:
users.loc[users['id'] == 23, 'short_bio'].iloc[0]

'Web Developer'