<a href="https://colab.research.google.com/github/yordanovagabriela/recommendersys/blob/master/HW2_ContentBasedFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [0]:
import pandas as pd
import math
import scipy
import numpy as np
import string
import re
import time

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

## Download nltk corpuses

In [0]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Install additional libraries

In [0]:
# A package which efficiently applies any function to a pandas dataframe or series in the fastest available manner
!pip install swifter

Collecting swifter
  Downloading https://files.pythonhosted.org/packages/ad/55/984b3b380b2b20159cb6fbef6157e95b83332cf8a61d1b96338e3aa34112/swifter-0.304-py3-none-any.whl
Collecting psutil>=5.6.6
[?25l  Downloading https://files.pythonhosted.org/packages/c4/b8/3512f0e93e0db23a71d82485ba256071ebef99b227351f0f5540f744af41/psutil-5.7.0.tar.gz (449kB)
[K     |████████████████████████████████| 450kB 19.0MB/s 
[?25hCollecting numba>=0.49.0
[?25l  Downloading https://files.pythonhosted.org/packages/12/0c/4b4834f6cce91f64ecf5bf68f53ea7c60e4fe0af43910c9448cb8e1bc2b3/numba-0.49.0-cp36-cp36m-manylinux2014_x86_64.whl (3.6MB)
[K     |████████████████████████████████| 3.6MB 49.3MB/s 
Collecting partd>=0.3.10; extra == "complete"
  Downloading https://files.pythonhosted.org/packages/44/e1/68dbe731c9c067655bff1eca5b7d40c20ca4b23fd5ec9f3d17e201a6f36b/partd-1.1.0-py3-none-any.whl
Collecting distributed>=2.0; extra == "complete"
[?25l  Downloading https://files.pythonhosted.org/packages/d5/fb/8c47e

# Load dataset

In [0]:
articles_df = pd.read_csv('./shared_articles.csv')
interactions_df = pd.read_csv('./users_interactions.csv')

# Data Preprocessing


## Preprocess *articles* dataset

### Exclude **CONTENT REMOVED** from *articles* dataset
It is not really clear how to handle this data, so for simplicity I will not work with it.

In [0]:
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']

### Merge *title* and *text* columns in *articles* dataset

In [0]:
articles_df['data'] = articles_df['title'] + " " + articles_df['text']
articles_df = articles_df.drop('text', 1)

### Exclude useless columns from *articles* dataset
Will exclude some of the columns which are not used anywhere for simplicity.

In [0]:
articles_df = articles_df[['contentId', 'lang', 'title', 'url', 'data']]

### Preprocess *data* column in *articles* dataset

In [0]:
def lemmatize(word):
    lemmatizer = WordNetLemmatizer()

    result = pos_tag(word)
    if (result is None) or (len(result) == 0):
        return word
    
    tag = result[0][1]

    if tag.startswith("NN"):
        return lemmatizer.lemmatize(word, pos='n')
    elif tag.startswith('VB'):
        return lemmatizer.lemmatize(word, pos='v')
    elif tag.startswith('JJ'):
        return lemmatizer.lemmatize(word, pos='a')
    else:
        return word

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Runs ~ 15min for all the data, because of the join operation
def preprocess_text_with_lemmatization(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = map(lambda token: lemmatize(token), tokens)
    text = ' '.join(tokens)
    return text

In [0]:
import swifter

start_millis = int(round(time.time() * 1000))

articles_df['data'] = articles_df['data'].swifter.apply(preprocess_text)

end_millis = int(round(time.time() * 1000))

print("Preprocessing of 'data' finished for {} millis".format(end_millis - start_millis))

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=3047, style=ProgressStyle(description_widt…


Preprocessing of 'data' finished for 1406 millis


## Preprocess *interactions* dataset

### Assign specific weight to each interaction based to its type
Each type gives us different information on 'how much the user enjoys' a particular article (for example a **LIKE** gives us much more information than just a simple **VIEW**).\
It is useful to assign a *weight* for each interaction based on these types.

In [0]:
interaction_weight = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

interactions_df['weight'] = interactions_df['eventType'].apply(lambda x: interaction_weight[x])

In [0]:
interactions_df.head(4)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,weight
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3.0


### Remove users with little interactions
We cannot give good recommendations for users with no or little interactions, so we will remove all which have less than **MIN_USER_INTERACTIONS** interactions.

In [0]:
MIN_USER_INTERACTIONS = 5
users_interactions_count_df = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
filtered_users_interactions_df = users_interactions_count_df[users_interactions_count_df >= MIN_USER_INTERACTIONS].reset_index()[['personId']]

In [0]:
interactions_from_selected_users_df = interactions_df.merge(filtered_users_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')

### Summarize user interactions
As each user can interact with an article multiple times we can summarize these interactions as a sum of all weights.

In [0]:
interactions_summary_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['weight'].sum().reset_index()

In [0]:
print("Min weight: {}".format(interactions_summary_df['weight'].min()))
print("Max weight: {}".format(interactions_summary_df['weight'].max()))

Min weight: 1.0
Max weight: 230.0


There is great deviation between the min and the max weights, so we can apply simple logarithmic function to smooth the weights.

In [0]:
def smooth(x):
    return math.log(1+x, 2)

In [0]:
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['weight'].sum() \
                    .apply(smooth).reset_index()

In [0]:
interactions_full_df.head(5)

Unnamed: 0,personId,contentId,weight
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925


# Split *interactions* dataset into train and test
Splits the *interactions* dataset so that each person has 70% of its interactions in the train dataset and 30% in the test dataset.

In [0]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['personId'], 
                                   test_size=0.30,
                                   random_state=42)

# Create TF-IDF Vectorizer
TF-IDF Vectorizer with 5000 words - unigrams and bigrams

### Analyze languages
There are articles written in:
1. **English**
2. **Portuguese**
3. **Spanish**
4. Latin
5. Japanese

*nltk* does not have stopwords corpuses for latin and japanese so I will only ignore the stopwords in english, portuguese and spanish.



In [0]:
print(articles_df['lang'].unique())

['en' 'pt' 'es' 'la' 'ja']


In [0]:
stopwords_list = stopwords.words('english') + stopwords.words('portuguese') + stopwords.words('spanish')

vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

tfidf_matrix = vectorizer.fit_transform(articles_df['data'])

In [0]:
content_ids = articles_df['contentId'].tolist()

# Build User and Item Profiles

In [0]:
def get_content_profile(content_id):
    index = content_ids.index(content_id)
    content_profile = tfidf_matrix[index:index+1]
    return content_profile

def get_content_profiles(content_ids):
    content_profiles_list = [get_content_profile(content_id) for content_id in content_ids]
    content_profiles = scipy.sparse.vstack(content_profiles_list)
    return content_profiles

In [0]:
def build_users_profile(person_id, interactions_df):
    # Filter interactions by personId
    interactions_person_df = interactions_df.loc[interactions_df['personId'] == person_id]
    # Get all contentId-s that the given personId had interacted with
    content_person_df = interactions_person_df['contentId']
    # Retreive the accumulated tfidf statistics for all articles that the given personId interacted with
    user_content_profiles = get_content_profiles(content_person_df)

    # Get the weights
    user_content_weights = np.array(interactions_person_df['weight']).reshape(-1,1)

    user_content_weights_average = np.sum(user_content_profiles.multiply(user_content_weights), axis=0) / np.sum(user_content_weights)
    user_profile_normalized = sklearn.preprocessing.normalize(user_content_weights_average)

    return user_profile_normalized

def build_users_profiles(): 
    interactions_shared_df = interactions_train_df[interactions_train_df['contentId'].isin(articles_df['contentId'])]
    user_profiles = {}

    for person_id in interactions_shared_df['personId'].unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_shared_df)
        
    return user_profiles

In [0]:
user_profiles = build_users_profiles()

# Build Content-Based Recommendation Model

In [0]:
class ContentBasedRecommendationModel:
    
    def __init__(self, items_df):
        self.content_ids = content_ids
        self.items_df = items_df
        
    def get_similar_items(self, person_id, topn=1000):
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        similar_items = sorted([(content_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])

        return similar_items
        
    def recommend_items(self, person_id, topn=10, exclude_previous_interactions=True):
        similar_items = self.get_similar_items(person_id)
        
        # Exclude the articles that the user already interacted with
        if exclude_previous_interactions:
          interactions_person_df = interactions_df.loc[interactions_df['personId'] == person_id]
          content_person_list = interactions_person_df['contentId'].tolist()
          similar_items = list(filter(lambda x: x[0] not in content_person_list, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items, columns=['contentId', 'recStrength']).head(topn)
        recommendations_df = recommendations_df.merge(self.items_df, how = 'left', left_on = 'contentId', right_on = 'contentId')
        
        return recommendations_df[['recStrength', 'contentId', 'lang', 'title', 'url']]

In [0]:
model = ContentBasedRecommendationModel(articles_df)

# Test Recommendation Model


In [0]:
def get_user_interactions(personId, df):
  interactions_person_df = df.loc[df['personId'] == PERSON_ID]
  interactions_person_df = interactions_person_df.merge(articles_df, how = 'left', left_on = 'contentId', right_on = 'contentId')
  return interactions_person_df.sort_values('weight', ascending = False)[['weight', 'contentId', 'lang', 'title', 'url']]

Recommend items based on the training dataset including the articles the user already interacted with.

In [0]:
# PERSON_ID = -9223121837663643404
PERSON_ID = -1479311724257856983
model.recommend_items(PERSON_ID, topn=30).head(20)

Unnamed: 0,recStrength,contentId,lang,title,url
0,0.606611,638282658987724754,en,Machine Learning for Designers,https://www.oreilly.com/learning/machine-learn...
1,0.554171,2220561310072186802,en,5 Skills You Need to Become a Machine Learning...,http://blog.udacity.com/2016/04/5-skills-you-n...
2,0.538079,-8068727428160395745,en,How real businesses are using machine learning,https://techcrunch.com/2016/03/19/how-real-bus...
3,0.534687,-9128652074338368262,en,Clarifying the uses of artificial intelligence...,http://techcrunch.com/2016/05/12/clarifying-th...
4,0.52219,-6940659689413147290,en,An Exclusive Look at How AI and Machine Learni...,https://backchannel.com/an-exclusive-look-at-h...
5,0.51497,54678605145828343,en,Is machine learning the next commodity?,http://readwrite.com/2016/04/18/machine-learni...
6,0.513579,3564394485543941353,en,Google Is About to Supercharge Its TensorFlow ...,http://www.wired.com/2016/04/google-supercharg...
7,0.506381,-7702672626132856079,en,Google supercharges machine learning tasks wit...,https://cloudplatform.googleblog.com/2016/05/G...
8,0.499098,365571143597993923,en,Power to the People: How One Unknown Group of ...,https://medium.com/@atduskgreg/power-to-the-pe...
9,0.481762,1146823593746606655,en,Apple Hiring for New Machine Learning Division...,http://www.macrumors.com/2016/08/31/apple-turi...


In [0]:
get_user_interactions(PERSON_ID, interactions_test_df).head(20)

Unnamed: 0,weight,contentId,lang,title,url
11,3.459432,-532999578436827210,en,IBM Seeks to Simplify Graph with New Titan Ser...,https://www.datanami.com/2016/07/27/ibm-seeks-...
5,3.459432,-5658245291907121574,en,Machine Learning and the VP Debate,https://medium.com/@srobtweets/machine-learnin...
9,3.459432,-8377626164558006982,en,Bad Writing Is Destroying Your Company's Produ...,https://hbr.org/2016/09/bad-writing-is-destroy...
15,3.0,-9033211547111606164,en,Google's Cloud Machine Learning service is now...,https://techcrunch.com/2016/09/29/googles-clou...
14,2.584963,1549650080907932816,en,Spark comparison: AWS vs. GCP,https://www.oreilly.com/ideas/spark-comparison...
18,2.584963,524776334673868069,en,Graph-powered Machine Learning at Google,https://research.googleblog.com/2016/10/graph-...
30,2.321928,-4127059794203205931,en,TPOT: A Python tool for automating data science,http://www.randalolson.com/2016/05/08/tpot-a-p...
6,2.321928,363798057559041921,en,To Be Continued: Helping you find shows you wa...,http://techblog.netflix.com/2016/10/to-be-cont...
8,2.321928,-1901742495252324928,en,Designing smart notifications,https://medium.com/@intercom/designing-smart-n...
27,2.0,-3040610224044779845,en,Things you probably didn't know you could do w...,https://medium.freecodecamp.com/10-tips-to-max...


In [0]:
get_user_interactions(PERSON_ID, interactions_train_df).head(20)

Unnamed: 0,weight,contentId,lang,title,url
58,4.285402,7342707578347442862,en,"At eBay, Machine Learning is Driving Innovativ...",https://www.ebayinc.com/stories/news/at-ebay-m...
39,4.129283,621816023396605502,en,AI Is Here to Help You Write Emails People Wil...,http://www.wired.com/2016/08/boomerang-using-a...
84,4.044394,-4460374799273064357,en,"Deep Learning for Chatbots, Part 1 - Introduction",http://www.wildml.com/2016/04/deep-learning-fo...
7,3.954196,-7959318068735027467,en,Auto-scaling scikit-learn with Spark,https://databricks.com/blog/2016/02/08/auto-sc...
29,3.906891,2589533162305407436,en,6 reasons why I like KeystoneML,http://radar.oreilly.com/2015/07/6-reasons-why...
90,3.70044,5258604889412591249,en,Machine Learning Is No Longer Just for Experts,https://hbr.org/2016/10/machine-learning-is-no...
56,3.70044,-398780385766545248,en,10 Stats About Artificial Intelligence That Wi...,http://www.fool.com/investing/2016/06/19/10-st...
11,3.643856,-6467708104873171151,en,5 reasons your employees aren't sharing their ...,http://justcuriousblog.com/2016/04/5-reasons-y...
47,3.523562,-4944551138301474550,en,Algorithms and architecture for job recommenda...,https://www.oreilly.com/ideas/algorithms-and-a...
66,3.459432,444378495316508239,en,How to choose algorithms for Microsoft Azure M...,https://azure.microsoft.com/en-us/documentatio...


## Conclusion
Recommendations given for the particular user seem really relevant. From the test set we could see that most of the user interactions are about machine learning/google/ai articles which reflects the recs. This could also be confirmed if we check the user's interactions from the train set. \
However, instead of only eye-balling the results it will be better to evaluate the model with some metric like Top-N Accuracy (I tried to implement it but unfortunately it did't work)

For better results it could be reasonable to try tuning some of the parameters like the number of words included in TF-IDF matrix, the min number of interactions per user etc. Also, more experiments could be performed regarding NLP as currently I am not pretty sure that lemmatization improves the results because of the multi languages.


# Hit Rate Evaluation

In [0]:
total_hitrate = 0
total_users = len(interactions_train_df.personId.unique())

for person_id in interactions_train_df.personId.unique():

  cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
  similar_indices = cosine_similarities.argsort().flatten()[-1000:]

  items_to_ignore = interactions_train_df[interactions_train_df["personId"]==person_id]["contentId"].values
  similar_indices_filtered = list(filter(lambda x: content_ids[x] not in items_to_ignore, similar_indices))

  items_to_evaluate = interactions_test_df[interactions_test_df["personId"]==person_id]["contentId"].values
  
  person_interacted_items_len = len(similar_indices_filtered)
  hits = 0

  for i in range(person_interacted_items_len):
    if(content_ids[similar_indices_filtered[i]] in items_to_evaluate):
      hits = hits + 1

  current_hitrate = hits / person_interacted_items_len
  total_hitrate += current_hitrate
  print("hit rate for user {} is {}".format(person_id, hits / person_interacted_items_len))

print("TOTAL HIT RATE => {}".format(total_hitrate / total_users))

hit rate for user 3937943558206985686 is 0.0
hit rate for user 1874422396201148365 is 0.00101010101010101
hit rate for user 5598537709124463353 is 0.007253886010362694
hit rate for user -4165818767652094649 is 0.003054989816700611
hit rate for user 2416280733544962613 is 0.07033997655334115
hit rate for user -2979537012405607453 is 0.013844515441959531
hit rate for user -8020832670974472349 is 0.034292035398230086
hit rate for user -8853658195208337106 is 0.01583949313621964
hit rate for user 3005175913610348223 is 0.002014098690835851
hit rate for user -2626634673110551643 is 0.11054637865311309
hit rate for user 5127372011815639401 is 0.018338727076591153
hit rate for user 1949009070102523745 is 0.015991471215351813
hit rate for user 5660542693104786364 is 0.03260869565217391
hit rate for user -5394062070584740055 is 0.00202020202020202
hit rate for user 1598729374254679339 is 0.0030272452068617556
hit rate for user 3429602690322213789 is 0.01671891327063741
hit rate for user -320389

## Conclusion
The total hit rate is ~ 0.06, which seems to be really low. There are a lot of users with 0 hit rate.