### Importing packages

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
import math
import random

In [8]:

import sklearn
from sklearn.model_selection import  train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
import scipy
from scipy.sparse.linalg import svds

In [10]:
from nltk.corpus import stopwords

### Reading the data

In [11]:
articles_df = pd.read_csv('datasets/shared_articles.csv')

In [12]:
# using only "CONTENT SHARED" articles
articles_df = articles_df[articles_df.eventType=='CONTENT SHARED']

In [13]:
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en


In [14]:
interactions_df = pd.read_csv('datasets/users_interactions.csv')
interactions_df.head(7)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
5,1465413742,VIEW,310515487419366995,-8763398617720485024,1395789369402380392,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,MG,BR
6,1465415950,VIEW,-8864073373672512525,3609194402293569455,1143207167886864524,,,


### Preprocessing

In [15]:
interactions_df.eventType.unique()

array(['VIEW', 'FOLLOW', 'BOOKMARK', 'LIKE', 'COMMENT CREATED'],
      dtype=object)

In [16]:
# assigning weight to 'event_type'
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0
}


interactions_df['eventStrength'] = interactions_df['eventType']\
    .apply(
lambda x: event_type_strength[x])

In [17]:
interactions_df['eventStrength'].head(7)

0    1.0
1    1.0
2    1.0
3    3.0
4    1.0
5    1.0
6    1.0
Name: eventStrength, dtype: float64

In [18]:
users_interactions_count_df = interactions_df\
    .groupby(['personId', 'contentId'])\
    .size()\
    .groupby('personId')\
    .size()
print('# users: %d' %len(users_interactions_count_df))

users_with_enough_interactions_df = users_interactions_count_df\
    [users_interactions_count_df >= 5].reset_index()[['personId']]
print('# users with at least 5 interations: %d'
      % len(users_with_enough_interactions_df))

# users: 1895
# users with at least 5 interations: 1140


In [19]:
print('# of interactions: %d' % len(interactions_df))

interactions_from_selected_users_df = interactions_df\
    .merge(users_with_enough_interactions_df,
          how = 'right',
          left_on = 'personId',
          right_on = 'personId')
print('# of interactions from users with at least 5 interactions: %d' 
      % len(interactions_from_selected_users_df))

# of interactions: 72312
# of interactions from users with at least 5 interactions: 69868


In [20]:
# aggregating all interactions the user has performed in an intem
# by a weigted sum of interaction type strength 
# and smoothening the distribution using log transformation

def smooth_user_preference(x):
    return math.log(1+x, 2)

interactions_full_df = interactions_from_selected_users_df\
    .groupby(['personId', 'contentId'])['eventStrength']\
    .sum()\
    .apply(smooth_user_preference)\
    .reset_index()

In [21]:
print('# of unique/item interactions: %d'
     % len(interactions_full_df))
interactions_full_df.head(7)

# of unique/item interactions: 39106


Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0


### Evaluation

In [22]:
interactions_train_df, interactions_test_df = train_test_split(
    interactions_full_df,
    stratify=interactions_full_df['personId'],
    test_size = 0.2,
    random_state=42)

In [23]:
print('# interactions - Train set: %d' %len(interactions_train_df))
print('# interactions - Test  set: %d' %len(interactions_test_df))

# interactions - Train set: 31284
# interactions - Test  set: 7822


In [24]:
# indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df\
    .set_index('personId')
interactions_train_indexed_df = interactions_train_df\
    .set_index('personId')
interactions_test_indexed_df = interactions_test_df\
    .set_index('personId')

In [25]:
def get_items_interacted(person_id, interactions_df):
    '''
    Get the user's data and merge it with the movie information.
    '''
    interacted_items = interactions_df.loc[person_id]['contentId']
    
    return set(interacted_items 
               if type(interacted_items)==pd.Series
               else [interacted_items])

In [26]:
class ModelEvaluator:
    
    # Top-N accuracy metrics consts
    EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100
    
    def get_not_interacted_items_sample(self, person_id,
                                        sample_size, seed=42):
        '''
        Returns a sample set of all non-interacted items
        from a person's id.
        '''
        interacted_items = get_items_interacted(person_id,
                                    interactions_full_indexed_df)
        
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items
        
        random.seed(seed)
        
        non_interacted_items_sample = random.sample(
            non_interacted_items, sample_size)
        
        return set(non_interacted_items_sample)
    
    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        '''
        Checks if a particular 'item_id' is in the 'recommended_items'
        Returns the index and a hit if the item is in the 'topn' indexes 
        '''
        try:
            index = next(i for i, c in enumerate(recommended_items)
                         if c == item_id)
        except:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index
    
    def evaluate_model_for_user(self, model, person_id):
        
        '''
        Evaluate the Recommender model for a specific user(person_id)
        '''
        
        # getting the items in the test set
        interacted_values_testset = interactions_test_indexed_df\
        .loc[person_id]

        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = \
                set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = \
                set([int(interacted_values_testset['contentId'])])
        
        interacted_items_count_testset = \
            len(person_interacted_items_testset)
        
        
        # getting a ranked recommnedation list from a model
        # for a given user
        person_recs_df = model.recommend_items\
                        (
                            person_id,
                            items_to_ignore = \
                                get_items_interacted
                                (
                                    person_id,
                                    interactions_train_indexed_df
                                ),
                            topn=10000000000
                        )
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        
        # for each item the user has interacted in the test set
        for item_id in person_interacted_items_testset:
            
            # getting a random sample(100) items
            # the user has not interacted
            non_interacted_items_sample = self\
                .get_not_interacted_items_sample\
                    (
                        person_id,
                        sample_size=\
                            self.EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                        seed=item_id%(2**32)
                    )
            
            # combining the current interacted item with
            # the 100 random items 
            items_to_filter_recs = non_interacted_items_sample\
                .union(set([item_id]))
            
            # Filtering only recommendations 
            # that are either the interacted item or
            # from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId']\
                .isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['contentId'].values
            
            # Verifying if the current interacted item is among
            # the Top-N recommneded items
            hit_at_5, index_at_5 = self\
                ._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            
            hit_at_10, index_at_10 = self\
                ._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10
            
        # RECALL is the rate of the interacted items
        # that are ranked among the Top-N recommended items
        # when mixed with a set of non-relevant items
        recall_at_5 = (hits_at_5_count /
                       float(interacted_items_count_testset))
        recall_at_10 = (hits_at_10_count /
                       float(interacted_items_count_testset))
        
        person_metrics = {
            'hits@5_count': hits_at_5_count,
            'hits@10_count': hits_at_10_count,
            'interacted_count': interacted_items_count_testset,
            'recall@5': recall_at_5,
            'recall@10': recall_at_10
        }
        
        return person_metrics
    
    def evaluate_model(self, model):
        '''
        Evaluates the whole model.
        '''
        people_metrics = []
        for idx, person_id in enumerate(list(
            interactions_full_indexed_df.index.unique().values)):
            
            person_metrics = self\
                .evaluate_model_for_user(model, person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('# users processed: %d' %idx)
        
        detailed_results_df = pd.DataFrame(people_metrics)\
                .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = \
            (detailed_results_df['hits@5_count'].sum() /
            float(detailed_results_df['interacted_count'].sum()))
        
        global_recall_at_10 = \
            (detailed_results_df['hits@10_count'].sum() /
            float(detailed_results_df['interacted_count'].sum()))
        
        global_metrics = {
            'modelName': model.get_model_name(),
            'recall@5': global_recall_at_5,
            'recall@10': global_recall_at_10
                         }
        return global_metrics, detailed_results_df

In [27]:
model_evaluator = ModelEvaluator()

In [28]:
# compute the most popular items
item_popularity_df = interactions_full_df\
    .groupby('contentId')['eventStrength']\
    .sum()\
    .sort_values(ascending=False)\
    .reset_index()
item_popularity_df.head(10)

Unnamed: 0,contentId,eventStrength
0,-4029704725707465084,307.733799
1,-6783772548752091658,233.762157
2,-133139342397538859,228.024567
3,-8208801367848627943,197.107608
4,-6843047699859121724,193.825208
5,8224860111193157980,189.04468
6,-2358756719610361882,183.110951
7,2581138407738454418,180.282876
8,7507067965574797372,179.094002
9,1469580151036142903,170.548969


In [29]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    def recommend_items(self, user_id, items_to_ignore=[],
                        topn=10, verbose=False):
        # recommend the popular items the user hasn't seen yet
        recommendation_df = self\
            .popularity_df[~self.popularity_df['contentId']
            .isin(items_to_ignore)]\
            .sort_values('eventStrength', ascending=False)\
            .head(topn)
        
        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required \
                    in verbose mode')
            recommendation_df = recommendation_df\
                .merge(self.items_df, how='left',
                       left_on='contentId', right_on='contentId')\
            [['eventStrength', 'contentId', 'title', 'url', 'lang']]
            
        return recommendation_df

In [30]:
popularity_model = PopularityRecommender(item_popularity_df,
                                         articles_df)

In [31]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator\
            .evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...
# users processed: 1139

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.2417540271030427, 'recall@10': 0.37292252620813093}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
776,3609194402293569455,50,28,192,0.260417,0.145833
416,-2626634673110551643,25,12,134,0.186567,0.089552
496,-1032019229384696495,23,13,130,0.176923,0.1
476,-1443636648652872475,9,5,117,0.076923,0.042735
399,-2979881261169775358,40,25,88,0.454545,0.284091
362,-3596626804281480007,18,12,80,0.225,0.15
622,1116121227607581999,33,20,73,0.452055,0.273973
590,692689608292948411,23,17,69,0.333333,0.246377
13,-9016528795238256703,18,14,69,0.26087,0.202899
780,3636910968448833585,28,21,68,0.411765,0.308824


### Collaborative Filtering

In [32]:
# ignoring stopwords from English and Portugese
stopwords_list = stopwords.words('english') \
               + stopwords.words('portuguese')

In [59]:
# training a model whose vector size is 5000
# composed by the main unigrams & bigrams found in the corpus
# ignoring stopwords
vectorizer = TfidfVectorizer(analyzer='word',
                            ngram_range=(1,2),
                            min_df=0.003,
                            max_df=0.5,
                            max_features=5000,
                            stop_words=stopwords_list)
items_ids = articles_df['contentId'].tolist()
tfidf_matrix = vectorizer.fit_transform(articles_df['title'] + "" +
               articles_df['text'])
tfidf_features_names = vectorizer.get_feature_names()
tfidf_matrix

<3047x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 638928 stored elements in Compressed Sparse Row format>

In [None]:
class build_user_profile:
    
    def __init__(self, items_ids, tfidf_matrix,
                interactions_indexed_df):
        
        self.items_ids=items_ids
        self.tfidf_matrix = tfidf_matrix
        
    def get_item_profile(self, item_id):
        idx = items_ids.index(item_id)
        item_profile = tfidf_matrix[idx:idx+1]
        return item_profile

    def get_item_profiles(self, ids):
        item_profiles_list = [get_item_profile(x) for x in ids]
        item_profiles = scipy.sparse.vstack(item_profiles_list)
        return item_profiles
    
    def build_users_profile(self, person_id,
                            interactions_indexed_df):
        
        interactions_person_df = interactions_indexed_df\
                                    .loc[person_id]
        user_item_profiles = get_item_profile(
            interactions_person_df['contentId'])
        user_item_strengths = np.array(
            interactions_person_df['eventStrength'])\
                .reshape(-1,1)
        
        # weighted average of item profiles
        # by the interactions strength