##### References
https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-clickstream-data-dffc86c8c65

https://www.kaggle.com/rsadiq/recommender-system-for-implicit-feedback

https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe


In [37]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [2]:
cars = pd.read_csv('CarIds.csv')
users = pd.read_csv('users.csv')

In [3]:
cars.drop('Unnamed: 0', axis=1, inplace=True)
users['car_id'] = users['carId']
users.drop('carId',axis=1, inplace=True)

In [4]:
users

Unnamed: 0,user_id,car_id
0,0,YPSILON_2000
1,0,LAGUNA_2001
2,0,206_2000
3,1,C5_2002
4,1,TWINGO_2002
...,...,...
18970,4276,3_2017
18971,4276,INSIGNIA_2018
18972,4276,INSIGNIA_2017
18973,4277,INSIGNIA_2017


See how many users we have in our dataset

In [5]:
users['user_id'].nunique()

4278

See how many cars we have in our dataset

In [6]:
cars

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007
1,ALFA ROMEO,2850,2006,Dizel,Hecbek,222000,1910,147,150,147_2006
2,ALFA ROMEO,1850,2004,Dizel,Limuzina,178000,1910,147,116,147_2004
3,ALFA ROMEO,1700,2004,Dizel,Hecbek,156906,1910,147,116,147_2004
4,ALFA ROMEO,1700,2002,Dizel,Hecbek,272000,1900,147,116,147_2002
...,...,...,...,...,...,...,...,...,...,...
35234,VOLKSWAGEN,4200,2006,Dizel,Monovolumen (MiniVan),251000,1900,TOURAN,105,TOURAN_2006
35235,VOLKSWAGEN,3550,2005,Dizel,Monovolumen (MiniVan),259000,1896,TOURAN,105,TOURAN_2005
35236,VOLKSWAGEN,7700,2011,Dizel,Monovolumen (MiniVan),214000,1598,TOURAN,105,TOURAN_2011
35237,VOLKSWAGEN,4490,2007,Dizel,Monovolumen (MiniVan),210000,1890,TOURAN,105,TOURAN_2007


Out of 35239, we have a lot of models with the same id because it is set only for model and year of a car. 

Now we can change that and reduce the number of cars by removing duplicate cars based on year and volume,  so we are left only with the cars that has same year of production and different engine volume.

In [7]:
cars.drop_duplicates(subset = ['Snaga', 'car_id'], inplace=True)

In [8]:
cars

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007
1,ALFA ROMEO,2850,2006,Dizel,Hecbek,222000,1910,147,150,147_2006
2,ALFA ROMEO,1850,2004,Dizel,Limuzina,178000,1910,147,116,147_2004
4,ALFA ROMEO,1700,2002,Dizel,Hecbek,272000,1900,147,116,147_2002
5,ALFA ROMEO,2000,2005,Dizel,Kupe,189500,1910,147,150,147_2005
...,...,...,...,...,...,...,...,...,...,...
35178,VOLKSWAGEN,5000,2007,Benzin,Monovolumen (MiniVan),156000,1400,TOURAN,150,TOURAN_2007
35205,VOLKSWAGEN,3600,2006,Dizel,Limuzina,226800,1980,TOURAN,150,TOURAN_2006
35206,VOLKSWAGEN,11300,2014,Dizel,Monovolumen (MiniVan),118700,1968,TOURAN,140,TOURAN_2014
35212,VOLKSWAGEN,3699,2006,Dizel,Monovolumen (MiniVan),209659,1900,TOURAN,120,TOURAN_2006


In [9]:
cars['car_id'].value_counts()

MEGANE_2004          26
GOLF 5_2004          25
MEGANE_2005          25
MEGANE_2002          23
GOLF 5_2005          23
                     ..
B 180_2015            1
GRANDE PUNTO_2004     1
ASTRA F_2000          1
ML KLASA_2012         1
118_2016              1
Name: car_id, Length: 1266, dtype: int64

In [10]:
df = pd.merge(cars, users[['car_id','user_id']], how = 'inner', on = 'car_id')

In [11]:
df

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id,user_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,283
1,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,325
2,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,326
3,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,411
4,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,147_2007,283
...,...,...,...,...,...,...,...,...,...,...,...
147725,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2513
147726,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2514
147727,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2577
147728,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2579


In [12]:
df['event_strength'] = 1

In [13]:
df['car_id'] = df['car_id'].apply(lambda x: int(str(hash(x))[0:5]))

In [14]:
df

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id,user_id,event_strength
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-3731,283,1
1,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-3731,325,1
2,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-3731,326,1
3,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-3731,411,1
4,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,-3731,283,1
...,...,...,...,...,...,...,...,...,...,...,...,...
147725,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-5933,2513,1
147726,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-5933,2514,1
147727,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-5933,2577,1
147728,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-5933,2579,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147730 entries, 0 to 147729
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Brend           147730 non-null  object
 1   Cena            147730 non-null  int64 
 2   Godiste         147730 non-null  int64 
 3   Gorivo          147730 non-null  object
 4   Karoserija      147730 non-null  object
 5   Kilometraza     147730 non-null  int64 
 6   Kubikaza        147730 non-null  int64 
 7   Model           147730 non-null  object
 8   Snaga           147730 non-null  int64 
 9   car_id          147730 non-null  int64 
 10  user_id         147730 non-null  int64 
 11  event_strength  147730 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 14.7+ MB


In [16]:
grouped_df = df.groupby(['user_id', 'car_id','Model','Godiste'])['event_strength'].sum().reset_index()
grouped_df

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
0,0,-6379,YPSILON,2000,3
1,0,-5276,LAGUNA,2001,14
2,0,54456,206,2000,9
3,1,-4623,TWINGO,2002,4
4,1,-2254,C5,2002,17
...,...,...,...,...,...
18970,4276,-3559,INSIGNIA,2017,5
18971,4276,52354,3,2017,2
18972,4276,78371,INSIGNIA,2018,1
18973,4277,-8363,PASSAT B8,2017,3


In [40]:
interactions_train_df, interactions_test_df = train_test_split(df,
                                   stratify=df['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 118184
# interactions on Test set: 29546


In [45]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = df.set_index('user_id')
interactions_train_indexed_df = df.set_index('user_id')
interactions_test_indexed_df = df.set_index('user_id')

In [52]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = df.loc[person_id]['car_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [53]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(df['car_id'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['car_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['car_id'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['car_id'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['car_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['car_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()

In [17]:
grouped_df['car_id'] = grouped_df['car_id'].astype('category')
grouped_df['user_id'] = grouped_df['user_id'].astype('category')

grouped_df['car_id'] = grouped_df['car_id'].cat.codes
grouped_df['user_id'] = grouped_df['user_id'].cat.codes

sparse_content_person = sparse.csr_matrix((grouped_df['event_strength'].astype(float), (grouped_df['car_id'], grouped_df['user_id'])))
sparse_person_content = sparse.csr_matrix((grouped_df['event_strength'].astype(float), (grouped_df['user_id'], grouped_df['car_id'])))
print(sparse_content_person)

  (0, 29)	2.0
  (0, 31)	2.0
  (0, 3089)	2.0
  (0, 3090)	2.0
  (0, 3092)	2.0
  (0, 3093)	2.0
  (1, 2509)	1.0
  (1, 2513)	1.0
  (1, 2514)	1.0
  (1, 2516)	1.0
  (1, 2582)	1.0
  (1, 2586)	1.0
  (2, 753)	4.0
  (2, 755)	4.0
  (2, 757)	4.0
  (2, 849)	4.0
  (2, 939)	4.0
  (2, 940)	4.0
  (2, 949)	4.0
  (2, 3743)	4.0
  (2, 3747)	4.0
  (2, 3748)	4.0
  (2, 3904)	4.0
  (2, 3907)	4.0
  (3, 664)	8.0
  :	:
  (951, 1047)	6.0
  (951, 1073)	6.0
  (951, 1083)	6.0
  (951, 1157)	6.0
  (951, 1166)	6.0
  (951, 1175)	6.0
  (951, 1178)	6.0
  (951, 1181)	6.0
  (951, 2222)	6.0
  (951, 2242)	6.0
  (951, 2313)	6.0
  (951, 2413)	6.0
  (951, 2419)	6.0
  (951, 3925)	6.0
  (951, 3929)	6.0
  (951, 3931)	6.0
  (951, 3933)	6.0
  (951, 3987)	6.0
  (951, 3996)	6.0
  (951, 4001)	6.0
  (951, 4056)	6.0
  (951, 4064)	6.0
  (951, 4070)	6.0
  (951, 4080)	6.0
  (951, 4081)	6.0


In [18]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_content_person * alpha).astype('double')
model.fit(data)



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




## Finding similar cars

In [19]:
content_id = 69 
n_similar = 5

person_vecs = model.user_factors
content_vecs = model.item_factors

content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

scores = content_vecs.dot(content_vecs[content_id]) / content_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

for content in similar:
    idx, score = content
    print(grouped_df[grouped_df['car_id']==idx][['Model','Godiste']].iloc[0].to_string())
    print('\n')

Model      ASTRA J
Godiste       2010


Model      LEON
Godiste    2010


Model      CIVIC
Godiste     2009


Model      IBIZA
Godiste     2009


Model      FIESTA
Godiste      2010




## Recommending to users

### Built in function from implicit library

In [20]:
# recommend items for a user
user_items = data.T.tocsr()


def recommend(user_id,user_items):
    recommendations = model.recommend(user_id, user_items)
    models = []
    years = []
    scores = []
    for idx in recommendations:
            # Append titles and scores to the list
            models.append(grouped_df[grouped_df['car_id']==idx[0]]['Model'].iloc[0])
            years.append(grouped_df[grouped_df['car_id']==idx[0]]['Godiste'].iloc[0])
            scores.append(idx[1])
    recommendations = pd.DataFrame({'Models': models, 'Year': years, 'Score': scores})

    return recommendations

print(recommend(69,user_items))
# find related items
#related = model.similar_items(itemid)

    Models  Year     Score
0  ASTRA G  2002  1.062671
1      307  2003  1.033928
2  ASTRA G  2003  0.894495
3      206  2002  0.867884
4      206  2003  0.845941
5   MEGANE  2002  0.789503
6  CORSA C  2002  0.752325
7      307  2002  0.668895
8   GOLF 4  2002  0.639328
9      307  2005  0.632701


### Manual function

In [21]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_person_content[person_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = person_vecs[person_id,:].dot(content_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    models = []
    years = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        models.append(grouped_df[grouped_df['car_id']==idx]['Model'].iloc[0])
        years.append(grouped_df[grouped_df['car_id']==idx]['Godiste'].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'Models': models, 'Year': years, 'Score': scores})

    return recommendations
    
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 50
person_id = 69

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

    Models  Year     Score
0  ASTRA G  2002  1.000000
1      307  2003  0.982952
2  ASTRA G  2003  0.900250
3      206  2002  0.884467
4      206  2003  0.871452
5   MEGANE  2002  0.837977
6  CORSA C  2002  0.815925
7      307  2002  0.766441
8   GOLF 4  2002  0.748904
9      307  2005  0.744973


What did user with id=69 interacted with?

In [22]:
grouped_df[grouped_df['user_id']==69]

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
312,69,165,147,2001,8
313,69,566,PRIUS +,2003,22
314,69,821,A3,2003,8
315,69,846,CIVIC,2002,4


Let's try another one

In [23]:
# Create recommendations for person with id 
person_id = 1000

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

      Models  Year     Score
0  PASSAT B8  2015  0.922294
1        520  2015  0.883718
2        508  2015  0.870946
3         A6  2015  0.826038
4          6  2015  0.803775
5     MONDEO  2015  0.801395
6        520  2014  0.793535
7         A6  2014  0.782096
8     GOLF 7  2014  0.746197
9  PASSAT CC  2014  0.727268


In [24]:
grouped_df[grouped_df['user_id']==1000]

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
4421,1000,316,C 220,2014,1
4422,1000,615,B 180,2015,1
4423,1000,801,GOLF 7,2015,11


## Evaluate the Recommender System

In [42]:
from sklearn import metrics
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of item,user index into list

    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    content_inds = [index[0] for index in samples] # Get the item row indices

    person_inds = [index[1] for index in samples] # Get the user column indices

    
    training_set[content_inds, person_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(person_inds))

In [26]:
content_train, content_test, content_persons_altered = make_train(sparse_content_person, pct_test = 0.2)

In [27]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [28]:
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    content_vecs = predictions[1]
    for person in altered_persons: # Iterate through each user that had an item altered
        training_column = training_set[:,person].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_contents[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [29]:
calc_mean_auc(content_train, content_persons_altered,
              [person_vecs, content_vecs.T], content_test)

(0.992, 0.744)

### Popularity model

In [30]:
#Computes the most popular items
item_popularity_df = grouped_df.groupby(['car_id','Model','Godiste'])['event_strength'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,car_id,Model,Godiste,event_strength
0,567,MEGANE,2004,2158
1,617,307,2004,1780
2,113,PASSAT B8,2015,1572
3,556,GOLF 5,2005,1518
4,844,206,2004,1476
5,486,GOLF 7,2014,1424
6,491,STILO,2004,1311
7,341,PRIUS +,2008,1298
8,88,ASTRA H,2006,1220
9,585,GOLF 4,2002,1188


In [50]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['car_id'].isin(items_to_ignore)] \
                               .sort_values('event_strength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'car_id', 
                                                          right_on = 'car_id')[['event_strength', 'car_id', 'Model', 'Godiste', 'Cena']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, df)

In [54]:
model_evaluator = ModelEvaluator()
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...


KeyboardInterrupt: 