##### References
https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-clickstream-data-dffc86c8c65


In [8]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler

In [9]:
cars = pd.read_csv('CarIds.csv')
users = pd.read_csv('users.csv')

In [10]:
cars.drop('Unnamed: 0', axis=1, inplace=True)
users['car_id'] = users['carId']
users.drop('carId',axis=1, inplace=True)

In [11]:
users

Unnamed: 0,user_id,car_id
0,0,YPSILON_2000
1,0,LAGUNA_2001
2,0,206_2000
3,1,C5_2002
4,1,TWINGO_2002
...,...,...
18970,4276,3_2017
18971,4276,INSIGNIA_2018
18972,4276,INSIGNIA_2017
18973,4277,INSIGNIA_2017


See how many users we have in our dataset

In [12]:
users['user_id'].nunique()

4278

See how many cars we have in our dataset

In [13]:
cars

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007
1,ALFA ROMEO,2850,2006,Dizel,Hecbek,222000,1910,147,150,147_2006
2,ALFA ROMEO,1850,2004,Dizel,Limuzina,178000,1910,147,116,147_2004
3,ALFA ROMEO,1700,2004,Dizel,Hecbek,156906,1910,147,116,147_2004
4,ALFA ROMEO,1700,2002,Dizel,Hecbek,272000,1900,147,116,147_2002
...,...,...,...,...,...,...,...,...,...,...
35234,VOLKSWAGEN,4200,2006,Dizel,Monovolumen (MiniVan),251000,1900,TOURAN,105,TOURAN_2006
35235,VOLKSWAGEN,3550,2005,Dizel,Monovolumen (MiniVan),259000,1896,TOURAN,105,TOURAN_2005
35236,VOLKSWAGEN,7700,2011,Dizel,Monovolumen (MiniVan),214000,1598,TOURAN,105,TOURAN_2011
35237,VOLKSWAGEN,4490,2007,Dizel,Monovolumen (MiniVan),210000,1890,TOURAN,105,TOURAN_2007


Out of 35239, we have a lot of models with the same id because it is set only for model and year of a car. 

Now we can change that and reduce the number of cars by removing duplicate cars based on year and volume,  so we are left only with the cars that has same year of production and different engine volume.

In [14]:
cars.drop_duplicates(subset = ['Snaga', 'car_id'], inplace=True)

In [15]:
cars

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007
1,ALFA ROMEO,2850,2006,Dizel,Hecbek,222000,1910,147,150,147_2006
2,ALFA ROMEO,1850,2004,Dizel,Limuzina,178000,1910,147,116,147_2004
4,ALFA ROMEO,1700,2002,Dizel,Hecbek,272000,1900,147,116,147_2002
5,ALFA ROMEO,2000,2005,Dizel,Kupe,189500,1910,147,150,147_2005
...,...,...,...,...,...,...,...,...,...,...
35178,VOLKSWAGEN,5000,2007,Benzin,Monovolumen (MiniVan),156000,1400,TOURAN,150,TOURAN_2007
35205,VOLKSWAGEN,3600,2006,Dizel,Limuzina,226800,1980,TOURAN,150,TOURAN_2006
35206,VOLKSWAGEN,11300,2014,Dizel,Monovolumen (MiniVan),118700,1968,TOURAN,140,TOURAN_2014
35212,VOLKSWAGEN,3699,2006,Dizel,Monovolumen (MiniVan),209659,1900,TOURAN,120,TOURAN_2006


In [16]:
cars['car_id'].value_counts()

MEGANE_2004      26
GOLF 5_2004      25
MEGANE_2005      25
MEGANE_2003      23
MEGANE_2002      23
                 ..
BRAVO_2014        1
B 180_2016        1
E 220_2012        1
WAGON R+_2017     1
C5_2014           1
Name: car_id, Length: 1266, dtype: int64

In [17]:
df = pd.merge(cars, users[['car_id','user_id']], how = 'inner', on = 'car_id')

In [18]:
df

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id,user_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,283
1,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,325
2,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,326
3,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,411
4,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,147_2007,283
...,...,...,...,...,...,...,...,...,...,...,...
147725,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2513
147726,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2514
147727,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2577
147728,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,TOURAN_2017,2579


In [19]:
df['event_strength'] = 1

In [20]:
df['car_id'] = df['car_id'].apply(lambda x: int(str(hash(x))[0:5]))

In [21]:
df

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id,user_id,event_strength
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-2921,283,1
1,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-2921,325,1
2,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-2921,326,1
3,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-2921,411,1
4,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,-2921,283,1
...,...,...,...,...,...,...,...,...,...,...,...,...
147725,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-1118,2513,1
147726,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-1118,2514,1
147727,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-1118,2577,1
147728,VOLKSWAGEN,13200,2017,Dizel,Monovolumen (MiniVan),149341,1598,TOURAN,116,-1118,2579,1


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147730 entries, 0 to 147729
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Brend           147730 non-null  object
 1   Cena            147730 non-null  int64 
 2   Godiste         147730 non-null  int64 
 3   Gorivo          147730 non-null  object
 4   Karoserija      147730 non-null  object
 5   Kilometraza     147730 non-null  int64 
 6   Kubikaza        147730 non-null  int64 
 7   Model           147730 non-null  object
 8   Snaga           147730 non-null  int64 
 9   car_id          147730 non-null  int64 
 10  user_id         147730 non-null  int64 
 11  event_strength  147730 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 14.7+ MB


In [23]:
grouped_df = df.groupby(['user_id', 'car_id','Model','Godiste'])['event_strength'].sum().reset_index()
grouped_df

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
0,0,-3547,206,2000,9
1,0,-1082,LAGUNA,2001,14
2,0,84150,YPSILON,2000,3
3,1,-9146,TWINGO,2002,4
4,1,-7340,MEGANE,2002,23
...,...,...,...,...,...
18970,4276,-3877,INSIGNIA,2018,1
18971,4276,75410,INSIGNIA,2017,5
18972,4276,86902,3,2017,2
18973,4277,-4115,PASSAT B8,2017,3


In [77]:
grouped_df['car_id'] = grouped_df['car_id'].astype('category')
grouped_df['user_id'] = grouped_df['user_id'].astype('category')

grouped_df['car_id'] = grouped_df['car_id'].cat.codes
grouped_df['user_id'] = grouped_df['user_id'].cat.codes

sparse_content_person = sparse.csr_matrix((grouped_df['event_strength'].astype(float), (grouped_df['car_id'], grouped_df['user_id'])))
sparse_person_content = sparse.csr_matrix((grouped_df['event_strength'].astype(float), (grouped_df['user_id'], grouped_df['car_id'])))
print(sparse_content_person.count_nonzero())

18975


In [74]:
np.isnan(sparse_content_person.toarray()).sum()/np.prod(sparse_content_person.shape)

0.0

In [25]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_content_person * alpha).astype('double')
model.fit(data)



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




## Finding similar cars

In [68]:
content_id = 69 
n_similar = 5

person_vecs = model.user_factors
content_vecs = model.item_factors

content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

scores = content_vecs.dot(content_vecs[content_id]) / content_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

for content in similar:
    idx, score = content
    print(grouped_df[grouped_df['car_id']==idx][['Model','Godiste']].iloc[0].to_string())
    print('\n')

Model        A4
Godiste    2013


Model      C 200
Godiste     2013


Model         6
Godiste    2013


Model      B 180
Godiste     2013


Model      PASSAT CC
Godiste         2013




## Reccomending to users

### Built in function from implicit library

In [69]:
# recommend items for a user
user_items = data.T.tocsr()


def recommend(user_id,user_items):
    recommendations = model.recommend(user_id, user_items)
    models = []
    years = []
    scores = []
    for idx in recommendations:
            # Append titles and scores to the list
            models.append(grouped_df[grouped_df['car_id']==idx[0]]['Model'].iloc[0])
            years.append(grouped_df[grouped_df['car_id']==idx[0]]['Godiste'].iloc[0])
            scores.append(idx[1])
    recommendations = pd.DataFrame({'Models': models, 'Year': years, 'Score': scores})

    return recommendations

print(recommend(69,user_items))
# find related items
#related = model.similar_items(itemid)

    Models  Year     Score
0  ASTRA G  2002  1.098739
1      307  2003  1.080114
2  ASTRA G  2003  1.012805
3      206  2002  0.976417
4      206  2003  0.927238
5  CORSA C  2002  0.924675
6   MEGANE  2002  0.810943
7   GOLF 4  2002  0.773007
8      307  2002  0.683582
9   MEGANE  2004  0.673328


### Manual function

In [43]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_person_content[person_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = person_vecs[person_id,:].dot(content_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    models = []
    years = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        models.append(grouped_df[grouped_df['car_id']==idx]['Model'].iloc[0])
        years.append(grouped_df[grouped_df['car_id']==idx]['Godiste'].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'Models': models, 'Year': years, 'Score': scores})

    return recommendations
    
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 50
person_id = 69

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

    Models  Year     Score
0  ASTRA G  2002  1.000000
1      307  2003  0.988992
2  ASTRA G  2003  0.949211
3      206  2002  0.927704
4      206  2003  0.898638
5  CORSA C  2002  0.897123
6   MEGANE  2002  0.829905
7   GOLF 4  2002  0.807484
8      307  2002  0.754631
9   MEGANE  2004  0.748571


What did user with id=69 interacted with?

In [45]:
grouped_df[grouped_df['user_id']==69]

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
312,69,118,PRIUS +,2003,22
313,69,501,147,2001,8
314,69,554,A3,2003,8
315,69,571,CIVIC,2002,4


Let's try another one

In [48]:
# Create recommendations for person with id 
person_id = 1000

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

      Models  Year     Score
0  PASSAT B8  2015  0.930196
1        508  2015  0.900358
2        520  2015  0.865676
3          6  2015  0.818392
4        520  2014  0.806992
5         A6  2014  0.797883
6         A6  2015  0.794534
7     MONDEO  2015  0.789984
8     GOLF 7  2014  0.739318
9  PASSAT CC  2014  0.736043


In [49]:
grouped_df[grouped_df['user_id']==1000]

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
4421,1000,278,C 220,2014,1
4422,1000,340,B 180,2015,1
4423,1000,878,GOLF 7,2015,11


## Evaluate the Recommender System

In [55]:
from sklearn import metrics
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of item,user index into list

    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    content_inds = [index[0] for index in samples] # Get the item row indices

    person_inds = [index[1] for index in samples] # Get the user column indices

    
    training_set[content_inds, person_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(person_inds))

In [51]:
content_train, content_test, content_persons_altered = make_train(sparse_content_person, pct_test = 0.2)

In [52]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [53]:
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    content_vecs = predictions[1]
    for person in altered_persons: # Iterate through each user that had an item altered
        training_column = training_set[:,person].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_contents[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [56]:
calc_mean_auc(content_train, content_persons_altered,
              [person_vecs, content_vecs.T], content_test)

(0.992, 0.744)