##### References
https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-clickstream-data-dffc86c8c65


In [21]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler

In [22]:
cars = pd.read_csv('CarIds.csv')
users = pd.read_csv('users.csv')

In [23]:
cars.drop('Unnamed: 0', axis=1, inplace=True)
users['car_id'] = users['carId']
users.drop('carId',axis=1, inplace=True)

In [24]:
users

Unnamed: 0,user_id,car_id
0,0,GOLF 4_1999
1,0,156_2000
2,0,WAGON R+_1999
3,0,CIVIC_2001
4,0,FIESTA_1999
...,...,...
17134,4372,TOUAREG_2008
17135,4372,ML KLASA_2006
17136,4372,PRIUS +_2007
17137,4373,X5_2006


See how many users we have in our dataset

In [25]:
users['user_id'].nunique()

4374

See how many cars we have in our dataset

In [26]:
cars

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007
1,ALFA ROMEO,2850,2006,Dizel,Hecbek,222000,1910,147,150,147_2006
2,ALFA ROMEO,1850,2004,Dizel,Limuzina,178000,1910,147,116,147_2004
3,ALFA ROMEO,1700,2004,Dizel,Hecbek,156906,1910,147,116,147_2004
4,ALFA ROMEO,1700,2002,Dizel,Hecbek,272000,1900,147,116,147_2002
...,...,...,...,...,...,...,...,...,...,...
35234,VOLKSWAGEN,4200,2006,Dizel,Monovolumen (MiniVan),251000,1900,TOURAN,105,TOURAN_2006
35235,VOLKSWAGEN,3550,2005,Dizel,Monovolumen (MiniVan),259000,1896,TOURAN,105,TOURAN_2005
35236,VOLKSWAGEN,7700,2011,Dizel,Monovolumen (MiniVan),214000,1598,TOURAN,105,TOURAN_2011
35237,VOLKSWAGEN,4490,2007,Dizel,Monovolumen (MiniVan),210000,1890,TOURAN,105,TOURAN_2007


Out of 35239, we have a lot of models with the same id because it is set only for model and year of a car. 

Now we can change that and reduce the number of cars by removing duplicate cars based on year and volume,  so we are left only with the cars that has same year of production and different engine volume.

In [27]:
cars.drop_duplicates(subset = ['Snaga', 'car_id'], inplace=True)

In [28]:
cars

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007
1,ALFA ROMEO,2850,2006,Dizel,Hecbek,222000,1910,147,150,147_2006
2,ALFA ROMEO,1850,2004,Dizel,Limuzina,178000,1910,147,116,147_2004
4,ALFA ROMEO,1700,2002,Dizel,Hecbek,272000,1900,147,116,147_2002
5,ALFA ROMEO,2000,2005,Dizel,Kupe,189500,1910,147,150,147_2005
...,...,...,...,...,...,...,...,...,...,...
35178,VOLKSWAGEN,5000,2007,Benzin,Monovolumen (MiniVan),156000,1400,TOURAN,150,TOURAN_2007
35205,VOLKSWAGEN,3600,2006,Dizel,Limuzina,226800,1980,TOURAN,150,TOURAN_2006
35206,VOLKSWAGEN,11300,2014,Dizel,Monovolumen (MiniVan),118700,1968,TOURAN,140,TOURAN_2014
35212,VOLKSWAGEN,3699,2006,Dizel,Monovolumen (MiniVan),209659,1900,TOURAN,120,TOURAN_2006


In [29]:
cars['car_id'].value_counts()

MEGANE_2004       26
MEGANE_2005       25
GOLF 5_2004       25
MEGANE_2002       23
MEGANE_2003       23
                  ..
PASSAT CC_2016     1
RAV 4_2016         1
MONDEO_2017        1
ML KLASA_2000      1
PASSAT B8_2018     1
Name: car_id, Length: 1266, dtype: int64

In [30]:
df = pd.merge(cars, users[['car_id','user_id']], how = 'inner', on = 'car_id')

In [31]:
df

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id,user_id
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,448
1,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,512
2,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,147_2007,539
3,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,147_2007,448
4,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,147_2007,512
...,...,...,...,...,...,...,...,...,...,...,...
147131,VOLKSWAGEN,10000,2014,Dizel,Monovolumen (MiniVan),225000,1596,TOURAN,105,TOURAN_2014,1111
147132,VOLKSWAGEN,13500,2014,Metan CNG,Monovolumen (MiniVan),77000,1390,TOURAN,150,TOURAN_2014,1111
147133,VOLKSWAGEN,11500,2014,Dizel,Monovolumen (MiniVan),190000,2000,TOURAN,177,TOURAN_2014,1111
147134,VOLKSWAGEN,6499,2014,Dizel,Hecbek,299648,1900,TOURAN,107,TOURAN_2014,1111


In [32]:
df['event_strength'] = 1

In [33]:
df['car_id'] = df['car_id'].apply(lambda x: int(str(hash(x))[0:5]))

In [35]:
df

Unnamed: 0,Brend,Cena,Godiste,Gorivo,Karoserija,Kilometraza,Kubikaza,Model,Snaga,car_id,user_id,event_strength
0,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-8255,448,1
1,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-8255,512,1
2,ALFA ROMEO,2150,2007,Dizel,Hecbek,215000,1910,147,120,-8255,539,1
3,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,-8255,448,1
4,ALFA ROMEO,4000,2007,Dizel,Hecbek,168683,1900,147,150,-8255,512,1
...,...,...,...,...,...,...,...,...,...,...,...,...
147131,VOLKSWAGEN,10000,2014,Dizel,Monovolumen (MiniVan),225000,1596,TOURAN,105,90113,1111,1
147132,VOLKSWAGEN,13500,2014,Metan CNG,Monovolumen (MiniVan),77000,1390,TOURAN,150,90113,1111,1
147133,VOLKSWAGEN,11500,2014,Dizel,Monovolumen (MiniVan),190000,2000,TOURAN,177,90113,1111,1
147134,VOLKSWAGEN,6499,2014,Dizel,Hecbek,299648,1900,TOURAN,107,90113,1111,1


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147136 entries, 0 to 147135
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Brend           147136 non-null  object
 1   Cena            147136 non-null  int64 
 2   Godiste         147136 non-null  int64 
 3   Gorivo          147136 non-null  object
 4   Karoserija      147136 non-null  object
 5   Kilometraza     147136 non-null  int64 
 6   Kubikaza        147136 non-null  int64 
 7   Model           147136 non-null  object
 8   Snaga           147136 non-null  int64 
 9   car_id          147136 non-null  int64 
 10  user_id         147136 non-null  int64 
 11  event_strength  147136 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 14.6+ MB


In [38]:
grouped_df = df.groupby(['user_id', 'car_id','Model','Godiste'])['event_strength'].sum().reset_index()
grouped_df

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
0,0,-8542,CIVIC,2001,2
1,0,68383,156,2000,7
2,0,81744,530,2000,3
3,1,-6430,VECTRA B,2000,8
4,1,-4821,QASHQAI + 2,2000,8
...,...,...,...,...,...
16553,4372,-7975,TOUAREG,2008,6
16554,4372,-1646,PRIUS +,2007,17
16555,4372,82482,X5,2007,5
16556,4373,-9267,ML KLASA,2006,4


In [39]:
df['car_id'] = df['car_id'].astype('category')
df['user_id'] = df['user_id'].astype('category')

df['car_id'] = df['car_id'].cat.codes
df['user_id'] = df['user_id'].cat.codes

sparse_content_person = sparse.csr_matrix((df['event_strength'].astype(float), (df['car_id'], df['user_id'])))
sparse_person_content = sparse.csr_matrix((df['event_strength'].astype(float), (df['user_id'], df['car_id'])))

In [18]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_content_person * alpha).astype('double')
model.fit(data)



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [19]:
grouped_df[grouped_df['Model']=='CIVIC']

Unnamed: 0,user_id,car_id,Model,Godiste,event_strength
0,0,-8542,CIVIC,2001,2
10,3,-8542,CIVIC,2001,2
123,45,-2075,CIVIC,2002,4
171,58,28795,CIVIC,2000,1
466,135,-4390,CIVIC,2003,5
499,143,-4390,CIVIC,2003,5
510,147,-8542,CIVIC,2001,2
515,148,-8542,CIVIC,2001,2
779,215,-4390,CIVIC,2003,5
785,216,-4390,CIVIC,2003,5


In [20]:
content_id = 864
n_similar = 5

person_vecs = model.user_factors
content_vecs = model.item_factors

content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

scores = content_vecs.dot(content_vecs[content_id]) / content_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

for content in similar:
    idx, score = content
    print(grouped_df[grouped_df['car_id']==idx][['Model','Godiste']].iloc[0].to_string())
    print('\n')

IndexError: single positional indexer is out-of-bounds