In [217]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares

In [218]:
utb = pd.read_csv('./hetrec2011-delicious-2k/user_taggedbookmarks.dat', sep='\t')
bookmarks = pd.read_csv('./hetrec2011-delicious-2k/bookmarks.dat', sep='\t', encoding='iso-8859-15')

In [219]:
utb = utb[['userID', 'bookmarkID', 'tagID']]
utb.head()

Unnamed: 0,userID,bookmarkID,tagID
0,8,1,1
1,8,2,1
2,8,7,1
3,8,7,6
4,8,7,7


### Составим словарь соответствия названия закладки и ее идентификатора

In [220]:
bookmarks = bookmarks[['id', 'urlPrincipal']]
bookmarks.columns = ['bookmarkID', 'bookmark']

bookmark_id_name = {}
for index, row in tqdm_notebook(bookmarks.iterrows()):
    bookmark_id_name[row.bookmarkID] = row.bookmark

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Сгруппируем кол-во проставленных тегов пользователями каждой закладке

In [221]:
utb_grouped = utb.groupby(['userID', 'bookmarkID'])['tagID'].count().reset_index(name='tags')

### Назначим новые идентификаторы пользователей и закладок

In [222]:
utb_grouped['user_id'] = utb_grouped['userID'].astype("category").cat.codes
utb_grouped['bookmark_id'] = utb_grouped['bookmarkID'].astype("category").cat.codes

In [223]:
bookm_new_old = utb_grouped[['bookmarkID', 'bookmark_id']]

In [224]:
users_new_old = utb_grouped[['userID', 'user_id']]

In [225]:
utb_grouped = utb_grouped.drop(['userID', 'bookmarkID'], axis=1)

In [226]:
usr = utb_grouped['user_id'].unique().tolist()
bkm = utb_grouped['bookmark_id'].unique().tolist()
values = utb_grouped['tags'].tolist()
rows = utb_grouped['user_id'].tolist()
columns = utb_grouped['bookmark_id'].tolist()

### Формируем матрицу

In [227]:
user_data = sparse.csr_matrix((values, (rows, columns)), shape = (len(usr), len(bkm)))

### Строим рекоммендательную систему

In [228]:
als = AlternatingLeastSquares(factors=50)

In [229]:
als.fit(user_data)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




### Тестируем

In [230]:
userid = 0

user_items = user_data.T.tocsr()
recommendations = als.recommend(userid, user_items)

In [231]:
recommendations

[(342, 0.062230244),
 (1455, 0.042787075),
 (820, 0.040206846),
 (240, 0.038243446),
 (1405, 0.03194939),
 (1061, 0.030695967),
 (1843, 0.030123824),
 (1838, 0.02787574),
 (341, 0.026094625),
 (1770, 0.025996001)]

### Выведем рекоммендации

In [232]:
for r in recommendations:
    bm = bookm_new_old[bookm_new_old['bookmark_id']==r[0]]['bookmarkID'].unique()[0]
    print(bookmark_id_name[bm])

www.scientificamerican.com
joseluisavilaherrera.blogspot.com
www.paulgraham.com
patterns.littlespringsdesign.com
nvie.com
networkcultures.org
en.wikipedia.org
www.artelino.com
www.adamsmithesq.com
www.w3.org


### Сравним с тем, что пользователь сохранил сам

In [233]:
us_test = users_new_old[users_new_old['user_id']==r[0]]['userID'].unique()[0]
bm_test = utb[utb['userID']==us_test]['bookmarkID'].unique()

for i in bm_test:
    print(bookmark_id_name[i])

www.canlitforkids.com
media.commonsensemedia.org
www.lappscoachshop.com
www.focus.com
download.cnet.com
blog.tagliaerbe.com
www.43places.com
www.amazon.co.uk
www.fonyou.com
www.reddit.com
speedanatomy.blogspot.com
startingstrength.wikia.com
activitatsbcn.com
www.bmibaby.com
www.youtube.com
yclist.com
unfuddle.com
tbaggery.com
jeffkreeftmeijer.com
www.adwhirl.com
www.padpressed.com
mashable.com
www.appmakr.com
www.philcoffman.com
milkandeggsco.com
www.apphalloffame.com
dannpetty.com
silentuproar.com
www.projectthirtythree.com
bjango.com
verticalrhythm.org
adamgrano.com
lequick.net
mauiguidebook.com
www.effektivedesign.co.uk
www.larsahrens.com
www.goodfuckingdesignadvice.com
www.kellerhouse.com
inspiredology.com
gettingreal.37signals.com
vimeo.com
www.diesel.com
www.struckaxiom.com
www.thegroop.net
narrowdesign.com
eatocracy.cnn.com
www.google.com
www.geekanthem.com
albumtacos.tumblr.com
igniteshow.com
www.zurb.com
www.jordanbutcher.com
getcloudapp.com
diythemes.com
pick.im
mixergy.com
v