In [41]:
from utils import load_data
import pandas as pd
import numpy as np
from scipy import sparse
import implicit
%matplotlib inline

In [42]:
df = load_data("./active1000")
df["time"] = df["time"].apply(
    lambda date: pd.Timestamp(date, unit="s", tz="Europe/Oslo")
)
df.drop(index=df[df.url == "http://adressa.no"].index, inplace=True)


In [43]:
def dataframe_to_user_item_matrix(df, common_users):
    df = df.drop_duplicates(subset=['userId', 'documentId'])
    df = df.sort_values(by=['userId', 'time'])

    n_users = df['userId'].nunique()
    n_items = df['documentId'].count()

    ratings = np.zeros((n_users, n_items))

    new_user = df['userId'].values[1:] != df['userId'].values[:-1]
    new_user = np.r_[True, new_user]

    df['uid'] = np.cumsum(new_user)
    item_ids = df['documentId'].unique().tolist()

    new_df = pd.DataFrame({'documentId': item_ids, 'tid': range(1, len(item_ids) + 1)})

    df = pd.merge(df, new_df, on='documentId', how='outer')
    df_ext = df[['uid', 'tid']]

    users_df = df[df["userId"].isin(common_users)]["uid"].unique()
    user_ids = set()

    for row in df_ext.itertuples():
        ratings[row[1] - 1, row[2] - 1] = 1.0

        if row[1] in users_df:
            user_ids.add(row[1] - 1)

    print(f"Matrix created with {ratings.shape[0]} users and {ratings.shape[1]} items")
    return ratings, user_ids, item_ids

user_item_data, common_user_ids, item_ids = dataframe_to_user_item_matrix(df, df['userId'])
user_item_data = sparse.csr_matrix(user_item_data)

Matrix created with 1000 users and 679355 items


In [33]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(user_item_data)

100%|██████████| 15/15 [00:06<00:00,  2.43it/s]


In [44]:
recommendations = model.recommend(0, user_item_data[0], filter_already_liked_items=True)
ids = list(map(lambda x: item_ids[x], list(recommendations[0])))
print(list(zip(ids, list(recommendations[1]))))

#print(df[df['documentId'] == ids[0]])

[('dff139999bfd211c174369acab297bf0e72542ba', 0.8169841), ('d967e881e0f31e0041e36748286e648c2c0e6deb', 0.7669275), ('b36546800ae916692cfcbbe0a3685626cbc08dd7', 0.70704335), ('4b66c82c4be5d899376b929dc50a915ae7e66adf', 0.701559), ('094e3ca8251f2a81626da7af88e25ef03ae7bd86', 0.68695766), ('8136cd1ab60e0480c620873342a93dee57c737a2', 0.67059726), ('9c5c3a14fb5632edf4e2cb22d74c15fbf9c74062', 0.6554182), ('ad6c71c5fdf87d6f1eb11ab12182582b01d14fd2', 0.63521385), ('58b7e6996c181156f5b066587fb349f5680dceed', 0.6328052), ('96556526ad338ff1faec5d9d804f80b8a8e18ca3', 0.61792165)]


In [45]:

df[df['userId'] == 'cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7'].head()

Unnamed: 0,eventId,category,activeTime,title,url,userId,publishtime,time,documentId
16839,2102219828,nyheter|trondheim,,Følg Kystad-saken i kontrollkomiteen,http://adressa.no/nyheter/trondheim/2017/03/21...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,2017-03-21T12:46:38.000Z,2017-03-21 17:27:35+01:00,b3a814c5b94fd49685d1e14c526b37c8bc91426a
16843,1908908574,nyheter|trondheim,1.0,Følg Kystad-saken i kontrollkomiteen,http://adressa.no/nyheter/trondheim/2017/03/21...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,2017-03-21T12:46:38.000Z,2017-03-21 17:27:40+01:00,b3a814c5b94fd49685d1e14c526b37c8bc91426a
20191,1823302050,nyheter|trondheim,30.0,Følg Kystad-saken i kontrollkomiteen,http://adressa.no/nyheter/trondheim/2017/03/21...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,2017-03-21T12:46:38.000Z,2017-03-21 19:31:10+01:00,b3a814c5b94fd49685d1e14c526b37c8bc91426a
20248,714858874,nyheter|okonomi,42.0,Jon Uthus slutter i NHO Trøndelag,http://adressa.no/nyheter/okonomi/2017/03/21/j...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,2017-03-21T16:35:30.000Z,2017-03-21 19:32:59+01:00,cca8c05905de83c08cf6d9e9dd5b0ab0a5b3bdc2
20269,1903458095,pluss|okonomi,148.0,Naboene på Singsaker protesterte mot utbygging...,http://adressa.no/pluss/okonomi/2017/03/21/van...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,2017-03-21T17:16:20.000Z,2017-03-21 19:33:50+01:00,8755fc8cac27dd259bef58e36c955b793c8ee27a


In [38]:
itemid = 140
related = model.similar_items(itemid)
related

(array([ 140, 1697, 2606,  710, 2365, 3211, 2508, 1083, 1172, 2835],
       dtype=int32),
 array([0.9999999 , 0.69081384, 0.6547633 , 0.6425652 , 0.6399577 ,
        0.6256969 , 0.62289613, 0.61278063, 0.6058873 , 0.6052868 ],
       dtype=float32))

In [37]:
user_id = 1
related_users = model.similar_users(user_id)
related_users

(array([  1, 630, 617, 246, 645, 685, 757, 842, 559, 880], dtype=int32),
 array([1.0000001 , 0.6046029 , 0.60427755, 0.5785596 , 0.5541975 ,
        0.55398583, 0.5290943 , 0.5240099 , 0.5210381 , 0.5182514 ],
       dtype=float32))