In [58]:
from utils import load_data
import pandas as pd
from lenskit.algorithms import Recommender, user_knn as knn
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.metrics.predict import user_metric, rmse
import numpy as np
from scipy import sparse
import implicit
%matplotlib inline

In [59]:
df = load_data("./active1000")
df["time"] = df["time"].apply(
    lambda date: pd.Timestamp(date, unit="s", tz="Europe/Oslo")
)
df.head()


Unnamed: 0,eventId,category,activeTime,title,url,userId,publishtime,time,documentId
0,2029361762,,,,http://adressa.no,cx:i9i5zdr4ns9bm4ky:2by1rj0hmjgy,,2017-01-01 00:00:27+01:00,
1,351617637,,41.0,,http://adressa.no,cx:iimz2wwcwxu7d721:2r8odp9zhg5yp,,2017-01-01 00:02:00+01:00,
2,483221791,,,,http://adressa.no/nyheter/trondheim/2016/12/31...,cx:iimz2wwcwxu7d721:2r8odp9zhg5yp,,2017-01-01 00:02:08+01:00,
3,929295193,,11.0,,http://adressa.no,cx:idlec1eyfbpru62o:2yhui5zv6k12b,,2017-01-01 00:02:14+01:00,
4,1908339310,,,,http://adressa.no/pluss/kultur/2016/12/31/bare...,cx:iimz2wwcwxu7d721:2r8odp9zhg5yp,,2017-01-01 00:03:07+01:00,


In [63]:
def split_train_test(df, num_test_days):
    last_day = df['time'].iloc[-1].date()
    test_window = pd.date_range(end=last_day, periods=num_test_days, freq="D").date

    # Split into test and train dataset
    test_mask = df["time"].dt.date.isin(test_window)
    train_df = df[-test_mask]
    test_df = df[test_mask]

    # Find common users
    common_users = set(test_df["userId"]).intersection(train_df["userId"])
    return train_df, test_df, common_users

def Dataframe2UserItemMatrix(df, common_users):
    """
    @author: zhanglemei and peng -  Sat Jan  5 13:48:20 2019
    Convert dataframe to user-item-interaction matrix, which is used for
    Matrix Factorization based recommendation.
    ROWS: users
    COLUMNS: items
    In rating matrix, clicked events are refered as 1 and others are refered as 0.
    :param df: Pandas Dataframe
    :return: ratings in a User-Item matrix
    """
    df = df[~df['documentId'].isnull()]
    df = df.drop_duplicates(subset=['userId', 'documentId'])
    df = df.sort_values(by=['userId', 'time'])

    n_users = df['userId'].nunique()
    n_items = df['documentId'].nunique()

    ratings = np.zeros((n_users, n_items))

    new_user = df['userId'].values[1:] != df['userId'].values[:-1]
    new_user = np.r_[True, new_user]

    df['uid'] = np.cumsum(new_user)
    item_ids = df['documentId'].unique().tolist()

    new_df = pd.DataFrame({'documentId': item_ids, 'tid': range(1, len(item_ids) + 1)})

    df = pd.merge(df, new_df, on='documentId', how='outer')
    df_ext = df[['uid', 'tid']]

    # Find indexes of common users
    common_users_df = df[df["userId"].isin(common_users)]["uid"].unique()
    common_idx = set()
    event_idx = set()

    for row in df_ext.itertuples():
        ratings[row[1] - 1, row[2] - 1] = 1.0

        if row[1] in common_users_df:
            common_idx.add(row[1] - 1)

    # Print ratings matrix
    print(f"\nThe User-Item Matrix has been generated ({ratings.shape[0]} users and {ratings.shape[1]} items)")

    # Print ratings available (1s)
    unique, counter = np.unique(ratings, return_counts=True)
    ratings_available = dict(zip(unique, counter))
    sparsity = round(100 * (ratings_available[1] / ratings_available[0]), 2)
    print(f"Number of ratings available (1s): {ratings_available[1]} "
          f"(~ {sparsity} %, total = {sum(ratings_available.values())}]")

    return ratings, common_idx, item_ids

train_df, test_df, common_users = split_train_test(
    df, num_test_days=30)

# Create the User-Item matrix
user_item_data, common_user_idx, item_idx = Dataframe2UserItemMatrix(train_df, common_users)
user_item_data = sparse.csr_matrix(user_item_data)


The User-Item Matrix has been generated (1000 users and 14248 items)
Number of ratings available (1s): 452708 (~ 3.28 %, total = 14248000]


In [64]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(user_item_data)

100%|██████████| 15/15 [00:00<00:00, 56.76it/s]


In [65]:
recommendations = model.recommend(0, user_item_data[0])
recommendations

(array([ 920, 1774, 1252, 2063, 1105, 2788, 1112,  948, 1682, 2446]),
 array([0.80372345, 0.6912389 , 0.67220396, 0.6185734 , 0.5963423 ,
        0.5959573 , 0.59411585, 0.5854756 , 0.5658058 , 0.5487966 ],
       dtype=float32))

In [71]:

#df[df['userId'] == 'cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7'].head()

Unnamed: 0,eventId,category,activeTime,title,url,userId,publishtime,time,documentId
18149,1788714434,,3.0,,http://adressa.no,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,2017-01-01 18:56:10+01:00,
18151,1500434790,100sport|vintersport,17.0,Norges landslagssjef ville ha russisk leder ut...,http://adressa.no/100sport/langrenn_old/norges...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,2017-01-01T17:01:11.000Z,2017-01-01 18:56:14+01:00,70a19fd7c9f6827feb3eb4f3df95121664491fa7
18154,208227622,,3.0,,http://adressa.no,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,2017-01-01 18:56:32+01:00,
18157,1424389459,,3.0,Arsenal-spissens spektakulære scoring hylles: ...,http://adressa.no/100sport/fotball/arsenal-spi...,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,2017-01-01 18:56:37+01:00,49b538a570b59c6fb564da7dfeace13ddd4f26f5
18159,1893920872,,5.0,,http://adressa.no,cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,,2017-01-01 18:56:41+01:00,


In [None]:
related = model.similar_items(itemid)

In [11]:
dataset = original.copy()
dataset = dataset[pd.notnull(dataset["documentId"])]
dataset = dataset.rename(columns={'userId': 'user', 'documentId': 'item'})
test = dataset.iloc[:1000]
dataset = dataset.iloc[1000:]
dataset.head()

Unnamed: 0,eventId,category,activeTime,title,url,user,publishtime,time,item
4295,1130416241,100sport|vintersport,13.0,Norsk løper reiser hjem fra Tour de Ski,http://adressa.no/100sport/vintersport/langren...,cx:ifuyqg9khho82m89:1asgsscedk17x,2016-12-31T10:29:14.000Z,1483255350,1aa86dce47dcf5062416996e6a35e5aa6a42de2e
4297,1364575462,100sport|sjakk,40.0,På dette bildet skiller Magnus Carlsen seg ut:...,http://adressa.no/100sport/sjakk/pa-dette-bild...,cx:i1qnn9olehi8a57s:1ss5cx0z22huo,2016-12-31T16:12:10.000Z,1483255355,05e4209b296dc7dfc9d4e0c401d79256bcea4cb5
4304,1950305475,nyheter|sortrondelag,,Nødbluss sendt gjennom vindu startet branntilløp,http://adressa.no/nyheter/sortrondelag/2017/01...,cx:i7kjws2hj90yyv44:cj4j7uuorssw,2016-12-31T23:42:06.000Z,1483255386,a60c0b9a0ba539404271d0d51ffd209760a42cff
4305,102732781,nyheter|trondheim,,Åpenbart beruset mann i trafikkulykke på Byåsen,http://adressa.no/nyheter/trondheim/2017/01/01...,cx:2pbe4x3fx0xjw3lpcpeplvswd0:36hjaz9wpwdkh,2017-01-01T03:47:35.000Z,1483255391,b16b516eefb647edec256ad2f9b2c7a897b9785b
4313,872839225,nyheter|trondheim,,Beboere føler seg maktesløse,http://adressa.no/nyheter/trondheim/2016/12/28...,cx:hs5ncw84i0m2x8x6:2rmae9c8kjdxw,2016-12-28T21:44:51.000Z,1483255423,f5c8c88d6dc8833ce61d38953e8c8873d833e558


In [29]:
algo_ii = knn.UserUser(20, feedback='implicit')
def eval(aname, algo, train, test):
    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)
    users = test.user.unique()
    # now we run the recommender
    recs = batch.recommend(fittable, users, 100)
    # add the algorithm name for analyzability
    recs['Algorithm'] = aname
    return recs

In [34]:
all_recs = []
test_data = []
for train, test in xf.partition_users(dataset[['user', 'item']], 5, xf.SampleFrac(0.2)):
    test_data.append(test)
    all_recs.append(eval('ItemItem', algo_ii, train, test))

In [35]:
all_recs = pd.concat(all_recs, ignore_index=True)
test = all_recs.copy()
test['score'] = test['score'] / test.groupby('user')['score'].transform('sum')
test.head()

Unnamed: 0,item,score,user,rank,Algorithm
0,b791d1e6c3355e64e0c33b5e8580ea3fb1fc9ae1,0.011256,cx:13230374570232135078687:3ng976ttdk2la,1,ItemItem
1,c1ae8b1a04855fedd09ac1fbf45df07b0c7a2434,0.011256,cx:13230374570232135078687:3ng976ttdk2la,2,ItemItem
2,f5135438d8194dc19dbbc2f165e222e0b8f851eb,0.011256,cx:13230374570232135078687:3ng976ttdk2la,3,ItemItem
3,972802ba3cd65400a01de3f391f6451b95c52f9d,0.011205,cx:13230374570232135078687:3ng976ttdk2la,4,ItemItem
4,ee06631cf8f3211127bd57926eac2f4271a8c02d,0.011199,cx:13230374570232135078687:3ng976ttdk2la,5,ItemItem


In [22]:
algo_ii.user_index_

AttributeError: 'UserUser' object has no attribute 'user_index_'

In [None]:
rla = topn.RecListAnalysis()
rla.add_metric(topn.ndcg)
results = rla.compute(all_recs, test_data)
results.head()

topn.recall(all_recs, test_data)


NameError: name 'topn' is not defined