In [105]:
from utils import load_data
import pandas as pd
import numpy as np
from scipy import sparse
import implicit
from sklearn.model_selection import train_test_split
from implicit import evaluation

In [2]:
df = load_data("./active1000")
df["time"] = df["time"].apply(
    lambda date: pd.Timestamp(date, unit="s", tz="Europe/Oslo")
)
df.drop(index=df[df.url == "http://adressa.no"].index, inplace=True)


In [83]:
def dataframe_to_user_item_matrix(df):
    df = df.drop_duplicates(subset=['userId', 'documentId'])
    df = df.sort_values(by=['userId', 'time'])

    n_users = df['userId'].nunique()
    n_items = df['documentId'].count()

    ratings = np.zeros((n_users, n_items))

    new_user = df['userId'].values[1:] != df['userId'].values[:-1]
    new_user = np.r_[True, new_user]

    df['uid'] = np.cumsum(new_user)
    item_ids = df['documentId'].unique().tolist()

    new_df = pd.DataFrame({'documentId': item_ids, 'tid': range(1, len(item_ids) + 1)})

    df = pd.merge(df, new_df, on='documentId', how='outer')
    user_id_document_df = df[['documentId', 'uid', 'userId']]
    df_ext = df[['uid', 'tid']]

    users_df = df["uid"].unique()
    user_indexes = set()

    for row in df_ext.itertuples():
        ratings[row[1] - 1, row[2] - 1] = 1.0

        if row[1] in users_df:
            user_indexes.add(row[1] - 1)

    print(f"Matrix created with {ratings.shape[0]} users and {ratings.shape[1]} items")
    return ratings, user_indexes, item_ids, user_id_document_df

user_item_data, user_indexes, item_ids, user_id_document_df = dataframe_to_user_item_matrix(df)

user_item_sparse_matrix = sparse.csr_matrix(user_item_data)
train_data, test_data = train_test_split(user_item_sparse_matrix, random_state=23, test_size=0.25)

Matrix created with 1000 users and 679355 items


In [84]:
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(train_data)

100%|██████████| 15/15 [00:04<00:00,  3.05it/s]


In [110]:
recommendations = model.recommend(0, test_data[0], filter_already_liked_items=True, recalculate_user=True, N=50)
ids = list(map(lambda x: item_ids[x], list(recommendations[0])))
print(list(zip(ids, list(recommendations[1]))))

[('aa6a5862cb2ae9fb8996f35a692192559b9083e1', 0.7888073), ('0a8c6555c75b8fd530be97356c8b62f6a6a4ad83', 0.7564964), ('1ae48cc9bf00e7a94c6fb70e7beb74ee9b926998', 0.7064083), ('50c3b54c898d2c76681b4e4c02bad53b7a26b01b', 0.67255443), ('cd625da5a4f18ba4912ce7c6a50340b364c2c7f1', 0.6566198), ('0bad5f842d38601fe714f625d46be941fb27f28c', 0.6334733), ('82bab6b33d01bf5c93200ba51fc84283c18da102', 0.62665105), ('7b37701ba1916986c0d63c074df278f9ea9117cb', 0.6241478), ('37bcd74a9f1aafaec15b4e01870c694636076147', 0.59894073), ('3580bacc6ca0f02842dedb898871778b4b264c9a', 0.5935142), ('2ba425ed450c17c98bb10837f6ab902dd3e4df6d', 0.5791403), ('2febaa0c1a2bb66a3dda745c33ba3da7cc81a55b', 0.57832664), ('f1846d55be374246d0b9a76e0027936642ca3f1a', 0.5759356), ('55cb612b1b7f09e817909fb95156ef74f1711b70', 0.5758415), ('594d6458ecbd19fbb0657314bb5c742606ef57b9', 0.5705431), ('b85abfb0937cf948a48ca22758a4e03407bc3fcc', 0.56808734), ('794793820080e2f3949a4a8209de055297004031', 0.5650357), ('f6ee74f5aaf7c73366438cf

In [113]:

def evaluate_recall(ids, user_index):
    user_id = user_id_document_df[user_id_document_df.uid == user_index].iloc[0]['userId']
    true_positive = 0
    total = 0
    current_df = user_id_document_df.drop_duplicates(subset=['documentId', 'userId'])
    mask = current_df.userId.apply(lambda x: x == user_id)
    current_df = current_df[mask]

    docs_recommended_properly = []
    docs_not_recommended_properly = []

    for id in ids:
        try:
            if current_df[current_df.documentId == id].documentId.values[0] == id:
                docs_recommended_properly.append(id)
                true_positive += 1
        except:
            docs_not_recommended_properly.append(id)
        total += 1
    false_negative = current_df.count().uid - true_positive
    print("Recall: " + str(true_positive / (true_positive + false_negative)) + "\n")
    print("True positives: " + str(docs_recommended_properly))
    print("False positives: " + str(docs_not_recommended_properly))
    return true_positive / (true_positive + false_negative)
evaluate_recall(ids, 1)
    

Recall: 0.02073365231259968

True positives: ['aa6a5862cb2ae9fb8996f35a692192559b9083e1', 'cd625da5a4f18ba4912ce7c6a50340b364c2c7f1', '37bcd74a9f1aafaec15b4e01870c694636076147', '2febaa0c1a2bb66a3dda745c33ba3da7cc81a55b', '55cb612b1b7f09e817909fb95156ef74f1711b70', 'b85abfb0937cf948a48ca22758a4e03407bc3fcc', '794793820080e2f3949a4a8209de055297004031', 'e9a8deeda6a04df6afb887619a3a1880250aed7a', '60a5adfd53e73e44d1fdd9ed07c890879640b2e6', 'ffe7344b2550c475de286de0f34da337b1af1851', '87497bca5ef49ff1ae7bbf31c6b8286bd6c27bf8', '485f228342a41190f7fe77e9816daaed2024d0fe', 'a1085cc32a1a03b2c1cb32965afe44440af5b2a1']
False positives: ['0a8c6555c75b8fd530be97356c8b62f6a6a4ad83', '1ae48cc9bf00e7a94c6fb70e7beb74ee9b926998', '50c3b54c898d2c76681b4e4c02bad53b7a26b01b', '0bad5f842d38601fe714f625d46be941fb27f28c', '82bab6b33d01bf5c93200ba51fc84283c18da102', '7b37701ba1916986c0d63c074df278f9ea9117cb', '3580bacc6ca0f02842dedb898871778b4b264c9a', '2ba425ed450c17c98bb10837f6ab902dd3e4df6d', 'f1846d55be3

0.02073365231259968

In [114]:
evaluation.mean_average_precision_at_k(model, train_data, test_data)

100%|██████████| 250/250 [00:00<00:00, 556.03it/s]


0.2891466666666667