In [18]:
#%pip install lightfm
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
import matplotlib.pyplot as plt
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

In [19]:
# Load data
def map_event(event):
    if event == 'view':
        return 1
    elif event == 'addtocart':
        return 2
    elif event == 'transaction':
        return 3
    else:
        return 0

basepath = '../../datasets/preprocessed_datasets/retailrocket/'
events = pd.read_pickle(basepath +'events_10k.pkl')
events['event_mapped'] = events['event'].apply(map_event)
events = events.drop(columns=['timestamp', 'event', 'transactionid', 'datetime'])
item_properties = pd.read_pickle(basepath +'item_data_extracted.pkl')
item_properties = item_properties[item_properties.itemid.isin(events.itemid)]
display(events, item_properties)

Unnamed: 0,visitorid,itemid,event_mapped
2482751,1046539,373805,1
2204101,1383579,287405,1
1964828,474264,153625,1
2369299,1079433,287356,1
640471,865010,113440,1
...,...,...,...
560807,269455,459735,1
1158627,228194,160499,1
2074769,435897,346429,1
1911438,1099927,261011,1


Unnamed: 0,itemid,categoryid,available,properties,property_values
6,6,1091,1,"[112, 159, 19, 202, 227, 28, 283, 364, 521, 55...","[679677, 519769, 1297729 n72.000 309206, 60935..."
31,32,1173,0,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n973.200 424566, 679677, 519..."
40,42,84,1,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n68.400 424566, 679677, 5197..."
137,147,646,1,"[1092, 112, 159, 202, 283, 348, 364, 461, 491,...","[291010, 679677, 519769, 229273 388993 1246541..."
153,163,407,0,"[112, 159, 202, 227, 283, 364, 376, 397, 483, ...","[679677, 519769, 62992 n7440.000 925243, 92933..."
...,...,...,...,...,...
416890,466685,1400,1,"[102, 1028, 112, 159, 202, 227, 275, 28, 283, ...","[769062, 769062, 679677, 519769, 1109436 45934..."
416941,466740,967,0,"[1008, 1036, 112, 120, 140, 159, 202, 227, 283...","[124229 n336.000 1144008, 1154859, 679677, 115..."
416960,466760,1549,0,"[1036, 1066, 112, 159, 202, 210, 227, 230, 283...","[1318567, n720.000 424566, 679677, 519769, 123..."
417047,466861,1051,0,"[1036, 1066, 112, 159, 202, 227, 230, 283, 300...","[1318567, 732011 424566, 679677, 519769, 10769..."


In [20]:
events[events['event_mapped'] == 1]

Unnamed: 0,visitorid,itemid,event_mapped
2482751,1046539,373805,1
2204101,1383579,287405,1
1964828,474264,153625,1
2369299,1079433,287356,1
640471,865010,113440,1
...,...,...,...
560807,269455,459735,1
1158627,228194,160499,1
2074769,435897,346429,1
1911438,1099927,261011,1


In [21]:
item_properties[item_properties['itemid'].isin([363736, 420943, 174719])]

Unnamed: 0,itemid,categoryid,available,properties,property_values
156152,174719,1073,0,"[112, 159, 202, 283, 348, 364, 506, 551, 6, 67...","[679677, 519769, 145210 182495 632043 1250065,..."
376089,420943,1384,1,"[1009, 1025, 112, 124, 159, 202, 227, 283, 348...","[769062, n4800.000 337002, 679677, 873905, 519..."


In [22]:
def flatten_column_and_get_unique_entries(column):
    unique_elements = []

    # Iterate through the DataFrame column
    for int_list in column:
        for item in list(int_list):
            unique_elements.append(item)
    # Convert the list to a set to remove duplicates
    unique_integers_list = np.unique(unique_elements)
    return unique_integers_list

def flatten_column_and_get_unique_entries_for_property_values(column):
    unique_elements = set()

    # Iterate through the DataFrame column
    for int_list in column:
        for item in list(int_list):
            unique_elements.update(item)
    # Convert the list to a set to remove duplicates
    unique_integers_list = list(unique_elements)
    return unique_integers_list

def convert_to_tuple(column):
    return [tuple(x) for x in column]

def convert_to_tuple_2(column):
    return [tuple(str(x) for x in row) for row in column]


all_product_ids = item_properties['itemid'].unique()
all_categories = item_properties['categoryid'].unique()
all_available = item_properties['available'].unique()
all_product_properties = convert_to_tuple(item_properties['properties'])
all_product_values = convert_to_tuple_2(item_properties['property_values'])

items_that_have_an_event_but_are_not_in_all_products = [element for element in events.itemid.unique() if element not in all_product_ids]
#drop all events with products, where no product data is available
events.drop(events[events.itemid.isin(items_that_have_an_event_but_are_not_in_all_products)].index, inplace=True)

In [23]:
# all_product_properties, all_product_values

In [24]:
merged_features = list(all_categories) + list(all_available)# + all_product_properties + all_product_values

dataset = Dataset()
dataset.fit(users=events['visitorid'].unique(), items=all_product_ids, item_features=merged_features)

(interactions, weights) = dataset.build_interactions(
    (row.visitorid, row.itemid, row.event_mapped)
    for row in events.itertuples()
)

In [25]:
# Ensure that your productNumber is in a list (making it an iterable)
item_features = dataset.build_item_features((
    (row.itemid, [row.categoryid, row.available])#, tuple(row.properties), tuple(str(x) for x in row.property_values)])
    for row in item_properties.itertuples()))

In [26]:
random_state = 27
num_epochs = 10
model = model = LightFM(loss='bpr', random_state=random_state, learning_rate=0.01, item_alpha=1e-6, user_alpha=1e-6)
# Split the interactions into training and testing sets
train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2)

In [27]:
plot_auc = False
if not plot_auc:
    model.fit(train_interactions, epochs=num_epochs, item_features=item_features, num_threads=4)
else: 
    auc_by_epoch = []
    epochs = []
    
    # Run several epochs, computing AUC after each epoch
    for epoch in range(num_epochs):
        model.fit_partial(train_interactions, epochs=1)
        auc = auc_score(model, test_interactions).mean()
        print(f"Epoch {epoch}: AUC = {auc}")
    
        epochs.append(epoch)
        auc_by_epoch.append(auc)
    # Plotting
    plt.figure(figsize=(10, 5))
    plt.plot(epochs, auc_by_epoch)
    plt.xlabel('Epochs')
    plt.ylabel('AUC')
    plt.title('AUC by Epoch')
    plt.show()

In [28]:
k_test = 2

def test_model(model, interactions, k_test, item_features, prefix): 
    train_precision = precision_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Precision: {:.4f}'.format(train_precision))
    train_recall = recall_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Recall: {:.4f}'.format(train_recall))
    train_auc = auc_score(model, interactions, item_features=item_features).mean()
    print(prefix + 'AUC: {:.4f}'.format(train_auc))
    
# test_model(model, train_interactions, k_test, item_features, 'train ')
# display('------')
test_model(model, test_interactions, k_test, item_features, 'test ')

test Precision: 0.0000
test Recall: 0.0000
test AUC: 0.4954


In [29]:
def predict_best_n_products(user_id, n = 5, no_outputs = False):
    item_ids = events[events.visitorid ==  user_id].itemid.tolist()
    user_mapping, _, item_mapping, _ = dataset.mapping()
    if not no_outputs:
        print('user ' + str(user_id) + ' bought following items')
        display(item_properties[item_properties.itemid.isin(item_ids) ])
    
    
    # Convert the original user_id to internal user index
    internal_user_id = user_mapping[user_id]
    
    all_item_ids = np.array(all_product_ids.tolist())
    # Get the internal item indices for these IDs
    internal_all_item_ids = np.array([item_mapping[item] for item in all_item_ids])
    # Convert the list of item_ids that the user has already interacted with to internal item indices
    internal_item_ids = [item_mapping[item] for item in item_ids]
    # Remove the items the user has already interacted with
    recommendable_item_ids = np.setdiff1d(internal_all_item_ids, internal_item_ids)
    # Make predictions for the remaining items
    scores = model.predict(internal_user_id, recommendable_item_ids)
    indices = np.flip(np.argsort(scores)[-n:])
    scores_sorted = [scores[i] for i in indices]
    
    top_recommended_item_internal = [recommendable_item_ids[i] for i in scores.argsort()[::-1]]
    
    top_recommended_items = []
    count = 0
    for internal_id in top_recommended_item_internal:
        top_recommended_items.append((list(item_mapping.keys()) [list(item_mapping.values()).index(internal_id)]))
        count += 1
        if count >= n: 
            break
    
    if not no_outputs:
        print('user got recommended following ' + str(n) + ' items: ')
        display(item_properties[item_properties.itemid.isin(top_recommended_items)])
    return scores_sorted, top_recommended_items


In [30]:
user_id = 325780
display(predict_best_n_products(user_id, 5))

user 325780 bought following items


Unnamed: 0,itemid,categoryid,available,properties,property_values
385170,431099,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 602490 857333, 1263557 150169..."


user got recommended following 5 items: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
4855,5411,789,1,"[112, 135, 159, 193, 202, 234, 283, 336, 348, ...","[679677, 1116693, 519769, 1116693, 596532, 111..."
26978,30217,1228,0,"[1036, 105, 112, 159, 202, 227, 28, 283, 348, ...","[1318567, 769062, 679677, 519769, 232403, 8051..."
181432,202976,56,0,"[104, 112, 123, 159, 188, 202, 227, 283, 335, ...","[1297729 n1200.000 10317, 679677, 769062, 5197..."
311339,348455,1231,0,"[1036, 112, 159, 160, 202, 225, 227, 283, 364,...","[961511, 679677, 519769, 769062, 1018570 10714..."
399106,446755,484,1,"[112, 159, 202, 227, 28, 283, 293, 30, 348, 36...","[679677, 519769, 750871 n2592.000 1299781, 435..."


([-0.01597132, -0.017958352, -0.018576628, -0.018824417, -0.019815294],
 [5411, 446755, 348455, 202976, 30217])

In [31]:
from collections import defaultdict

# Assuming 'interaction_data.customer_id.unique()' gives us a list of unique user IDs
# and 'predict_best_n_products(user, n)' returns the top 'n' recommended product IDs for a given user

# Step 1: Store Recommended Items
recommended_products = defaultdict(list)
print('calculating recommendations for ' + str(len(events.visitorid.unique())) + ' users')
count = 0
all_scores = []
for user in events.visitorid.unique():
    scores, top_recommended_items = predict_best_n_products(user, 5, True)
    recommended_products[user].extend(top_recommended_items)
    all_scores.extend(scores)
    count += 1
    if count % 1000 == 0:
        print('calculated recommendations for ' + str(count) + ' users')


calculating recommendations for 8684 users
calculated recommendations for 1000 users
calculated recommendations for 2000 users
calculated recommendations for 3000 users
calculated recommendations for 4000 users
calculated recommendations for 5000 users
calculated recommendations for 6000 users
calculated recommendations for 7000 users
calculated recommendations for 8000 users


In [32]:
# Step 2: Calculate Catalog Coverage
unique_recommended_items = set()
for user, items in recommended_products.items():
    unique_recommended_items.update(items)

catalog_coverage = len(unique_recommended_items) / len(item_properties.itemid.unique())  # Replace with your catalog size

# Step 3: Calculate Item Coverage
item_recommendation_counts = defaultdict(int)
for items in recommended_products.values():
    for item in items:
        item_recommendation_counts[item] += 1

average_item_coverage = sum(item_recommendation_counts.values()) / len(recommended_products)

# Print the results
print(f"Catalog Coverage: {catalog_coverage:.2f}")
print(f"Item Coverage: {average_item_coverage:.2f}")
print(f"Average Score: {np.mean(all_scores):.2f}, Min Score: {np.min(all_scores):.2f}, Max Score: {np.max(all_scores):.2f}")

Catalog Coverage: 0.03
Item Coverage: 5.00
Average Score: -0.01, Min Score: -0.26, Max Score: 0.03


In [33]:
print(f"recommended items {len(unique_recommended_items)}; num of items: {len(item_properties.itemid.unique())}")

recommended items 219; num of items: 7705


In [34]:
import pickle

#with open('../../models/retailrocket/lightfm_full_dataset.pkl', 'wb') as file:
#    pickle.dump(model, file)
