In [1]:
#%pip install lightfm
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
import matplotlib.pyplot as plt
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score



In [2]:
# Load data
def map_event(event):
    if event == 'view':
        return 1
    elif event == 'addtocart':
        return 2
    elif event == 'transaction':
        return 3
    else:
        return 0

basepath = '../../datasets/preprocessed_datasets/retailrocket/'
events = pd.read_pickle(basepath +'events.pkl')
events['event_mapped'] = events['event'].apply(map_event)
events = events.drop(columns=['timestamp', 'event', 'transactionid', 'datetime'])
item_properties = pd.read_pickle(basepath +'item_data_extracted.pkl')
display(events, item_properties)

Unnamed: 0,visitorid,itemid,event_mapped
0,257597,355908,1
1,992329,248676,1
2,111016,318965,1
3,483717,253185,1
4,951259,367447,1
...,...,...,...
2756096,591435,261427,1
2756097,762376,115946,1
2756098,1251746,78144,1
2756099,1184451,283392,1


Unnamed: 0,itemid,categoryid,available,properties,property_values
0,0,209,0,"[1036, 1056, 11, 112, 127, 139, 159, 177, 189,...","[1276750, n3.168 1144008, n15360.000 628176 n1..."
1,1,1114,1,"[0, 1036, 112, 159, 185, 202, 227, 238, 280, 2...","[769062, 1154859, 679677, 519769, 769062, 1726..."
2,2,1305,0,"[1063, 112, 159, 202, 282, 283, 318, 332, 364,...","[n396.000 145688, 679677, 519769, 648485 n6000..."
3,3,1171,0,"[1025, 1080, 112, 159, 202, 227, 250, 283, 30,...","[769062, 769062, 679677, 519769, 261419, 13057..."
4,4,1038,0,"[112, 115, 159, 202, 227, 28, 283, 33, 364, 59...","[679677, n24.000, 519769, 371058 71429, 588652..."
...,...,...,...,...,...
417048,466862,1376,0,"[112, 139, 159, 186, 202, 227, 28, 283, 29, 34...","[679677, 769062, 519769, 575816 n432.000, 7173..."
417049,466863,173,0,"[1014, 112, 159, 202, 227, 283, 364, 400, 47, ...","[1075463, 679677, 519769, 1194687 550565 24255..."
417050,466864,373,1,"[1036, 112, 152, 159, 202, 227, 230, 283, 348,...","[1154859, 679677, 1071492, 519769, 1262739 205..."
417051,466865,421,0,"[1031, 112, 159, 202, 227, 277, 28, 283, 332, ...","[1088309, 679677, 519769, 150169 780351 820477..."


In [3]:
events[events['event_mapped'] == 1]

Unnamed: 0,visitorid,itemid,event_mapped
0,257597,355908,1
1,992329,248676,1
2,111016,318965,1
3,483717,253185,1
4,951259,367447,1
...,...,...,...
2756096,591435,261427,1
2756097,762376,115946,1
2756098,1251746,78144,1
2756099,1184451,283392,1


In [4]:
item_properties[item_properties['itemid'].isin([363736, 420943, 174719])]

Unnamed: 0,itemid,categoryid,available,properties,property_values
156152,174719,1073,0,"[112, 159, 202, 283, 348, 364, 506, 551, 6, 67...","[679677, 519769, 145210 182495 632043 1250065,..."
325008,363736,1483,0,"[1, 1016, 1032, 1036, 1037, 1079, 1090, 112, 1...","[1200832, 1086273 n1200.000 628176 n1200.000 1..."
376089,420943,1384,1,"[1009, 1025, 112, 124, 159, 202, 227, 283, 348...","[769062, n4800.000 337002, 679677, 873905, 519..."


In [5]:
def flatten_column_and_get_unique_entries(column):
    unique_elements = []

    # Iterate through the DataFrame column
    for int_list in column:
        for item in list(int_list):
            unique_elements.append(item)
    # Convert the list to a set to remove duplicates
    unique_integers_list = np.unique(unique_elements)
    return unique_integers_list

def flatten_column_and_get_unique_entries_for_property_values(column):
    unique_elements = set()

    # Iterate through the DataFrame column
    for int_list in column:
        for item in list(int_list):
            unique_elements.update(item)
    # Convert the list to a set to remove duplicates
    unique_integers_list = list(unique_elements)
    return unique_integers_list

def convert_to_tuple(column):
    return [tuple(x) for x in column]

def convert_to_tuple_2(column):
    return [tuple(str(x) for x in row) for row in column]


all_product_ids = item_properties['itemid'].unique()
all_categories = item_properties['categoryid'].unique()
all_available = item_properties['available'].unique()
all_product_properties = convert_to_tuple(item_properties['properties'])
all_product_values = convert_to_tuple_2(item_properties['property_values'])

items_that_have_an_event_but_are_not_in_all_products = [element for element in events.itemid.unique() if element not in all_product_ids]
#drop all events with products, where no product data is available
events.drop(events[events.itemid.isin(items_that_have_an_event_but_are_not_in_all_products)].index, inplace=True)

In [6]:
# all_product_properties, all_product_values

In [7]:
merged_features = list(all_categories) + list(all_available)# + all_product_properties + all_product_values

dataset = Dataset()
dataset.fit(users=events['visitorid'].unique(), items=all_product_ids, item_features=merged_features)

(interactions, weights) = dataset.build_interactions(
    (row.visitorid, row.itemid, row.event_mapped)
    for row in events.itertuples()
)

In [8]:
# Ensure that your productNumber is in a list (making it an iterable)
item_features = dataset.build_item_features((
    (row.itemid, [row.categoryid, row.available])#, tuple(row.properties), tuple(str(x) for x in row.property_values)])
    for row in item_properties.itertuples()))

In [9]:
random_state = 27
num_epochs = 10
model = model = LightFM(loss='bpr', random_state=random_state, learning_rate=0.01, item_alpha=1e-6, user_alpha=1e-6)
# Split the interactions into training and testing sets
train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2)

In [10]:
plot_auc = False
if not plot_auc:
    model.fit(train_interactions, epochs=num_epochs, item_features=item_features, num_threads=4)
else: 
    auc_by_epoch = []
    epochs = []
    
    # Run several epochs, computing AUC after each epoch
    for epoch in range(num_epochs):
        model.fit_partial(train_interactions, epochs=1)
        auc = auc_score(model, test_interactions).mean()
        print(f"Epoch {epoch}: AUC = {auc}")
    
        epochs.append(epoch)
        auc_by_epoch.append(auc)
    # Plotting
    plt.figure(figsize=(10, 5))
    plt.plot(epochs, auc_by_epoch)
    plt.xlabel('Epochs')
    plt.ylabel('AUC')
    plt.title('AUC by Epoch')
    plt.show()

In [11]:
k_test = 2

def test_model(model, interactions, k_test, item_features, prefix): 
    train_precision = precision_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Precision: {:.4f}'.format(train_precision))
    train_recall = recall_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Recall: {:.4f}'.format(train_recall))
    train_auc = auc_score(model, interactions, item_features=item_features).mean()
    print(prefix + 'AUC: {:.4f}'.format(train_auc))
    
# test_model(model, train_interactions, k_test, item_features, 'train ')
# display('------')
test_model(model, test_interactions, k_test, item_features, 'test ')

test Precision: 0.0000
test Recall: 0.0000
test AUC: 0.5866


In [12]:
def predict_best_n_products(user_id, n = 5, no_outputs = False):
    item_ids = events[events.visitorid ==  user_id].itemid.tolist()
    user_mapping, _, item_mapping, _ = dataset.mapping()
    if not no_outputs:
        print('user ' + str(user_id) + ' bought following items')
        display(item_properties[item_properties.itemid.isin(item_ids) ])
    
    
    # Convert the original user_id to internal user index
    internal_user_id = user_mapping[user_id]
    
    all_item_ids = np.array(all_product_ids.tolist())
    # Get the internal item indices for these IDs
    internal_all_item_ids = np.array([item_mapping[item] for item in all_item_ids])
    # Convert the list of item_ids that the user has already interacted with to internal item indices
    internal_item_ids = [item_mapping[item] for item in item_ids]
    # Remove the items the user has already interacted with
    recommendable_item_ids = np.setdiff1d(internal_all_item_ids, internal_item_ids)
    # Make predictions for the remaining items
    scores = model.predict(internal_user_id, recommendable_item_ids)
    indices = np.flip(np.argsort(scores)[-n:])
    scores_sorted = [scores[i] for i in indices]
    
    top_recommended_item_internal = [recommendable_item_ids[i] for i in scores.argsort()[::-1]]
    
    top_recommended_items = []
    count = 0
    for internal_id in top_recommended_item_internal:
        top_recommended_items.append((list(item_mapping.keys()) [list(item_mapping.values()).index(internal_id)]))
        count += 1
        if count >= n: 
            break
    
    if not no_outputs:
        print('user got recommended following ' + str(n) + ' items: ')
        display(item_properties[item_properties.itemid.isin(top_recommended_items)])
    return scores_sorted, top_recommended_items


In [13]:
user_id = 325780
display(predict_best_n_products(user_id, 5))

user 325780 bought following items


Unnamed: 0,itemid,categoryid,available,properties,property_values
7421,8296,746,0,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 839165, 722378 1263557 150169..."
12352,13854,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 1322737, 1263557 150169 81963..."
28410,31833,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 321534, 374540 150169 819637 ..."
28462,31889,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 623705 1159389 952006, 126355..."
77221,86368,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 1322737, 1263557 150169 81963..."
92891,103918,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 38608, 1263557 150169 819637 ..."
157004,175681,1679,1,"[1032, 1036, 1037, 1079, 1090, 112, 120, 159, ...","[769062, 1154859, 769062, 769062, 769062, 6796..."
187747,210086,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 822899, 722378 1263557 150169..."
244279,273383,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 239875 1073381, 1263557 15016..."
267576,299440,1359,0,"[112, 159, 202, 227, 283, 314, 33, 364, 569, 5...","[679677, 519769, 251375, 8496, 8496 357513 375..."


user got recommended following 5 items: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
8820,9877,858,1,"[112, 159, 19, 202, 227, 28, 283, 325, 364, 52...","[679677, 519769, 1297729 n60.000 350726 30603 ..."
107040,119736,57,1,"[1081, 112, 159, 202, 227, 283, 364, 470, 6, 6...","[769062, 679677, 519769, 1278980 n38496.000, 1..."
134872,150882,808,0,"[1081, 112, 159, 202, 227, 283, 364, 470, 6, 6...","[n6000.000 739952, 679677, 519769, n40860.000,..."
393055,439963,793,0,"[1081, 112, 159, 202, 227, 283, 364, 470, 591,...","[769062, 679677, 519769, 485154 162220 n363600..."
412415,461686,1037,1,"[112, 159, 19, 202, 227, 28, 283, 364, 521, 55...","[679677, 519769, 769062, 245814 237874 171308,..."


([0.13117296, 0.055038914, -0.0021936656, -0.08592069, -0.11469769],
 [119736, 461686, 439963, 9877, 150882])

In [14]:
from collections import defaultdict

# Assuming 'interaction_data.customer_id.unique()' gives us a list of unique user IDs
# and 'predict_best_n_products(user, n)' returns the top 'n' recommended product IDs for a given user

# Step 1: Store Recommended Items
recommended_products = defaultdict(list)
print('calculating recommendations for ' + str(len(events.visitorid.unique())) + ' users')
count = 0
all_scores = []
for user in events.visitorid.unique():
    scores, top_recommended_items = predict_best_n_products(user, 5, True)
    recommended_products[user].extend(top_recommended_items)
    all_scores.extend(scores)
    count += 1
    if count % 1000 == 0:
        print('calculated recommendations for ' + str(count) + ' users')


calculating recommendations for 1236032 users
calculated recommendations for 1000 users
calculated recommendations for 2000 users
calculated recommendations for 3000 users
calculated recommendations for 4000 users
calculated recommendations for 5000 users
calculated recommendations for 6000 users
calculated recommendations for 7000 users
calculated recommendations for 8000 users
calculated recommendations for 9000 users
calculated recommendations for 10000 users
calculated recommendations for 11000 users
calculated recommendations for 12000 users
calculated recommendations for 13000 users
calculated recommendations for 14000 users
calculated recommendations for 15000 users
calculated recommendations for 16000 users
calculated recommendations for 17000 users
calculated recommendations for 18000 users
calculated recommendations for 19000 users
calculated recommendations for 20000 users
calculated recommendations for 21000 users
calculated recommendations for 22000 users
calculated recomm

KeyboardInterrupt: 

In [15]:
# Step 2: Calculate Catalog Coverage
unique_recommended_items = set()
for user, items in recommended_products.items():
    unique_recommended_items.update(items)

catalog_coverage = len(unique_recommended_items) / len(item_properties.itemid.unique())  # Replace with your catalog size

# Step 3: Calculate Item Coverage
item_recommendation_counts = defaultdict(int)
for items in recommended_products.values():
    for item in items:
        item_recommendation_counts[item] += 1

average_item_coverage = sum(item_recommendation_counts.values()) / len(recommended_products)

# Print the results
print(f"Catalog Coverage: {catalog_coverage:.2f}")
print(f"Item Coverage: {average_item_coverage:.2f}")
print(f"Average Score: {np.mean(all_scores):.2f}, Min Score: {np.min(all_scores):.2f}, Max Score: {np.max(all_scores):.2f}")

Catalog Coverage: 0.00
Item Coverage: 5.00
Average Score: 0.04, Min Score: -2.27, Max Score: 0.31


In [16]:
print(f"recommended items {len(unique_recommended_items)}; num of items: {len(item_properties.itemid.unique())}")

recommended items 55; num of items: 417053


In [18]:
import pickle

#with open('../../models/retailrocket/lightfm_full_dataset.pkl', 'wb') as file:
#    pickle.dump(model, file)
