In [1]:
#%pip install lightfm
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
import matplotlib.pyplot as plt
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score



In [2]:
# Load data
def map_event(event):
    if event == 'view':
        return 1
    elif event == 'addtocart':
        return 2
    elif event == 'transaction':
        return 3
    else:
        return 0

basepath = '../../datasets/preprocessed_datasets/retailrocket/'
events = pd.read_pickle(basepath +'events.pkl')
events['event_mapped'] = events['event'].apply(map_event)
events = events.drop(columns=['timestamp', 'event', 'transactionid', 'datetime'])
events = events.sample(10000)
item_properties = pd.read_pickle(basepath +'item_data_extracted.pkl')
display(events, item_properties)

Unnamed: 0,visitorid,itemid,event_mapped
548831,845271,402871,1
2405168,811623,366186,1
924701,889024,304180,1
2282135,1154516,451346,1
2054490,747053,333201,1
...,...,...,...
2408923,166973,32267,1
14487,359470,27441,1
2341090,979030,379744,1
1232198,1038087,187200,1


Unnamed: 0,itemid,categoryid,available,properties,property_values
0,0,209,0,"[1036, 1056, 11, 112, 127, 139, 159, 177, 189,...","[1276750, n3.168 1144008, n15360.000 628176 n1..."
1,1,1114,1,"[0, 1036, 112, 159, 185, 202, 227, 238, 280, 2...","[769062, 1154859, 679677, 519769, 769062, 1726..."
2,2,1305,0,"[1063, 112, 159, 202, 282, 283, 318, 332, 364,...","[n396.000 145688, 679677, 519769, 648485 n6000..."
3,3,1171,0,"[1025, 1080, 112, 159, 202, 227, 250, 283, 30,...","[769062, 769062, 679677, 519769, 261419, 13057..."
4,4,1038,0,"[112, 115, 159, 202, 227, 28, 283, 33, 364, 59...","[679677, n24.000, 519769, 371058 71429, 588652..."
...,...,...,...,...,...
417048,466862,1376,0,"[112, 139, 159, 186, 202, 227, 28, 283, 29, 34...","[679677, 769062, 519769, 575816 n432.000, 7173..."
417049,466863,173,0,"[1014, 112, 159, 202, 227, 283, 364, 400, 47, ...","[1075463, 679677, 519769, 1194687 550565 24255..."
417050,466864,373,1,"[1036, 112, 152, 159, 202, 227, 230, 283, 348,...","[1154859, 679677, 1071492, 519769, 1262739 205..."
417051,466865,421,0,"[1031, 112, 159, 202, 227, 277, 28, 283, 332, ...","[1088309, 679677, 519769, 150169 780351 820477..."


In [3]:
def flatten_column_and_get_unique_entries(column):
    unique_elements = []

    # Iterate through the DataFrame column
    for int_list in column:
        for item in list(int_list):
            unique_elements.append(item)
    # Convert the list to a set to remove duplicates
    unique_integers_list = np.unique(unique_elements)
    return unique_integers_list

def flatten_column_and_get_unique_entries_for_property_values(column):
    unique_elements = set()

    # Iterate through the DataFrame column
    for int_list in column:
        for item in list(int_list):
            unique_elements.update(item)
    # Convert the list to a set to remove duplicates
    unique_integers_list = list(unique_elements)
    return unique_integers_list

def convert_to_tuple(column):
    return [tuple(x) for x in column]

def convert_to_tuple_2(column):
    return [tuple(str(x) for x in row) for row in column]


all_product_ids = item_properties['itemid'].unique()
all_categories = item_properties['categoryid'].unique()
all_available = item_properties['available'].unique()
all_product_properties = convert_to_tuple(item_properties['properties'])
all_product_values = convert_to_tuple_2(item_properties['property_values'])

items_that_have_an_event_but_are_not_in_all_products = [element for element in events.itemid.unique() if element not in all_product_ids]
#drop all events with products, where no product data is available
events.drop(events[events.itemid.isin(items_that_have_an_event_but_are_not_in_all_products)].index, inplace=True)

In [4]:
# all_product_properties, all_product_values

In [5]:
merged_features = list(all_categories) + list(all_available)# + all_product_properties + all_product_values

dataset = Dataset()
dataset.fit(users=events['visitorid'].unique(), items=all_product_ids, item_features=merged_features)

(interactions, weights) = dataset.build_interactions(
    (row.visitorid, row.itemid, row.event_mapped)
    for row in events.itertuples()
)

In [6]:
# Ensure that your productNumber is in a list (making it an iterable)
item_features = dataset.build_item_features((
    (row.itemid, [row.categoryid, row.available])#, tuple(row.properties), tuple(str(x) for x in row.property_values)])
    for row in item_properties.itertuples()))

In [7]:
random_state = 27
num_epochs = 5
model = LightFM(loss='bpr', random_state=random_state)
# Split the interactions into training and testing sets
train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2)

In [8]:
plot_auc = False
if not plot_auc:
    model.fit(train_interactions, epochs=num_epochs, item_features=item_features, num_threads=4)
else: 
    auc_by_epoch = []
    epochs = []
    
    # Run several epochs, computing AUC after each epoch
    for epoch in range(num_epochs):
        model.fit_partial(train_interactions, epochs=1)
        auc = auc_score(model, test_interactions).mean()
        print(f"Epoch {epoch}: AUC = {auc}")
    
        epochs.append(epoch)
        auc_by_epoch.append(auc)
    # Plotting
    plt.figure(figsize=(10, 5))
    plt.plot(epochs, auc_by_epoch)
    plt.xlabel('Epochs')
    plt.ylabel('AUC')
    plt.title('AUC by Epoch')
    plt.show()

In [9]:
k_test = 2

def test_model(model, interactions, k_test, item_features, prefix): 
    train_precision = precision_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Precision: {:.4f}'.format(train_precision))
    train_recall = recall_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Recall: {:.4f}'.format(train_recall))
    train_auc = auc_score(model, interactions, item_features=item_features).mean()
    print(prefix + 'AUC: {:.4f}'.format(train_auc))
    
test_model(model, train_interactions, k_test, item_features, 'train ')
display('------')
test_model(model, test_interactions, k_test, item_features, 'test ')

train Precision: 0.0002
train Recall: 0.0004
train AUC: 0.4454


'------'

test Precision: 0.0000
test Recall: 0.0000
test AUC: 0.4067


In [10]:
def predict_best_n_products(user_id, n = 5):
    item_ids = interaction_data[interaction_data.customer_id ==  user_id].main_product_id.tolist()
    user_mapping, _, item_mapping, _ = dataset.mapping()
    print('user ' + str(user_id) + ' bought following items')
    display(product_data[product_data.main_product_id.isin(item_ids) ])
    
    
    # Convert the original user_id to internal user index
    internal_user_id = user_mapping[user_id]
    
    all_item_ids = np.array(all_product_ids.tolist())
    # Get the internal item indices for these IDs
    internal_all_item_ids = np.array([item_mapping[item] for item in all_item_ids])
    # Convert the list of item_ids that the user has already interacted with to internal item indices
    internal_item_ids = [item_mapping[item] for item in item_ids]
    # Remove the items the user has already interacted with
    recommendable_item_ids = np.setdiff1d(internal_all_item_ids, internal_item_ids)
    # Make predictions for the remaining items
    scores = model.predict(internal_user_id, recommendable_item_ids)
    indices = np.flip(np.argsort(scores)[-n:])
    scores_sorted = [scores[i] for i in indices]
    
    top_recommended_item_internal = [recommendable_item_ids[i] for i in scores.argsort()[::-1]]
    
    top_recommended_items = []
    count = 0
    for internal_id in top_recommended_item_internal:
        top_recommended_items.append((list(item_mapping.keys()) [list(item_mapping.values()).index(internal_id)]))
        count += 1
        if count > n: 
            break
    
    print('user got recommended following ' + str(n) + ' items: ')
    display(product_data[product_data.main_product_id.isin(top_recommended_items)])
    return scores_sorted, top_recommended_items


In [11]:
user_id = 1940761
display(predict_best_n_products(user_id, 5))

NameError: name 'interaction_data' is not defined

1 epoch: 
user: 1940761
([0.013858412, 0.010622009, 0.00874101, 0.006170107, 0.0012614947],
 [7701799, 5015505, 6554561, 7811874, 3711496, 7593580])
 
30 epochs: 
user: 1940761
([1.2001197, 1.1783248, 1.0208428, 0.9025923, 0.88648444],
 [4125076, 7023883, 3473970, 3847486, 3692883, 6110179])