In [1]:
#%pip install lightfm
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
import matplotlib.pyplot as plt
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score



In [2]:
# Load data
basepath = '../../datasets/preprocessed_datasets/gabor/'
interaction_data = pd.read_pickle(basepath + 'user_item_interactions_only_main_products_10k.pkl')
#interaction_data = interaction_data.sample(1000)
product_data = pd.read_pickle(basepath + 'no_product_variants_with_duplicate_ids.pkl')
encoder = LabelEncoder()
interaction_data.main_product_id = interaction_data.main_product_id.astype(int)
product_data.main_product_id = product_data.main_product_id.astype(int)

display(interaction_data, product_data)

Unnamed: 0,customer_id,main_product_id,amount
325791,7347782,7023883,1
133655,4395746,4233880,1
128155,4315328,4234094,1
231063,5839435,5015580,1
42949,2070488,3480813,1
...,...,...,...
187060,5196436,4823122,1
471462,9626549,3846282,1
79554,2204900,4773983,1
158028,4807671,3849864,1


Unnamed: 0,main_product_id,productNumber,productName,productColorName,articleNumber,brand__id,mainCategory__id,gender,originCountry,line,...,heelHeightGroup,sizeEu,sizeUk,sizeIndex,shaftLength__value,sole,isTransferee,isSuccessor,duplicate_product__ids,product_sizes
0,557559,4058394021466,sportliche Ballerinas Glattleder schwarz,schwarz,02.643.57,6590678,315571,w,PT,F-S|H-W,...,bis 3 cm,40.0,6.5,11.0,0.0,Gummi,0.0,0.0,"[557559, 549733, 549734, 557553, 589488, 58948...","[35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39.0, 40...."
1,551622,4054452768212,elegante Pumps Glattleder schwarz,schwarz,05.160.37,6590677,315573,w,PT,F-S|H-W,...,3 cm - 5 cm,38.0,5.0,8.0,0.0,EVA,0.0,0.0,"[551622, 551505, 551510, 551509, 552973, 55162...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
2,547193,4054452768427,Slipper Glattleder schwarz,schwarz,04.443.27,6590677,315576,w,SK,F-S|H-W,...,3 cm - 5 cm,42.0,8.0,14.0,0.0,PU-TPU,0.0,0.0,"[547193, 547191, 547196, 547187, 547189, 54719...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
3,549776,4059701687894,Sneaker low Rauleder blau,blau,06.968.46,4997827,315567,w,VN,F-S|H-W,...,3 cm - 5 cm,39.0,6.0,10.0,0.0,Gummi-EVA,0.0,0.0,"[549776, 550599, 550596, 550593, 550601, 54978...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
4,550726,4054452851594,elegante Ballerinas Materialmix Lederimitat sc...,schwarz,06.102.67,6590678,315574,w,SK,F-S|H-W,...,bis 3 cm,37.5,4.5,7.0,0.0,EVA,0.0,0.0,"[550726, 550736, 550735, 550732, 550728, 55072...","[35.0, 37.0, 37.5, 38.0, 38.5, 39.0, 40.0, 40...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3292,10603553,4066558951733,Sandale met plateauzool Suède blauw,blauw,24.764.36,6590677,315568,w,SK,F-S,...,5 cm - 8 cm,35.0,2.5,3.0,0.0,PU-TPU,0.0,0.0,"[10603553, 10544662]","[35.0, 37.0]"
3293,10442948,4065171827272,Mokassin Materialmix Leder pink,pink,26.090.21,6590678,315590,w,PT,F-S,...,bis 3 cm,40.5,7.0,12.0,0.0,Gummi,0.0,0.0,"[10442948, 10442949]","[40.5, 41.0]"
3294,10782704,4251234499207,Shopper ANDIE blau,blau,921453,363013,363017,w,DE,F-S,...,,,,,,,,,[10782704],[]
3295,10679703,4066558303617,Elegante pumps Glad leer wit,wit,21.450.60,6590677,315573,w,PT,F-S,...,5 cm - 8 cm,35.0,2.5,3.0,0.0,TPU,0.0,0.0,[10679703],[35.0]


In [3]:
all_product_ids = pd.concat([interaction_data['main_product_id'], product_data['main_product_id']]).drop_duplicates()
all_product_names = product_data['productName'].drop_duplicates()
all_product_colors = product_data['productColorName'].drop_duplicates()
all_brand_ids = product_data['brand__id'].drop_duplicates()
all_genders = product_data['gender'].drop_duplicates()
all_lines = product_data['line'].drop_duplicates()

merged_features = np.concatenate((all_product_names.to_numpy(), all_product_colors.to_numpy(), all_brand_ids.to_numpy(), all_genders.to_numpy(), all_lines.to_numpy()))
merged_features

dataset = Dataset()
dataset.fit(users=interaction_data['customer_id'], items=all_product_ids, item_features=merged_features)

(interactions, weights) = dataset.build_interactions(
    (row.customer_id, row.main_product_id, row.amount)
    for row in interaction_data.itertuples()
)

In [4]:
# Ensure that your productNumber is in a list (making it an iterable)
item_features = dataset.build_item_features((
    (row.main_product_id, [row.productName, row.line, row.gender, row.brand__id, row.productColorName])
    for row in product_data.itertuples()))

In [5]:
random_state = 27
num_epochs = 10
model = LightFM(loss='bpr', random_state=random_state, learning_rate=0.01, item_alpha=1e-6, user_alpha=1e-6)
# Split the interactions into training and testing sets
train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2)

In [6]:
plot_auc = False
if not plot_auc:
    model.fit(train_interactions, epochs=num_epochs, item_features=item_features, num_threads=4)
else: 
    auc_by_epoch = []
    epochs = []
    
    # Run several epochs, computing AUC after each epoch
    for epoch in range(num_epochs):
        model.fit_partial(train_interactions, epochs=1)
        auc = auc_score(model, test_interactions).mean()
        print(f"Epoch {epoch}: AUC = {auc}")
    
        epochs.append(epoch)
        auc_by_epoch.append(auc)
    # Plotting
    plt.figure(figsize=(10, 5))
    plt.plot(epochs, auc_by_epoch)
    plt.xlabel('Epochs')
    plt.ylabel('AUC')
    plt.title('AUC by Epoch')
    plt.show()

In [7]:
k_test = 2

def test_model(model, interactions, k_test, item_features, prefix): 
    train_precision = precision_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Precision: {:.4f}'.format(train_precision))
    train_recall = recall_at_k(model, interactions, k=k_test, item_features=item_features).mean()
    print(prefix + 'Recall: {:.4f}'.format(train_recall))
    train_auc = auc_score(model, interactions, item_features=item_features).mean()
    print(prefix + 'AUC: {:.4f}'.format(train_auc))
    
test_model(model, train_interactions, k_test, item_features, 'train ')
display('------')
test_model(model, test_interactions, k_test, item_features, 'test ')

train Precision: 0.0095
train Recall: 0.0190
train AUC: 0.5590


'------'

test Precision: 0.0095
test Recall: 0.0191
test AUC: 0.5595


In [8]:
def predict_best_n_products(user_id, n = 5, no_outputs = False):
    item_ids = interaction_data[interaction_data.customer_id ==  user_id].main_product_id.tolist()
    user_mapping, _, item_mapping, _ = dataset.mapping()
    if not no_outputs:
        print('user ' + str(user_id) + ' bought following items')
        display(product_data[product_data.main_product_id.isin(item_ids) ])
    
    
    # Convert the original user_id to internal user index
    internal_user_id = user_mapping[user_id]
    
    all_item_ids = np.array(all_product_ids.tolist())
    # Get the internal item indices for these IDs
    internal_all_item_ids = np.array([item_mapping[item] for item in all_item_ids])
    # Convert the list of item_ids that the user has already interacted with to internal item indices
    internal_item_ids = [item_mapping[item] for item in item_ids]
    # Remove the items the user has already interacted with
    recommendable_item_ids = np.setdiff1d(internal_all_item_ids, internal_item_ids)
    # Make predictions for the remaining items
    scores = model.predict(internal_user_id, recommendable_item_ids)
    indices = np.flip(np.argsort(scores)[-n:])
    scores_sorted = [scores[i] for i in indices]
    
    top_recommended_item_internal = [recommendable_item_ids[i] for i in scores.argsort()[::-1]]
    
    top_recommended_items = []
    count = 0
    for internal_id in top_recommended_item_internal:
        top_recommended_items.append((list(item_mapping.keys()) [list(item_mapping.values()).index(internal_id)]))
        count += 1
        if count >= n: 
            break
    
    if not no_outputs:
        print('user got recommended following ' + str(n) + ' items: ')
        display(product_data[product_data.main_product_id.isin(top_recommended_items)])
    return scores_sorted, top_recommended_items


In [9]:
user_id = 1940761
display(predict_best_n_products(user_id, 5))

user 1940761 bought following items


Unnamed: 0,main_product_id,productNumber,productName,productColorName,articleNumber,brand__id,mainCategory__id,gender,originCountry,line,...,heelHeightGroup,sizeEu,sizeUk,sizeIndex,shaftLength__value,sole,isTransferee,isSuccessor,duplicate_product__ids,product_sizes


KeyError: 1940761

1 epoch: 
user: 1940761
([0.013858412, 0.010622009, 0.00874101, 0.006170107, 0.0012614947],
 [7701799, 5015505, 6554561, 7811874, 3711496, 7593580])
 
30 epochs: 
user: 1940761
([1.2001197, 1.1783248, 1.0208428, 0.9025923, 0.88648444],
 [4125076, 7023883, 3473970, 3847486, 3692883, 6110179])

In [10]:
from collections import defaultdict

# Assuming 'interaction_data.customer_id.unique()' gives us a list of unique user IDs
# and 'predict_best_n_products(user, n)' returns the top 'n' recommended product IDs for a given user

# Step 1: Store Recommended Items
recommended_products = defaultdict(list)
print('calculating recommendations for ' + str(len(interaction_data.customer_id.unique())) + ' users')
count = 0
all_scores = []
for user in interaction_data.customer_id.unique():
    scores, top_recommended_items = predict_best_n_products(user, 5, True)
    recommended_products[user].extend(top_recommended_items)
    all_scores.extend(scores)
    count += 1
    if count % 10000 == 0: 
        print('calculated recommendations for ' + str(count) + ' users')


# Step 2: Calculate Catalog Coverage
unique_recommended_items = set()
for user, items in recommended_products.items():
    unique_recommended_items.update(items)

catalog_coverage = len(unique_recommended_items) / len(product_data.main_product_id.unique())  # Replace with your catalog size

# Step 3: Calculate Item Coverage
item_recommendation_counts = defaultdict(int)
for items in recommended_products.values():
    for item in items:
        item_recommendation_counts[item] += 1

average_item_coverage = sum(item_recommendation_counts.values()) / len(recommended_products)

# Print the results
print(f"Catalog Coverage: {catalog_coverage:.2f}")
print(f"Item Coverage: {average_item_coverage:.2f}")
print(f"Average Score: {np.mean(all_scores):.2f}, Min Score: {np.min(all_scores):.2f}, Max Score: {np.max(all_scores):.2f}")

calculating recommendations for 9814 users
Catalog Coverage: 0.02
Item Coverage: 5.00
Average Score: -0.01, Min Score: -0.06, Max Score: 0.03


In [11]:
print(sum(item_recommendation_counts.values()))

49070


In [12]:
print(f"recommended items {len(unique_recommended_items)}; num of items: {len(product_data.main_product_id.unique())}")

recommended items 72; num of items: 3297


In [None]:
unique_recommended_items

In [None]:
interaction_data.to_csv(basepath + 'user_item_interactions_only_main_products.csv')
