## IMPORTS

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from google.colab import drive
drive.mount('/content/gdrive')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Import necessary libraries

import torch

# Load pre-trained BERT model and tokenizer
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

import os
for dirname, _, filenames in os.walk(f'{trainFolder}'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Mounted at /content/gdrive
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/test_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_1.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_10.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_2.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_3.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_4.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_5.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_6.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_7.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_8.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_9.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_embe.csv
gdrive/My Drive/MLDM - Car

In [2]:
# Import libraries
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

### Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product name
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### Load data

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [3]:
# This code reads the data from CSV files named "train_data_part_i.csv" for all i from 1 to 10
# and concatenates them into a single pandas DataFrame
train_dataframes = []
for i in tqdm(range(1, 11)): ## 1 - 11
    train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_part_{i}.csv'))
train_data = pd.concat(train_dataframes, ignore_index=True)

# free up memory by deleting the dataframes we no longer need
del train_dataframes

100%|██████████| 10/10 [03:30<00:00, 21.03s/it]


In [4]:
#train_data = train_data.sample(frac=0.1, random_state=1)  # frac=0.1 takes 10% of the data

In [5]:
# This code reads the data from a CSV file named "products_data.csv" into a pandas DataFrame
products_data = pd.read_csv(f'{trainFolder}/products_data.csv')

  products_data = pd.read_csv(f'{trainFolder}/products_data.csv')


In [6]:
# This code reads the data from a CSV file named "test_data.csv" into a pandas DataFrame
test_data = pd.read_csv(f'{trainFolder}/test_data.csv')

In [7]:
# Load product embeddings
product_embeddings_np = np.load(f'{trainFolder}/product_embeddings.npy')

# Load product IDs
product_ids = np.load(f'{trainFolder}/product_ids.npy', allow_pickle=True)  # Use allow_pickle=True if product_ids is an array of objects (e.g., strings)

# Create the product embedding dictionary
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

In [8]:
train_data

Unnamed: 0,date,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel
0,2023-11-15,Transaction_1730125,Household_39,Product_5362,0,Store_2,0,1.0,DRIVE,MOBILE_APP
1,2022-07-20,Transaction_1560535,Household_39,Product_67174,0,Store_2,0,2.0,DRIVE,WEBSITE
2,2022-07-20,Transaction_1560535,Household_39,Product_82254,0,Store_2,0,2.0,DRIVE,WEBSITE
3,2023-11-15,Transaction_1730125,Household_39,Product_3895,0,Store_2,0,1.0,DRIVE,MOBILE_APP
4,2022-07-20,Transaction_1560535,Household_39,Product_34014,0,Store_2,0,1.0,DRIVE,WEBSITE
...,...,...,...,...,...,...,...,...,...,...
87037457,2022-01-07,Transaction_1162379,Household_96742,Product_28756,0,Store_2542,0,1.0,DRIVE,MOBILE_APP
87037458,2023-08-17,Transaction_2306043,Household_96742,Product_62786,0,Store_2542,0,1.0,DRIVE,MOBILE_APP
87037459,2022-10-12,Transaction_1524531,Household_96742,Product_16362,0,Store_2542,0,1.0,DRIVE,MOBILE_APP
87037460,2023-03-10,Transaction_1972306,Household_96742,Product_78870,0,Store_2542,0,1.0,DRIVE,MOBILE_APP


In [9]:
# prompt: print from train_date rows with customer id Household_80001
import pandas as pd

# Create a list of customer IDs within the specified range
#customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]
customer_ids_new = [f'Household_{i}' for i in range(60001, 60050)]


In [10]:
# prompt: keep in train_data and test_data the rows that have the cusomer_id in customer_ids

# Filter train_data
train_data = train_data[train_data['customer_id'].isin(customer_ids_new)]

# Filter test_data
test_data = test_data[test_data['customer_id'].isin(customer_ids_new)]

In [11]:
train_data

Unnamed: 0,date,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel
52320398,2022-08-28,Transaction_2131644,Household_60015,Product_8687,0,Store_47,0,2.0,DRIVE,WEBSITE
52320399,2022-07-01,Transaction_1333604,Household_60015,Product_66953,0,Store_47,0,3.0,DRIVE,WEBSITE
52320400,2023-08-04,Transaction_2010081,Household_60015,Product_28140,0,Store_47,0,1.0,DRIVE,WEBSITE
52320401,2023-08-04,Transaction_2010081,Household_60015,Product_10007,0,Store_47,0,1.0,DRIVE,WEBSITE
52320402,2022-07-01,Transaction_1333604,Household_60015,Product_48076,0,Store_47,0,1.0,DRIVE,WEBSITE
...,...,...,...,...,...,...,...,...,...,...
61062903,2022-03-14,Transaction_774797,Household_60047,Product_31008,1,Store_2429,0,3.0,CLCV,WEBSITE
61062904,2022-03-14,Transaction_774797,Household_60047,Product_32038,1,Store_2429,0,1.0,CLCV,WEBSITE
61069612,2023-11-13,Transaction_2022527,Household_60039,Product_14130,0,Store_2484,0,1.0,CLCV,WEBSITE
61069613,2023-10-31,Transaction_2748767,Household_60040,Product_31040,0,Store_2484,0,1.0,CLCV,WEBSITE


In [12]:
test_data

Unnamed: 0,transaction_id,customer_id,product_id
12590,Transaction_2024_3058,Household_60003,Product_19449
12591,Transaction_2024_3058,Household_60003,Product_24101
12592,Transaction_2024_3058,Household_60003,Product_502
12593,Transaction_2024_3058,Household_60003,Product_31203
12594,Transaction_2024_3058,Household_60003,Product_14634
...,...,...,...
1118787,Transaction_2024_90351,Household_60049,Product_9489
1118788,Transaction_2024_90351,Household_60049,Product_27979
1118789,Transaction_2024_90351,Household_60049,Product_52209
1118790,Transaction_2024_90351,Household_60049,Product_30962


In [13]:
# Ensure all embeddings are numpy arrays of floats
for key in product_embedding_dict:
    product_embedding_dict[key] = np.array(product_embedding_dict[key], dtype=np.float32)

## Training Data Preproccesing

In [14]:
train_data['date'] = pd.to_datetime(train_data['date'])

# Add recency attribute
latest_date = train_data['date'].max()  # Find the latest date in the dataset
train_data['recency'] = (latest_date - train_data['date']).dt.days  # Calculate days since last purchase

# Group by customer_id and product_id to calculate quantity and most recent purchase
customer_product_data = train_data.groupby(['customer_id', 'product_id']).agg({
    'quantity': 'sum',
    'recency': 'min'  # Minimum days since purchase (most recent)
}).reset_index()

# Normalize quantity and recency scores
customer_product_data['quantity_score'] = customer_product_data['quantity'] / customer_product_data['quantity'].max()
customer_product_data['recency_score'] = 1 - (customer_product_data['recency'] / customer_product_data['recency'].max())  # Recent = higher score

best_alpha = 0.97

# Calculate the final score with this alpha value
customer_product_data['final_score'] = (
    best_alpha * customer_product_data['quantity_score'] +
    (1 - best_alpha) * customer_product_data['recency_score']
)

# Rank products for each customer with unique ranks
customer_product_data['rank'] = customer_product_data.sort_values(
    by=['customer_id', 'final_score', 'quantity', 'product_id'],  # Add secondary sort keys
    ascending=[True, False, False, True]  # Ensure consistent order
).groupby('customer_id').cumcount() + 1  # Assign unique incremental ranks


# Sort the DataFrame by customer_id and rank
customer_product_data = customer_product_data.sort_values(by=['customer_id', 'rank'])

# Filter out products with a rank greater than 200 for each customer
#customer_product_data = customer_product_data[customer_product_data['rank'] <= 20]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['date'] = pd.to_datetime(train_data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['recency'] = (latest_date - train_data['date']).dt.days  # Calculate days since last purchase


In [15]:
customer_product_data

Unnamed: 0,customer_id,product_id,quantity,recency,quantity_score,recency_score,final_score,rank
134,Household_60001,Product_57552,312.0,19,0.896552,0.973937,0.898873,1
151,Household_60001,Product_63354,96.0,19,0.275862,0.973937,0.296804,2
32,Household_60001,Product_2445,82.0,19,0.235632,0.973937,0.257781,3
140,Household_60001,Product_58744,63.0,19,0.181034,0.973937,0.204822,4
72,Household_60001,Product_4053,55.0,19,0.158046,0.973937,0.182523,5
...,...,...,...,...,...,...,...,...
15699,Household_60049,Product_63590,1.0,285,0.002874,0.609053,0.021059,276
15709,Household_60049,Product_67974,1.0,285,0.002874,0.609053,0.021059,277
15740,Household_60049,Product_76553,1.0,285,0.002874,0.609053,0.021059,278
15755,Household_60049,Product_80048,1.0,285,0.002874,0.609053,0.021059,279


In [16]:
import numpy as np
import pandas as pd

# All unique product IDs
all_product_ids = set(products_data['product_id'].unique())

# Dictionary to store negative samples
negative_samples = []

# Group by customer for efficient processing
customer_groups = train_data.groupby('customer_id')

# Precompute purchased products per customer
for customer_id, group in tqdm(customer_groups, desc="Generating negative samples"):
    # Purchased products for the customer
    purchased_products = set(group['product_id'])

    # Non-purchased products
    non_purchased_products = list(all_product_ids - purchased_products)

    # Number of negative samples to match positives
    num_negatives = len(group)

    # Randomly sample negatives
    sampled_negatives = np.random.choice(non_purchased_products, num_negatives, replace=False)

    # Append negative samples
    negative_samples.extend([{
        'customer_id': customer_id,
        'product_id': product_id,
        'quantity': 0
    } for product_id in sampled_negatives])

# Convert to DataFrame
negative_samples_df = pd.DataFrame(negative_samples)

# Append and update relevance
customer_product_data = pd.concat([customer_product_data, negative_samples_df], ignore_index=True)
customer_product_data['relevance'] = (customer_product_data['quantity'] > 0).astype(int)


Generating negative samples: 100%|██████████| 49/49 [00:02<00:00, 24.43it/s]


In [17]:
# Check for missing values
print("Missing values in train_data:")
print(customer_product_data.isnull().sum())

print("Missing values in test_data:")
print(test_data.isnull().sum())

# Fill remaining missing values with zeros as a safety measure
customer_product_data = customer_product_data.fillna(0)
test_data = test_data.fillna(0)

Missing values in train_data:
customer_id           0
product_id            0
quantity              0
recency           43846
quantity_score    43846
recency_score     43846
final_score       43846
rank              43846
relevance             0
dtype: int64
Missing values in test_data:
transaction_id    0
customer_id       0
product_id        0
dtype: int64


In [18]:
customer_product_data

Unnamed: 0,customer_id,product_id,quantity,recency,quantity_score,recency_score,final_score,rank,relevance
0,Household_60001,Product_57552,312.0,19.0,0.896552,0.973937,0.898873,1.0,1
1,Household_60001,Product_63354,96.0,19.0,0.275862,0.973937,0.296804,2.0,1
2,Household_60001,Product_2445,82.0,19.0,0.235632,0.973937,0.257781,3.0,1
3,Household_60001,Product_58744,63.0,19.0,0.181034,0.973937,0.204822,4.0,1
4,Household_60001,Product_4053,55.0,19.0,0.158046,0.973937,0.182523,5.0,1
...,...,...,...,...,...,...,...,...,...
59615,Household_60049,Product_80250,0.0,0.0,0.000000,0.000000,0.000000,0.0,0
59616,Household_60049,Product_56436,0.0,0.0,0.000000,0.000000,0.000000,0.0,0
59617,Household_60049,Product_24364,0.0,0.0,0.000000,0.000000,0.000000,0.0,0
59618,Household_60049,Product_65766,0.0,0.0,0.000000,0.000000,0.000000,0.0,0


In [19]:
customer_product_data = customer_product_data.drop(columns = ['quantity','recency'])
customer_product_data

Unnamed: 0,customer_id,product_id,quantity_score,recency_score,final_score,rank,relevance
0,Household_60001,Product_57552,0.896552,0.973937,0.898873,1.0,1
1,Household_60001,Product_63354,0.275862,0.973937,0.296804,2.0,1
2,Household_60001,Product_2445,0.235632,0.973937,0.257781,3.0,1
3,Household_60001,Product_58744,0.181034,0.973937,0.204822,4.0,1
4,Household_60001,Product_4053,0.158046,0.973937,0.182523,5.0,1
...,...,...,...,...,...,...,...
59615,Household_60049,Product_80250,0.000000,0.000000,0.000000,0.0,0
59616,Household_60049,Product_56436,0.000000,0.000000,0.000000,0.0,0
59617,Household_60049,Product_24364,0.000000,0.000000,0.000000,0.0,0
59618,Household_60049,Product_65766,0.000000,0.000000,0.000000,0.0,0


In [20]:
# prompt: remove from customer_id	product_id, the letters  and _in all rows

import re

def remove_chars(df, column_name):
    df[column_name] = df[column_name].astype(str).apply(lambda x: re.sub(r'[a-zA-Z_]', '', x))
    return df

# Example usage (assuming 'customer_product_data' is your DataFrame):
customer_product_data = remove_chars(customer_product_data, 'customer_id')
customer_product_data = remove_chars(customer_product_data, 'product_id')
customer_product_data

Unnamed: 0,customer_id,product_id,quantity_score,recency_score,final_score,rank,relevance
0,60001,57552,0.896552,0.973937,0.898873,1.0,1
1,60001,63354,0.275862,0.973937,0.296804,2.0,1
2,60001,2445,0.235632,0.973937,0.257781,3.0,1
3,60001,58744,0.181034,0.973937,0.204822,4.0,1
4,60001,4053,0.158046,0.973937,0.182523,5.0,1
...,...,...,...,...,...,...,...
59615,60049,80250,0.000000,0.000000,0.000000,0.0,0
59616,60049,56436,0.000000,0.000000,0.000000,0.0,0
59617,60049,24364,0.000000,0.000000,0.000000,0.0,0
59618,60049,65766,0.000000,0.000000,0.000000,0.0,0


In [21]:
del train_data

## MODELING

In [22]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [23]:
# Prepare input features
# Prepare input features (remove 'quantity')

features = ['customer_id',	'product_id',	'quantity_score',	'recency_score',	'final_score',	'rank']

# Extract input features and target
X = customer_product_data[features].values
y = customer_product_data['relevance'].values

scaler_original = StandardScaler()
scaler_full = StandardScaler()
X = scaler_original.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:

# Prepare full training data with embeddings using batch processing to avoid memory overload
def batch_process_train_data(train_data, product_embeddings, features, batch_size=1000):
    """Yield batches of combined features from train_data and product embeddings."""
    num_rows = len(train_data)
    for start_idx in range(0, num_rows, batch_size):
        end_idx = min(start_idx + batch_size, num_rows)
        batch_rows = train_data.iloc[start_idx:end_idx]
        batch_combined_features = []

        for _, row in batch_rows.iterrows():
            product_embedding = product_embeddings.get('Product_' + row['product_id'])
            if product_embedding is not None:
                combined_features = np.hstack([row[features].values.astype(float).flatten(), product_embedding.flatten()])
                batch_combined_features.append(combined_features)

        yield np.array(batch_combined_features, dtype=np.float32)

# Using the batch generator to process the data and store in memory-efficient way
train_data_with_embeddings = []
for batch_combined_features in tqdm(batch_process_train_data(customer_product_data, product_embedding_dict, features), desc='Processing training data in batches'):
    train_data_with_embeddings.extend(batch_combined_features)

Processing training data in batches: 60it [00:27,  2.20it/s]


In [25]:
len(train_data_with_embeddings)

59620

In [61]:
X_train_full = np.array(train_data_with_embeddings, dtype=np.float32)
X_train_full = scaler_full.fit_transform(X_train_full)

# Split training and validation data
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y[:len(X_train_full)], test_size=0.2, random_state=42)

# Build model
input_dim = X_train_full.shape[1]
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(input_dim,)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0035), loss='binary_crossentropy', metrics=['AUC'])

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=25, batch_size=256, verbose=1, callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])




Epoch 1/25
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - AUC: 0.6064 - loss: 4.3430 - val_AUC: 0.5441 - val_loss: 6.4194 - learning_rate: 0.0035
Epoch 2/25
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - AUC: 0.5664 - loss: 4.1871 - val_AUC: 0.5960 - val_loss: 3.6973 - learning_rate: 0.0035
Epoch 3/25
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - AUC: 0.5741 - loss: 4.3106 - val_AUC: 0.5253 - val_loss: 4.2603 - learning_rate: 0.0035
Epoch 4/25
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - AUC: 0.6183 - loss: 4.7249 - val_AUC: 0.6857 - val_loss: 3.9642 - learning_rate: 0.0035
Epoch 5/25
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - AUC: 0.6187 - loss: 4.3762 - val_AUC: 0.5909 - val_loss: 4.5413 - learning_rate: 0.0035
Epoch 6/25
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - AUC: 0.5965 - loss: 4.7688 - val_AUC: 

<keras.src.callbacks.history.History at 0x795d62f67370>

## EVALUATION



In [48]:
# Prepare test set for Hit Rate @10
def hitrate_at_k(true_data: pd.DataFrame, predicted_data: pd.DataFrame, k: int = 10) -> float:
    """
    This function calculates the hitrate at k for the recommendations.
    It assesses how relevant our k product recommendations are.
    In other words, it calculates the proportion of recommended products that are actually purchased by the customer.

    Args:
        true_data: a pandas DataFrame containing the true data
            customer_id: the customer identifier
            product_id: the product identifier that was purchased in the test set
        predicted_data: a pandas DataFrame containing the predicted data
            customer_id: the customer identifier
            product_id: the product identifier that was recommended
            rank: the rank of the recommendation. the rank should be between 1 and k.
        k: the number of recommendations to consider. k should be between 1 and 10.

    Returns:
        The hitrate at k
    """
    data = pd.merge(left=true_data, right=predicted_data, how="left", on=["customer_id", "product_id"])
    df = data[data["rank"] <= k]
    non_null_counts = df.groupby('customer_id')['rank'].apply(lambda x: x.notna().sum()).reset_index(name='non_null_count')
    distinct_products_per_customer = data.groupby('customer_id')['product_id'].nunique().reset_index(name='distinct_product_count')
    df = pd.merge(left=distinct_products_per_customer, right=non_null_counts, how="left", on="customer_id")
    df["denominator"] = [min(df.iloc[i].distinct_product_count, k) for i in range(len(df))]
    df = df.fillna(0)
    return (df["non_null_count"] / df["denominator"]).mean()


In [49]:
# prompt: remove 'Household_' of all values in customer_ids_new array

import re

def remove_household_prefix(customer_id):
  return re.sub(r'^Household_', '', str(customer_id))

customer_ids_new = [remove_household_prefix(customer_id) for customer_id in customer_ids_new]

In [62]:

# Define customer IDs

# Prepare predicted data for hit rate calculation
predicted_data = []
for customer_id in tqdm(customer_ids_new, desc='Processing customers'):
    #customer_purchases = test_data[test_data['customer_id'] == customer_id]
    #customer_purchases = filter_top_30_products(train_data[train_data['customer_id'] == customer_id], train_data)

    customer_purchases = customer_product_data[customer_product_data['customer_id'] == customer_id]
    customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
    customer_filtered = customer_product_data[customer_product_data['customer_id'] == customer_id]
    if not customer_filtered.empty:
        customer_features = customer_filtered[features].iloc[0].values
        customer_features = scaler_original.transform([customer_features.astype(float)])
        scores = []
        for _, row in customer_purchases.iterrows():
            product_embedding = product_embedding_dict.get('Product_'+row['product_id'])
            if product_embedding is not None:
                input_features = np.hstack([customer_features, product_embedding])
                score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
                scores.append(('Product_'+row['product_id'], score))
        top_10 = []
        seen_products = set()
        for product_id, score in sorted(scores, key=lambda x: x[1], reverse=True):
            if product_id not in seen_products:
                top_10.append((product_id, len(top_10) + 1))
                seen_products.add(product_id)
            if len(top_10) == 10:
                break
        predicted_data.extend([('Household_'+customer_id, product_id, rank) for product_id, rank in top_10])

predicted_df = pd.DataFrame(predicted_data, columns=['customer_id', 'product_id', 'rank'])

# Calculate Hit Rate @10 using new function
hit_rate_at_10 = hitrate_at_k(test_data[['customer_id', 'product_id']], predicted_df, k=10)
print('Hit Rate @10:', hit_rate_at_10)

  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['rank'] <= 50) & (customer_product_data['rank'] > 0) ]
  customer_purchases = customer_purchases[(customer_purchases['r

Hit Rate @10: 0.20974246841593783





In [34]:
# Create submission file from existing predictions
submission_df = predicted_df[predicted_df['customer_id'].isin([f"Household_{i}" for i in range(1, 100001)])]
prediction = submission_df



In [35]:
household_16874_df = test_data[test_data['customer_id'] == 'Household_60086']
household_16874_df3 = submission_df[submission_df['customer_id'] == 'Household_60086']
common_products = set(household_16874_df['product_id']).intersection(set(household_16874_df3['product_id']))
print(f"Number of common products: {len(common_products)}")
similarity_ratio = len(common_products) / len(household_16874_df['product_id']) if len(household_16874_df['product_id']) > 0 else 0
print(f"Similarity Ratio: {similarity_ratio}")

if similarity_ratio > 0.5:  # Adjust threshold as needed
    print("The product IDs in the two dataframes are relatively similar.")
else:
    print("The product IDs in the two dataframes are not very similar.")

Number of common products: 0
Similarity Ratio: 0
The product IDs in the two dataframes are not very similar.
