## IMPORTS

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from google.colab import drive
drive.mount('/content/gdrive')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Import necessary libraries

import torch

# Load pre-trained BERT model and tokenizer
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

import os
for dirname, _, filenames in os.walk(f'{trainFolder}'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Mounted at /content/gdrive
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/test_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_1.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_10.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_2.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_3.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_4.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_5.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_6.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_7.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_8.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_9.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_embe.csv
gdrive/My Drive/MLDM - Car

In [2]:
# Import libraries
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

### Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product name
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### Load data

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [3]:
# This code reads the data from CSV files named "train_data_part_i.csv" for all i from 1 to 10
# and concatenates them into a single pandas DataFrame
train_dataframes = []
for i in tqdm(range(1, 3)): ## 1 - 11
    train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_part_{i}.csv'))
train_data = pd.concat(train_dataframes, ignore_index=True)

# free up memory by deleting the dataframes we no longer need
del train_dataframes

100%|██████████| 2/2 [00:28<00:00, 14.41s/it]


In [4]:
#train_data = train_data.sample(frac=0.1, random_state=1)  # frac=0.1 takes 10% of the data

In [5]:
# This code reads the data from a CSV file named "products_data.csv" into a pandas DataFrame
products_data = pd.read_csv(f'{trainFolder}/products_data.csv')

  products_data = pd.read_csv(f'{trainFolder}/products_data.csv')


In [6]:
# This code reads the data from a CSV file named "test_data.csv" into a pandas DataFrame
test_data = pd.read_csv(f'{trainFolder}/test_data.csv')

In [7]:
# Load product embeddings
product_embeddings_np = np.load(f'{trainFolder}/product_embeddings.npy')

# Load product IDs
product_ids = np.load(f'{trainFolder}/product_ids.npy', allow_pickle=True)  # Use allow_pickle=True if product_ids is an array of objects (e.g., strings)

# Create the product embedding dictionary
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

In [8]:
customer_data_dict = {customer_id: data for customer_id, data in train_data.groupby('customer_id')}

In [9]:
# Ensure all embeddings are numpy arrays of floats
for key in product_embedding_dict:
    product_embedding_dict[key] = np.array(product_embedding_dict[key], dtype=np.float32)

## Training Data Preproccesing

In [10]:
# Ensure 'date' is in datetime format
train_data['date'] = pd.to_datetime(train_data['date'])

# Extract temporal features
train_data['month'] = train_data['date'].dt.month
train_data['day_of_week'] = train_data['date'].dt.dayofweek
train_data['is_weekend'] = train_data['day_of_week'].isin([5, 6]).astype(int)

# Calculate recency: days since the last purchase for each customer
train_data['days_since_last_purchase'] = train_data.groupby('customer_id')['date'].transform(
    lambda x: (x.max() - x).dt.days
)

# Drop the 'date' column if it's no longer needed
train_data = train_data.drop(columns=['date'], errors='ignore')

In [11]:
# Aggregate customer-specific features
customer_features = train_data.groupby('customer_id').agg(
    purchase_frequency=('product_id', 'count'),  # Total number of purchases
    avg_quantity=('quantity', 'mean'),          # Average quantity purchased
    promo_ratio=('is_promo', 'mean'),           # Proportion of purchases made with promotions
    unique_products=('product_id', 'nunique')   # Number of unique products purchased
).reset_index()

# Merge these features into train_data and test_data
train_data = train_data.merge(customer_features, on='customer_id', how='left')

In [12]:
# Check for missing values
print("Missing values in train_data:")
print(train_data.isnull().sum())

print("Missing values in test_data:")
print(test_data.isnull().sum())

# Fill remaining missing values with zeros as a safety measure
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

Missing values in train_data:
transaction_id              0
customer_id                 0
product_id                  0
has_loyality_card           0
store_id                    0
is_promo                    0
quantity                    0
format                      0
order_channel               0
month                       0
day_of_week                 0
is_weekend                  0
days_since_last_purchase    0
purchase_frequency          0
avg_quantity                0
promo_ratio                 0
unique_products             0
dtype: int64
Missing values in test_data:
transaction_id    0
customer_id       0
product_id        0
dtype: int64


In [13]:
train_data.head()

Unnamed: 0,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel,month,day_of_week,is_weekend,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,unique_products
0,Transaction_1730125,Household_39,Product_5362,0,Store_2,0,1.0,DRIVE,MOBILE_APP,11,2,0,0,282,1.219858,0.014184,185
1,Transaction_1560535,Household_39,Product_67174,0,Store_2,0,2.0,DRIVE,WEBSITE,7,2,0,483,282,1.219858,0.014184,185
2,Transaction_1560535,Household_39,Product_82254,0,Store_2,0,2.0,DRIVE,WEBSITE,7,2,0,483,282,1.219858,0.014184,185
3,Transaction_1730125,Household_39,Product_3895,0,Store_2,0,1.0,DRIVE,MOBILE_APP,11,2,0,0,282,1.219858,0.014184,185
4,Transaction_1560535,Household_39,Product_34014,0,Store_2,0,1.0,DRIVE,WEBSITE,7,2,0,483,282,1.219858,0.014184,185


In [14]:
train_data = train_data.drop(columns=['has_loyality_card', 'store_id', 'day_of_week','is_weekend' ,'format','order_channel','month'])

In [15]:
train_data.head()

Unnamed: 0,transaction_id,customer_id,product_id,is_promo,quantity,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,unique_products
0,Transaction_1730125,Household_39,Product_5362,0,1.0,0,282,1.219858,0.014184,185
1,Transaction_1560535,Household_39,Product_67174,0,2.0,483,282,1.219858,0.014184,185
2,Transaction_1560535,Household_39,Product_82254,0,2.0,483,282,1.219858,0.014184,185
3,Transaction_1730125,Household_39,Product_3895,0,1.0,0,282,1.219858,0.014184,185
4,Transaction_1560535,Household_39,Product_34014,0,1.0,483,282,1.219858,0.014184,185


In [16]:
customer_data_dict = {customer_id: data for customer_id, data in train_data.groupby('customer_id')}

In [17]:
customer_data_dict

Output hidden; open in https://colab.research.google.com to view.

## MODELING

In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [19]:
# Prepare input features
features = ['is_promo', 'quantity', 'days_since_last_purchase', 'purchase_frequency', 'avg_quantity', 'promo_ratio', 'unique_products']

X = train_data[features].values
y = train_data['quantity'].values

scaler_original = StandardScaler()
scaler_full = StandardScaler()
X = scaler_original.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Train model
train_data_with_embeddings = []
for _, row in tqdm(train_data.iterrows(), total=len(train_data), desc='Preparing training data'):
    product_embedding = product_embedding_dict.get(row['product_id'])
    if product_embedding is not None:
        combined_features = np.hstack([row[features].values.astype(float).flatten(), product_embedding.flatten()])
        train_data_with_embeddings.append(combined_features)


Preparing training data: 100%|██████████| 17514314/17514314 [2:05:50<00:00, 2319.55it/s]


In [21]:
X_train_full = np.array(train_data_with_embeddings, dtype=np.float32)
X_train_full = scaler_full.fit_transform(X_train_full)

# Split training and validation data
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y[:len(X_train_full)], test_size=0.2, random_state=42)

# Build model
input_dim = X_train_full.shape[1]
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(input_dim,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear')
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])



In [22]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=256, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b8590c6e200>

In [29]:
model.save('gdrive/My Drive/MLDM - Carrefour Project/models/nn/trained_recommender_model.h5')

  saving_api.save_model(


## EVALUATION



In [30]:
# Prepare test set for Hit Rate @10
def hitrate_at_k(true_data: pd.DataFrame, predicted_data: pd.DataFrame, k: int = 10) -> float:
    """
    This function calculates the hitrate at k for the recommendations.
    It assesses how relevant our k product recommendations are.
    In other words, it calculates the proportion of recommended products that are actually purchased by the customer.

    Args:
        true_data: a pandas DataFrame containing the true data
            customer_id: the customer identifier
            product_id: the product identifier that was purchased in the test set
        predicted_data: a pandas DataFrame containing the predicted data
            customer_id: the customer identifier
            product_id: the product identifier that was recommended
            rank: the rank of the recommendation. the rank should be between 1 and k.
        k: the number of recommendations to consider. k should be between 1 and 10.

    Returns:
        The hitrate at k
    """
    data = pd.merge(left=true_data, right=predicted_data, how="left", on=["customer_id", "product_id"])
    df = data[data["rank"] <= k]
    non_null_counts = df.groupby('customer_id')['rank'].apply(lambda x: x.notna().sum()).reset_index(name='non_null_count')
    distinct_products_per_customer = data.groupby('customer_id')['product_id'].nunique().reset_index(name='distinct_product_count')
    df = pd.merge(left=distinct_products_per_customer, right=non_null_counts, how="left", on="customer_id")
    df["denominator"] = [min(df.iloc[i].distinct_product_count, k) for i in range(len(df))]
    df = df.fillna(0)
    return (df["non_null_count"] / df["denominator"]).mean()


In [31]:

# Define customer IDs
customer_ids = test_data['customer_id'].unique()[:20000]

# Prepare predicted data for hit rate calculation
predicted_data = []
for customer_id in tqdm(customer_ids, desc='Processing customers'):
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    if not customer_filtered.empty:
        customer_features = customer_filtered[features].iloc[0].values
        customer_features = scaler_original.transform([customer_features.astype(float)])
        scores = []
        for _, row in customer_purchases.iterrows():
            product_embedding = product_embedding_dict.get(row['product_id'])
            if product_embedding is not None:
                input_features = np.hstack([customer_features, product_embedding])
                score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
                scores.append((row['product_id'], score))
        top_10 = []
        seen_products = set()
        for product_id, score in sorted(scores, key=lambda x: x[1], reverse=True):
            if product_id not in seen_products:
                top_10.append((product_id, len(top_10) + 1))
                seen_products.add(product_id)
            if len(top_10) == 10:
                break
        predicted_data.extend([(customer_id, product_id, rank) for product_id, rank in top_10])

predicted_df = pd.DataFrame(predicted_data, columns=['customer_id', 'product_id', 'rank'])

# Calculate Hit Rate @10 using new function
hit_rate_at_10 = hitrate_at_k(test_data[['customer_id', 'product_id']], predicted_df, k=10)
print('Hit Rate @10:', hit_rate_at_10)

Processing customers: 100%|██████████| 20000/20000 [7:40:36<00:00,  1.38s/it]


Hit Rate @10: 0.0621125


In [32]:
predicted_df

Unnamed: 0,customer_id,product_id,rank
0,Household_16874,Product_7217,1
1,Household_16874,Product_57011,2
2,Household_16874,Product_13951,3
3,Household_16874,Product_9790,4
4,Household_16874,Product_68295,5
...,...,...,...
47838,Household_15894,Product_13688,6
47839,Household_15894,Product_44844,7
47840,Household_15894,Product_6451,8
47841,Household_15894,Product_53017,9


In [42]:

#Create submission file for
# Limit customer_ids to the range we want for submission
customer_ids = test_data['customer_id'].unique()[:20000]


# Create submission file from existing predictions
submission_df = predicted_df[predicted_df['customer_id'].isin([f"Household_{i}" for i in range(1, 100001)])]
prediction = submission_df


In [43]:
prediction

Unnamed: 0,customer_id,product_id,rank
0,Household_16874,Product_7217,1
1,Household_16874,Product_57011,2
2,Household_16874,Product_13951,3
3,Household_16874,Product_9790,4
4,Household_16874,Product_68295,5
...,...,...,...
47838,Household_15894,Product_13688,6
47839,Household_15894,Product_44844,7
47840,Household_15894,Product_6451,8
47841,Household_15894,Product_53017,9


In [44]:
submission_df

Unnamed: 0,customer_id,product_id,rank
0,Household_16874,Product_7217,1
1,Household_16874,Product_57011,2
2,Household_16874,Product_13951,3
3,Household_16874,Product_9790,4
4,Household_16874,Product_68295,5
...,...,...,...
47838,Household_15894,Product_13688,6
47839,Household_15894,Product_44844,7
47840,Household_15894,Product_6451,8
47841,Household_15894,Product_53017,9


In [46]:
# Process and format prediction
def process_and_format_prediction(df):
    # Replace invalid characters in column names
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Clean columns 'customer_id', 'product_id', and 'transaction_id'
    if 'customer_id' in df.columns and df['customer_id'].dtype == 'object':
        df['customer_id'] = df['customer_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'product_id' in df.columns and df['product_id'].dtype == 'object':
        df['product_id'] = df['product_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    df['id'] = df.index
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Group by 'customer_id' and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'id': 'first',
        'product_id': lambda x: ','.join(map(str, x)),
        'rank': lambda x: ','.join(map(str, x))
    }).reset_index()

    # Drop 'id' column if it exists
    if 'id' in prediction_grouped.columns:
        prediction_grouped = prediction_grouped.drop(columns=['id'])

    # Filter data
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Verify ranks and duplicates
    for index, row in prediction_grouped.iterrows():
        ranks = list(map(int, row['rank'].split(',')))
        if sorted(ranks) != list(range(1, 11)):
            print(ranks)
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = row['product_id'].split(',')
        if len(products) != len(set(products)):
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


        id  customer_id                                         product_id  \
0        0            1  23579,72217,35730,57011,20576,49682,45719,170,...   
1        1            7  76562,32823,38669,7246,11724,67034,45965,35546...   
2        2           10  580,74328,63157,44048,29639,45631,5769,55676,4...   
3        3           12                11764,21485,26095,20421,38437,73342   
4        4           13  44525,72561,78384,10913,52577,22438,20181,4754...   
...    ...          ...                                                ...   
4964  4964        19988  52231,32226,50658,31751,48336,7417,1370,9035,5...   
4965  4965        19989  20709,15311,77319,11716,79869,55541,30339,6829...   
4966  4966        19996  7217,62948,11257,54404,50980,24759,23586,57314...   
4967  4967        19997  55846,39415,37408,61839,74335,33779,39698,1346...   
4968  4968        20000  1999,51040,41586,30992,80497,46586,57042,57942...   

                      rank  
0     1,2,3,4,5,6,7,8,9,10  
1    

In [47]:
# Create a .csv file to submit on kaggle
# A lancer en local sur votre ordinateur
prediction_grouped.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/submission_list_bert.csv', index=False) ####

In [36]:
predicted_df.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/predicted_df.csv', index=False) ####

## prev


In [48]:
# Prepare test set for Hit Rate @10
#customer_ids = test_data['customer_id'].unique()

# Prepare test set for Hit Rate @10
customer_ids = test_data['customer_id'].unique()[:100]

hit_count = 0
customer_total_hits = 0
for customer_id in tqdm(customer_ids, desc='Processing customers'):
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    if not customer_filtered.empty:
        customer_features = customer_filtered[features].iloc[0].values
    else:
        continue
    customer_features = scaler_original.transform([customer_features.astype(float)])
    scores = []
    for _, row in tqdm(customer_purchases.iterrows(), total=len(customer_purchases), desc='Processing products', leave=False):
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            scores.append((row['product_id'], score))
    top_10 = list(dict(sorted(scores, key=lambda x: x[1], reverse=True)).items())[:10]
    purchased_products = customer_purchases['product_id'].values
    hit_count += len(set(purchased_products) & set([product_id for product_id, _ in top_10]))
    customer_total_hits += 1 if len(set(purchased_products) & set([product_id for product_id, _ in top_10])) > 0 else 0

hit_rate_at_10 = customer_total_hits / len(customer_ids)
print('Hit Rate @10:', hit_rate_at_10)



Processing customers:   1%|          | 1/100 [00:00<00:25,  3.89it/s]
Processing products:   0%|          | 0/13 [00:00<?, ?it/s][A
Processing products:  15%|█▌        | 2/13 [00:00<00:00, 13.52it/s][A
Processing products:  31%|███       | 4/13 [00:00<00:00, 13.67it/s][A
Processing products:  46%|████▌     | 6/13 [00:00<00:00, 13.42it/s][A
Processing products:  62%|██████▏   | 8/13 [00:00<00:00, 13.23it/s][A
Processing products:  77%|███████▋  | 10/13 [00:00<00:00, 12.80it/s][A
Processing products:  92%|█████████▏| 12/13 [00:00<00:00, 12.76it/s][A
Processing customers:  14%|█▍        | 14/100 [00:04<00:18,  4.54it/s]
Processing products:   0%|          | 0/21 [00:00<?, ?it/s][A
Processing products:  10%|▉         | 2/21 [00:00<00:01, 14.13it/s][A
Processing products:  19%|█▉        | 4/21 [00:00<00:01, 14.20it/s][A
Processing products:  29%|██▊       | 6/21 [00:00<00:01, 13.90it/s][A
Processing products:  38%|███▊      | 8/21 [00:00<00:00, 13.94it/s][A
Processing products: 

Hit Rate @10: 0.13





In [49]:
# Create submission file for
submission_data = []
for customer_id in customer_ids:
    scores = []
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    for _, row in customer_purchases.iterrows():
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            scores.append((row['product_id'], score))
    top_10 = sorted(scores, key=lambda x: x[1], reverse=True)[:10]
    submission_data.extend([(customer_id, product_id, rank + 1) for rank, (product_id, _) in enumerate(top_10)])

# Convert to DataFrame and filter
submission_df = pd.DataFrame(submission_data, columns=['customer_id', 'product_id', 'rank'])
prediction = submission_df[submission_df.customer_id.isin([f"Household_{i}" for i in range(1, 100001)])]

# Process and format prediction
def process_and_format_prediction(df):
    # Replace invalid characters in column names
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Clean columns 'customer_id', 'product_id', and 'transaction_id'
    if 'customer_id' in df.columns and df['customer_id'].dtype == 'object':
        df['customer_id'] = df['customer_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'product_id' in df.columns and df['product_id'].dtype == 'object':
        df['product_id'] = df['product_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    df['id'] = df.index
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Group by 'customer_id' and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'id': 'first',
        'product_id': lambda x: ','.join(map(str, x)),
        'rank': lambda x: ','.join(map(str, x))
    }).reset_index()

    # Drop 'id' column if it exists
    if 'id' in prediction_grouped.columns:
        prediction_grouped = prediction_grouped.drop(columns=['id'])

    # Filter data
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Verify ranks and duplicates
    for index, row in prediction_grouped.iterrows():
        ranks = list(map(int, row['rank'].split(',')))
        if sorted(ranks) != list(range(1, 11)):
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = row['product_id'].split(',')
        if len(products) != len(set(products)):
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.

None


In [50]:
prediction_grouped.to_csv('submission/submission_list_bert_ft.csv', index=False) ####

AttributeError: 'NoneType' object has no attribute 'to_csv'