## IMPORTS

In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from google.colab import drive
drive.mount('/content/gdrive')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Import necessary libraries

import torch

# Load pre-trained BERT model and tokenizer
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

import os
for dirname, _, filenames in os.walk(f'{trainFolder}'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/test_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_1.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_10.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_2.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_3.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_4.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_5.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_6.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_7.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_8.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_par

In [39]:
# Import libraries
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

### Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product name
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### Load data

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [40]:
# This code reads the data from CSV files named "train_data_part_i.csv" for all i from 1 to 10
# and concatenates them into a single pandas DataFrame
train_dataframes = []
for i in tqdm(range(1, 11)): ## 1 - 11
    train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_part_{i}.csv'))
train_data = pd.concat(train_dataframes, ignore_index=True)

# free up memory by deleting the dataframes we no longer need
del train_dataframes

100%|██████████| 10/10 [02:14<00:00, 13.41s/it]


In [41]:
#train_data = train_data.sample(frac=0.1, random_state=1)  # frac=0.1 takes 10% of the data

In [42]:
# This code reads the data from a CSV file named "products_data.csv" into a pandas DataFrame
products_data = pd.read_csv(f'{trainFolder}/products_data.csv')

  products_data = pd.read_csv(f'{trainFolder}/products_data.csv')


In [43]:
# This code reads the data from a CSV file named "test_data.csv" into a pandas DataFrame
test_data = pd.read_csv(f'{trainFolder}/test_data.csv')

In [44]:
# Load product embeddings
product_embeddings_np = np.load(f'{trainFolder}/product_embeddings.npy')

# Load product IDs
product_ids = np.load(f'{trainFolder}/product_ids.npy', allow_pickle=True)  # Use allow_pickle=True if product_ids is an array of objects (e.g., strings)

# Create the product embedding dictionary
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

In [45]:
customer_ids = test_data['customer_id'].unique()[:1000]

In [46]:
# prompt: keep in train_data and test_data the rows that have the cusomer_id in customer_ids

# Filter train_data
train_data = train_data[train_data['customer_id'].isin(customer_ids)]

# Filter test_data
test_data = test_data[test_data['customer_id'].isin(customer_ids)]

In [47]:
train_data

Unnamed: 0,date,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel
3468,2022-01-18,Transaction_1883261,Household_1853,Product_47313,0,Store_23,0,2.0,DRIVE,MOBILE_APP
5560,2023-11-14,Transaction_2591825,Household_9131,Product_40168,1,Store_56,0,1.0,DRIVE,WEBSITE
5561,2023-11-14,Transaction_2591825,Household_9131,Product_19309,1,Store_56,0,1.0,DRIVE,WEBSITE
5562,2023-05-22,Transaction_645993,Household_9131,Product_81032,1,Store_56,0,2.0,DRIVE,MOBILE_APP
5682,2023-02-08,Transaction_1203128,Household_1853,Product_65669,0,Store_60,0,1.0,CLCV,WEBSITE
...,...,...,...,...,...,...,...,...,...,...
69866958,2023-07-03,Transaction_2316563,Household_72167,Product_17608,1,Store_2523,0,1.0,DRIVE,MOBILE_APP
69867277,2022-01-11,Transaction_980909,Household_78185,Product_49495,0,Store_2523,0,1.0,DRIVE,MOBILE_APP
69867278,2023-06-09,Transaction_2028242,Household_78185,Product_42259,0,Store_2523,0,1.0,DRIVE,WEBSITE
69867279,2023-06-09,Transaction_2028242,Household_78185,Product_41041,0,Store_2523,0,1.0,DRIVE,WEBSITE


In [48]:
# Ensure all embeddings are numpy arrays of floats
for key in product_embedding_dict:
    product_embedding_dict[key] = np.array(product_embedding_dict[key], dtype=np.float32)

## Training Data Preproccesing

In [49]:
train_data['date'] = pd.to_datetime(train_data['date'])

# Add recency attribute
latest_date = train_data['date'].max()  # Find the latest date in the dataset
train_data['recency'] = (latest_date - train_data['date']).dt.days  # Calculate days since last purchase

# Group by customer_id and product_id to calculate quantity and most recent purchase
customer_product_data = train_data.groupby(['customer_id', 'product_id']).agg({
    'quantity': 'sum',
    'recency': 'min'  # Minimum days since purchase (most recent)
}).reset_index()

# Normalize quantity and recency scores
customer_product_data['quantity_score'] = customer_product_data['quantity'] / customer_product_data['quantity'].max()
customer_product_data['recency_score'] = 1 - (customer_product_data['recency'] / customer_product_data['recency'].max())  # Recent = higher score

best_alpha = 0.97

# Calculate the final score with this alpha value
customer_product_data['final_score'] = (
    best_alpha * customer_product_data['quantity_score'] +
    (1 - best_alpha) * customer_product_data['recency_score']
)

# Rank products for each customer with unique ranks
customer_product_data['rank'] = customer_product_data.sort_values(
    by=['customer_id', 'final_score', 'quantity', 'product_id'],  # Add secondary sort keys
    ascending=[True, False, False, True]  # Ensure consistent order
).groupby('customer_id').cumcount() + 1  # Assign unique incremental ranks


# Sort the DataFrame by customer_id and rank
customer_product_data = customer_product_data.sort_values(by=['customer_id', 'rank'])

# Filter out products with a rank greater than 200 for each customer
#customer_product_data = customer_product_data[customer_product_data['rank'] <= 20]


In [50]:
customer_product_data

Unnamed: 0,customer_id,product_id,quantity,recency,quantity_score,recency_score,final_score,rank
148,Household_10041,Product_57942,5.0,34,0.001472,0.953361,0.030029,1
176,Household_10041,Product_69094,2.0,34,0.000589,0.953361,0.029172,2
31,Household_10041,Product_19843,1.0,34,0.000294,0.953361,0.028886,3
41,Household_10041,Product_22731,1.0,34,0.000294,0.953361,0.028886,4
78,Household_10041,Product_35185,1.0,34,0.000294,0.953361,0.028886,5
...,...,...,...,...,...,...,...,...
336384,Household_9688,Product_5077,1.0,666,0.000294,0.086420,0.002878,236
336401,Household_9688,Product_55053,1.0,666,0.000294,0.086420,0.002878,237
336462,Household_9688,Product_73153,1.0,666,0.000294,0.086420,0.002878,238
336466,Household_9688,Product_74465,1.0,666,0.000294,0.086420,0.002878,239


In [51]:


# Extract temporal features
train_data['month'] = train_data['date'].dt.month
train_data['day_of_week'] = train_data['date'].dt.dayofweek
train_data['is_weekend'] = train_data['day_of_week'].isin([5, 6]).astype(int)

# Calculate recency: days since the last purchase for each customer
train_data['days_since_last_purchase'] = train_data.groupby('customer_id')['date'].transform(
    lambda x: (x.max() - x).dt.days
)

# Drop the 'date' column if it's no longer needed
train_data = train_data.drop(columns=['date'], errors='ignore')

In [52]:
# Aggregate customer-specific features
customer_features = train_data.groupby('customer_id').agg(
    purchase_frequency=('product_id', 'count'),  # Total number of purchases
    avg_quantity=('quantity', 'mean'),          # Average quantity purchased
    promo_ratio=('is_promo', 'mean'),           # Proportion of purchases made with promotions
    unique_products=('product_id', 'nunique')   # Number of unique products purchased
).reset_index()

# Merge these features into train_data and test_data
train_data = train_data.merge(customer_features, on='customer_id', how='left')

In [53]:
# Check for missing values
print("Missing values in train_data:")
print(train_data.isnull().sum())

print("Missing values in test_data:")
print(test_data.isnull().sum())

# Fill remaining missing values with zeros as a safety measure
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

Missing values in train_data:
transaction_id              0
customer_id                 0
product_id                  0
has_loyality_card           0
store_id                    0
is_promo                    0
quantity                    0
format                      0
order_channel               0
recency                     0
month                       0
day_of_week                 0
is_weekend                  0
days_since_last_purchase    0
purchase_frequency          0
avg_quantity                0
promo_ratio                 0
unique_products             0
dtype: int64
Missing values in test_data:
transaction_id    0
customer_id       0
product_id        0
dtype: int64


In [54]:
train_data.head()

Unnamed: 0,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel,recency,month,day_of_week,is_weekend,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,unique_products
0,Transaction_1883261,Household_1853,Product_47313,0,Store_23,0,2.0,DRIVE,MOBILE_APP,712,1,1,0,579,584,1.436644,0.07363,298
1,Transaction_2591825,Household_9131,Product_40168,1,Store_56,0,1.0,DRIVE,WEBSITE,47,11,1,0,43,179,1.195531,0.01676,156
2,Transaction_2591825,Household_9131,Product_19309,1,Store_56,0,1.0,DRIVE,WEBSITE,47,11,1,0,43,179,1.195531,0.01676,156
3,Transaction_645993,Household_9131,Product_81032,1,Store_56,0,2.0,DRIVE,MOBILE_APP,223,5,0,0,219,179,1.195531,0.01676,156
4,Transaction_1203128,Household_1853,Product_65669,0,Store_60,0,1.0,CLCV,WEBSITE,326,2,2,0,193,584,1.436644,0.07363,298


In [55]:
train_data = train_data.drop(columns=['transaction_id','has_loyality_card', 'store_id', 'day_of_week','is_weekend' ,'format','order_channel','month','unique_products'])

In [56]:
train_data.head()

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio
0,Household_1853,Product_47313,0,2.0,712,579,584,1.436644,0.07363
1,Household_9131,Product_40168,0,1.0,47,43,179,1.195531,0.01676
2,Household_9131,Product_19309,0,1.0,47,43,179,1.195531,0.01676
3,Household_9131,Product_81032,0,2.0,223,219,179,1.195531,0.01676
4,Household_1853,Product_65669,0,1.0,326,193,584,1.436644,0.07363


## MODELING

In [57]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [58]:
# Prepare input features
features = ['is_promo','quantity', 'days_since_last_purchase', 'purchase_frequency', 'avg_quantity', 'promo_ratio']

X = train_data[features].values
y = train_data['quantity'].values

scaler_original = StandardScaler()
scaler_full = StandardScaler()
X = scaler_original.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:

# Prepare full training data with embeddings using batch processing to avoid memory overload
def batch_process_train_data(train_data, product_embeddings, features, batch_size=1000):
    """Yield batches of combined features from train_data and product embeddings."""
    num_rows = len(train_data)
    for start_idx in range(0, num_rows, batch_size):
        end_idx = min(start_idx + batch_size, num_rows)
        batch_rows = train_data.iloc[start_idx:end_idx]
        batch_combined_features = []

        for _, row in batch_rows.iterrows():
            product_embedding = product_embeddings.get(row['product_id'])
            if product_embedding is not None:
                combined_features = np.hstack([row[features].values.astype(float).flatten(), product_embedding.flatten()])
                batch_combined_features.append(combined_features)

        yield np.array(batch_combined_features, dtype=np.float32)

# Using the batch generator to process the data and store in memory-efficient way
train_data_with_embeddings = []
for batch_combined_features in tqdm(batch_process_train_data(train_data, product_embedding_dict, features), desc='Processing training data in batches'):
    train_data_with_embeddings.extend(batch_combined_features)

Processing training data in batches: 887it [04:51,  3.04it/s]


In [60]:
len(train_data_with_embeddings[0])

774

In [73]:
X_train_full = np.array(train_data_with_embeddings, dtype=np.float32)
X_train_full = scaler_full.fit_transform(X_train_full)

# Split training and validation data
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y[:len(X_train_full)], test_size=0.2, random_state=42)

# Build model
input_dim = X_train_full.shape[1]
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(input_dim,)),
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(1, activation='linear')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=25, batch_size=256, verbose=1, callbacks=[tf.keras.callbacks.ReduceLROnPlateau()]




Epoch 1/25


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 52ms/step - accuracy: 0.0278 - loss: 1.6559 - val_accuracy: 0.0000e+00 - val_loss: 1.1876e-04 - learning_rate: 0.0010
Epoch 2/25
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 2.9732e-05 - val_accuracy: 0.0000e+00 - val_loss: 1.7886e-07 - learning_rate: 0.0010
Epoch 3/25
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.0000e+00 - loss: 1.8024e-07 - val_accuracy: 0.0000e+00 - val_loss: 1.8770e-07 - learning_rate: 0.0010
Epoch 4/25
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.0000e+00 - loss: 1.8813e-07 - val_accuracy: 0.0000e+00 - val_loss: 1.8509e-07 - learning_rate: 0.0010
Epoch 5/25
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.0000e+00 - loss: 1.8758e-07 - val_accuracy: 0.0000e+00 - val_loss: 1.8867e-07 - learning_rate: 0.0010
Ep

<keras.src.callbacks.history.History at 0x7d184b240850>

## EVALUATION



In [74]:
# Prepare test set for Hit Rate @10
def hitrate_at_k(true_data: pd.DataFrame, predicted_data: pd.DataFrame, k: int = 10) -> float:
    """
    This function calculates the hitrate at k for the recommendations.
    It assesses how relevant our k product recommendations are.
    In other words, it calculates the proportion of recommended products that are actually purchased by the customer.

    Args:
        true_data: a pandas DataFrame containing the true data
            customer_id: the customer identifier
            product_id: the product identifier that was purchased in the test set
        predicted_data: a pandas DataFrame containing the predicted data
            customer_id: the customer identifier
            product_id: the product identifier that was recommended
            rank: the rank of the recommendation. the rank should be between 1 and k.
        k: the number of recommendations to consider. k should be between 1 and 10.

    Returns:
        The hitrate at k
    """
    data = pd.merge(left=true_data, right=predicted_data, how="left", on=["customer_id", "product_id"])
    df = data[data["rank"] <= k]
    non_null_counts = df.groupby('customer_id')['rank'].apply(lambda x: x.notna().sum()).reset_index(name='non_null_count')
    distinct_products_per_customer = data.groupby('customer_id')['product_id'].nunique().reset_index(name='distinct_product_count')
    df = pd.merge(left=distinct_products_per_customer, right=non_null_counts, how="left", on="customer_id")
    df["denominator"] = [min(df.iloc[i].distinct_product_count, k) for i in range(len(df))]
    df = df.fillna(0)
    return (df["non_null_count"] / df["denominator"]).mean()


In [76]:

# Define customer IDs

# Prepare predicted data for hit rate calculation
predicted_data = []
for customer_id in tqdm(customer_ids, desc='Processing customers'):
    #customer_purchases = test_data[test_data['customer_id'] == customer_id]
    #customer_purchases = filter_top_30_products(train_data[train_data['customer_id'] == customer_id], train_data)
    customer_purchases = customer_product_data[customer_product_data['customer_id'] == customer_id]
    customer_purchases = customer_purchases[customer_purchases['rank'] <= 50]
    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    if not customer_filtered.empty:
        customer_features = customer_filtered[features].iloc[0].values
        customer_features = scaler_original.transform([customer_features.astype(float)])
        scores = []
        for _, row in customer_purchases.iterrows():
            product_embedding = product_embedding_dict.get(row['product_id'])
            if product_embedding is not None:
                input_features = np.hstack([customer_features, product_embedding])
                score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
                scores.append((row['product_id'], score))
        top_10 = []
        seen_products = set()
        for product_id, score in sorted(scores, key=lambda x: x[1], reverse=True):
            if product_id not in seen_products:
                top_10.append((product_id, len(top_10) + 1))
                seen_products.add(product_id)
            if len(top_10) == 10:
                break
        predicted_data.extend([(customer_id, product_id, rank) for product_id, rank in top_10])

predicted_df = pd.DataFrame(predicted_data, columns=['customer_id', 'product_id', 'rank'])

# Calculate Hit Rate @10 using new function
hit_rate_at_10 = hitrate_at_k(test_data[['customer_id', 'product_id']], predicted_df, k=10)
print('Hit Rate @10:', hit_rate_at_10)

Processing customers: 100%|██████████| 1000/1000 [52:54<00:00,  3.17s/it]

Hit Rate @10: 0.34513730158730166





In [64]:
# Create submission file from existing predictions
submission_df = predicted_df[predicted_df['customer_id'].isin([f"Household_{i}" for i in range(1, 100001)])]
prediction = submission_df



In [65]:
household_16874_df = test_data[test_data['customer_id'] == 'Household_16874']
household_16874_df3 = submission_df[submission_df['customer_id'] == 'Household_16874']
common_products = set(household_16874_df['product_id']).intersection(set(household_16874_df3['product_id']))
print(f"Number of common products: {len(common_products)}")
similarity_ratio = len(common_products) / len(household_16874_df['product_id']) if len(household_16874_df['product_id']) > 0 else 0
print(f"Similarity Ratio: {similarity_ratio}")

if similarity_ratio > 0.5:  # Adjust threshold as needed
    print("The product IDs in the two dataframes are relatively similar.")
else:
    print("The product IDs in the two dataframes are not very similar.")

Number of common products: 4
Similarity Ratio: 0.3333333333333333
The product IDs in the two dataframes are not very similar.


In [66]:
submission_df

Unnamed: 0,customer_id,product_id,rank
0,Household_16874,Product_11109,1
1,Household_16874,Product_71206,2
2,Household_16874,Product_16940,3
3,Household_16874,Product_73333,4
4,Household_16874,Product_13951,5
...,...,...,...
9995,Household_17344,Product_36019,6
9996,Household_17344,Product_79974,7
9997,Household_17344,Product_64825,8
9998,Household_17344,Product_33929,9


In [67]:
prediction

Unnamed: 0,customer_id,product_id,rank
0,Household_16874,Product_11109,1
1,Household_16874,Product_71206,2
2,Household_16874,Product_16940,3
3,Household_16874,Product_73333,4
4,Household_16874,Product_13951,5
...,...,...,...
9995,Household_17344,Product_36019,6
9996,Household_17344,Product_79974,7
9997,Household_17344,Product_64825,8
9998,Household_17344,Product_33929,9


In [68]:
submission_df

Unnamed: 0,customer_id,product_id,rank
0,Household_16874,Product_11109,1
1,Household_16874,Product_71206,2
2,Household_16874,Product_16940,3
3,Household_16874,Product_73333,4
4,Household_16874,Product_13951,5
...,...,...,...
9995,Household_17344,Product_36019,6
9996,Household_17344,Product_79974,7
9997,Household_17344,Product_64825,8
9998,Household_17344,Product_33929,9


In [69]:
# Process and format prediction
def process_and_format_prediction(df):
    # Replace invalid characters in column names
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Clean columns 'customer_id', 'product_id', and 'transaction_id'
    if 'customer_id' in df.columns and df['customer_id'].dtype == 'object':
        df['customer_id'] = df['customer_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'product_id' in df.columns and df['product_id'].dtype == 'object':
        df['product_id'] = df['product_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    df['id'] = df.index
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Group by 'customer_id' and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'id': 'first',
        'product_id': lambda x: ','.join(map(str, x)),
        'rank': lambda x: ','.join(map(str, x))
    }).reset_index()

    # Drop 'id' column if it exists
    if 'id' in prediction_grouped.columns:
        prediction_grouped = prediction_grouped.drop(columns=['id'])

    # Filter data
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


      id  customer_id                                         product_id  \
0      0           18  73736,74150,19636,45578,57129,42221,20926,5432...   
1      1           56  8576,65416,13055,40042,70196,55943,50349,13366...   
2      2          103  59962,6216,43071,74150,74484,6799,16940,35034,...   
3      3          278  51497,63301,61528,14434,55743,78358,22281,5608...   
4      4          296  52162,36019,20411,77081,15350,25814,53459,3090...   
..   ...          ...                                                ...   
995  995        79756  57651,36019,17626,57274,47309,43176,75494,6183...   
996  996        79772  80257,7941,33133,36403,73599,82214,62129,10052...   
997  997        79810  11791,21613,39751,32435,57478,10291,34772,4953...   
998  998        79834  41885,75217,16474,42910,75864,13814,57420,3840...   
999  999        79931  28248,36019,32922,27662,73282,30503,74484,3388...   

                     rank  
0    1,2,3,4,5,6,7,8,9,10  
1    1,2,3,4,5,6,7,8,9,10  
2  

In [70]:
# Create a .csv file to submit on kaggle
# A lancer en local sur votre ordinateur
prediction_grouped.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/submission_list_bert.csv', index=False) ####

In [71]:
predicted_df.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/predicted_df.csv', index=False) ####

## prev


In [None]:
# Prepare test set for Hit Rate @10
#customer_ids = test_data['customer_id'].unique()

# Prepare test set for Hit Rate @10

hit_count = 0
customer_total_hits = 0
for customer_id in tqdm(customer_ids, desc='Processing customers'):
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    if not customer_filtered.empty:
        customer_features = customer_filtered[features].iloc[0].values
    else:
        continue
    customer_features = scaler_original.transform([customer_features.astype(float)])
    scores = []
    for _, row in tqdm(customer_purchases.iterrows(), total=len(customer_purchases), desc='Processing products', leave=False):
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            scores.append((row['product_id'], score))
    top_10 = list(dict(sorted(scores, key=lambda x: x[1], reverse=True)).items())[:10]
    purchased_products = customer_purchases['product_id'].values
    hit_count += len(set(purchased_products) & set([product_id for product_id, _ in top_10]))
    customer_total_hits += 1 if len(set(purchased_products) & set([product_id for product_id, _ in top_10])) > 0 else 0

hit_rate_at_10 = customer_total_hits / len(customer_ids)
print('Hit Rate @10:', hit_rate_at_10)



Processing customers:   1%|          | 1/100 [00:00<00:25,  3.89it/s]
Processing products:   0%|          | 0/13 [00:00<?, ?it/s][A
Processing products:  15%|█▌        | 2/13 [00:00<00:00, 13.52it/s][A
Processing products:  31%|███       | 4/13 [00:00<00:00, 13.67it/s][A
Processing products:  46%|████▌     | 6/13 [00:00<00:00, 13.42it/s][A
Processing products:  62%|██████▏   | 8/13 [00:00<00:00, 13.23it/s][A
Processing products:  77%|███████▋  | 10/13 [00:00<00:00, 12.80it/s][A
Processing products:  92%|█████████▏| 12/13 [00:00<00:00, 12.76it/s][A
Processing customers:  14%|█▍        | 14/100 [00:04<00:18,  4.54it/s]
Processing products:   0%|          | 0/21 [00:00<?, ?it/s][A
Processing products:  10%|▉         | 2/21 [00:00<00:01, 14.13it/s][A
Processing products:  19%|█▉        | 4/21 [00:00<00:01, 14.20it/s][A
Processing products:  29%|██▊       | 6/21 [00:00<00:01, 13.90it/s][A
Processing products:  38%|███▊      | 8/21 [00:00<00:00, 13.94it/s][A
Processing products: 

Hit Rate @10: 0.13





In [None]:
# Create submission file for
submission_data = []
for customer_id in customer_ids:
    scores = []
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    for _, row in customer_purchases.iterrows():
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            scores.append((row['product_id'], score))
    top_10 = sorted(scores, key=lambda x: x[1], reverse=True)[:10]
    submission_data.extend([(customer_id, product_id, rank + 1) for rank, (product_id, _) in enumerate(top_10)])

# Convert to DataFrame and filter
submission_df = pd.DataFrame(submission_data, columns=['customer_id', 'product_id', 'rank'])
prediction = submission_df[submission_df.customer_id.isin([f"Household_{i}" for i in range(1, 100001)])]

# Process and format prediction
def process_and_format_prediction(df):
    # Replace invalid characters in column names
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Clean columns 'customer_id', 'product_id', and 'transaction_id'
    if 'customer_id' in df.columns and df['customer_id'].dtype == 'object':
        df['customer_id'] = df['customer_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'product_id' in df.columns and df['product_id'].dtype == 'object':
        df['product_id'] = df['product_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    df['id'] = df.index
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Group by 'customer_id' and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'id': 'first',
        'product_id': lambda x: ','.join(map(str, x)),
        'rank': lambda x: ','.join(map(str, x))
    }).reset_index()

    # Drop 'id' column if it exists
    if 'id' in prediction_grouped.columns:
        prediction_grouped = prediction_grouped.drop(columns=['id'])

    # Filter data
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Verify ranks and duplicates
    for index, row in prediction_grouped.iterrows():
        ranks = list(map(int, row['rank'].split(',')))
        if sorted(ranks) != list(range(1, 11)):
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = row['product_id'].split(',')
        if len(products) != len(set(products)):
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.

None


In [None]:
prediction_grouped.to_csv('submission/submission_list_bert_ft.csv', index=False) ####

AttributeError: 'NoneType' object has no attribute 'to_csv'