## IMPORTS

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from google.colab import drive
drive.mount('/content/gdrive')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Import necessary libraries

import torch

# Load pre-trained BERT model and tokenizer
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

import os
for dirname, _, filenames in os.walk(f'{trainFolder}'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/test_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_1.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_10.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_2.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_3.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_4.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_5.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_6.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_7.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_8.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_par

In [2]:
# Import libraries
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

### Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product name
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### Load data

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [3]:
# This code reads the data from CSV files named "train_data_part_i.csv" for all i from 1 to 10
# and concatenates them into a single pandas DataFrame
train_dataframes = []

#train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_all_9.csv'))
#train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_all_10.csv'))

#train_data = pd.concat(train_dataframes, ignore_index=True)

# free up memory by deleting the dataframes we no longer need
del train_dataframes

In [4]:
#train_data = train_data.sample(frac=0.1, random_state=1)  # frac=0.1 takes 10% of the data

In [5]:
# This code reads the data from a CSV file named "products_data.csv" into a pandas DataFrame
products_data = pd.read_csv(f'{trainFolder}/products_data.csv')

  products_data = pd.read_csv(f'{trainFolder}/products_data.csv')


In [6]:
# This code reads the data from a CSV file named "test_data.csv" into a pandas DataFrame
test_data = pd.read_csv(f'{trainFolder}/test_data.csv')

In [7]:
# Load product embeddings
product_embeddings_np = np.load(f'{trainFolder}/product_embeddings.npy')

# Load product IDs
product_ids = np.load(f'{trainFolder}/product_ids.npy', allow_pickle=True)  # Use allow_pickle=True if product_ids is an array of objects (e.g., strings)

# Create the product embedding dictionary
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

In [8]:
test_data

Unnamed: 0,transaction_id,customer_id,product_id
0,Transaction_2024_1,Household_16874,Product_9790
1,Transaction_2024_1,Household_16874,Product_68295
2,Transaction_2024_1,Household_16874,Product_19494
3,Transaction_2024_1,Household_16874,Product_11109
4,Transaction_2024_4,Household_9247,Product_57151
...,...,...,...
1220701,Transaction_2024_19277,Household_79999,Product_8951
1220702,Transaction_2024_19277,Household_79999,Product_9249
1220703,Transaction_2024_19277,Household_79999,Product_21485
1220704,Transaction_2024_81175,Household_80000,Product_74965


In [9]:
# Ensure all embeddings are numpy arrays of floats
for key in product_embedding_dict:
    product_embedding_dict[key] = np.array(product_embedding_dict[key], dtype=np.float32)

In [10]:

# Create a list of customer IDs within the specified range
#customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]
#customer_ids_new = [f'Household_{i}' for i in range(10001, 20000)]
customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]

## Training Data Preproccesing

In [11]:
import pandas as pd
import numpy as np

customer_product_data = np.load(os.path.join(trainFolder, 'customer_product_data_5.npy'), allow_pickle=True)



## MODELING

In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [13]:
# Prepare input features
# Prepare input features (remove 'quantity')

features = ['customer_id','product_id','is_promo', 'recency', 'purchase_frequency', 'avg_quantity', 'promo_ratio','month']


scaler_original = StandardScaler()
scaler_full = StandardScaler()


In [14]:
y = np.load(os.path.join(trainFolder, 'relevance_array_5.npy'), allow_pickle=True)

In [15]:
X_train_full = np.load(os.path.join(trainFolder, 'train_data_with_embeddings_final_52.npy'), allow_pickle=True)

In [16]:

scaler_original = StandardScaler()
scaler_full = StandardScaler()

#X_train_full = np.array(train_data_with_embeddings, dtype=np.float32)
#X_train_full = scaler_full.fit_transform(X_train_full)

# Split training and validation data
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y[:len(X_train_full)], test_size=0.2, random_state=42)

# Build model
input_dim = X_train_full.shape[1]

import gc
del X_train_full
gc.collect()



53

In [17]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(input_dim,)),
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


In [18]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])

callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
]

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=6, batch_size=256, verbose=1, callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.src.callbacks.History at 0x7d0ba77b9780>

In [19]:
# Save the model after training on the first batch
model.save('gdrive/My Drive/MLDM - Carrefour Project/models/model_checkpoint_05.keras')

## EVALUATION



In [20]:
# Save Recommendations to File
recommendations_file = 'gdrive/My Drive/MLDM - Carrefour Project/data-train/product_recommendations_combined.npy'

# Step 6: Load Recommendations from File
def load_recommendations():
    """Load recommendations from the saved file."""
    return np.load(recommendations_file, allow_pickle=True).item()

recommendations_data = load_recommendations()

# Step 7: Recommend Products from Loaded File
def recommend_from_loaded_file(product_id):
    """Get recommendations for a product ID from the preloaded recommendations."""
    try:
        return recommendations_data[product_id]
    except KeyError:
        return f"Product ID {product_id} not found in recommendations."

# Example Usage
example_product_id = "Product_33508"
print(f"Recommendations for {example_product_id}: {recommend_from_loaded_file(example_product_id)}")

Recommendations for Product_33508: ['Product_1370', 'Product_42748']


In [21]:
# Prepare test set for Hit Rate @10
def hitrate_at_k(true_data: pd.DataFrame, predicted_data: pd.DataFrame, k: int = 10) -> float:
    """
    This function calculates the hitrate at k for the recommendations.
    It assesses how relevant our k product recommendations are.
    In other words, it calculates the proportion of recommended products that are actually purchased by the customer.

    Args:
        true_data: a pandas DataFrame containing the true data
            customer_id: the customer identifier
            product_id: the product identifier that was purchased in the test set
        predicted_data: a pandas DataFrame containing the predicted data
            customer_id: the customer identifier
            product_id: the product identifier that was recommended
            rank: the rank of the recommendation. the rank should be between 1 and k.
        k: the number of recommendations to consider. k should be between 1 and 10.

    Returns:
        The hitrate at k
    """
    data = pd.merge(left=true_data, right=predicted_data, how="left", on=["customer_id", "product_id"])
    df = data[data["rank"] <= k]
    non_null_counts = df.groupby('customer_id')['rank'].apply(lambda x: x.notna().sum()).reset_index(name='non_null_count')
    distinct_products_per_customer = data.groupby('customer_id')['product_id'].nunique().reset_index(name='distinct_product_count')
    df = pd.merge(left=distinct_products_per_customer, right=non_null_counts, how="left", on="customer_id")
    df["denominator"] = [min(df.iloc[i].distinct_product_count, k) for i in range(len(df))]
    df = df.fillna(0)
    return (df["non_null_count"] / df["denominator"]).mean()


In [22]:
# prompt: change products_data to dataframe with colum 'product_id'

products_data2 = pd.DataFrame(products_data, columns=['product_id'])
products_data2

Unnamed: 0,product_id
0,Product_33508
1,Product_15347
2,Product_80604
3,Product_81740
4,Product_82516
...,...
82961,Product_18949
82962,Product_66524
82963,Product_66126
82964,Product_31161


In [23]:
import random

def generate_recommendations_subgroup(customer_purchases):

  recommended_product_ids = []

  for product_id in customer_purchases['product_id']:
    recommendations = recommend_from_loaded_file(product_id)
    recommended_product_ids.extend(recommendations)
    #recommended_product_ids.extend(random.choice(products_data2['product_id']))

  recommended_df = pd.DataFrame({'product_id': recommended_product_ids})

  customer_purchases = pd.concat([customer_purchases, recommended_df], ignore_index=True)
  customer_purchases
  customer_purchases = customer_purchases.drop_duplicates(subset='product_id', keep='first')
  return customer_purchases

In [37]:
customer_product_data

array([['100000', '10182', 2.0, ..., 0.6008230452674896,
        0.018602072310405657, 517],
       ['100000', '10317', 0.0, ..., 1.0, 0.030000000000000027, 13],
       ['100000', '10505', 2.0, ..., 0.7283950617283951,
        0.022429232804232823, 484],
       ...,
       ['99999', '9975', 0.0, ..., 1.0, 0.030000000000000027, 2544],
       ['99999', '998', 0.0, ..., 1.0, 0.030000000000000027, 2545],
       ['99999', '9997', 0.0, ..., 1.0, 0.030000000000000027, 2546]],
      dtype=object)

In [30]:
# prompt: remove from all in  customer_ids_new 'Household'

customer_ids_new = [customer_id.replace('Household_', '') for customer_id in customer_ids_new]

In [40]:
train_dataframes = []

train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_all_9.csv'))
train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_all_10.csv'))

train_data = pd.concat(train_dataframes, ignore_index=True)

In [41]:
del X_train, X_val, y_train, y_val

In [42]:
train_data.tail()

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,relevance
34337719,Household_99999,Product_41346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337720,Household_99999,Product_65052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337721,Household_99999,Product_63126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337722,Household_99999,Product_30377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337723,Household_99999,Product_7822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [43]:

# Define customer IDs

# Prepare predicted data for hit rate calculation
predicted_data = []
for customer_id in tqdm(customer_ids_new, desc='Processing customers'):
    #customer_purchases = test_data[test_data['customer_id'] == customer_id]
    #customer_purchases = filter_top_30_products(train_data[train_data['customer_id'] == customer_id], train_data)
    #customer_product_data[0]
    customer_purchases = customer_product_data[customer_product_data['customer_id'] == customer_id]
    customer_purchases = customer_purchases[customer_purchases['rank'] <= 30]
    customer_purchases = customer_purchases.drop(columns=['customer_id','quantity','recency','quantity_score','recency_score','final_score','rank'])
    #customer_purchases = generate_recommendations_subgroup(customer_purchases)
    #print(customer_purchases)
    #customer_filtered = train_data[train_data['customer_id'] == customer_id]

    customer_features = train_data[features].iloc[0].values
    customer_features = scaler_original.transform([customer_features.astype(float)])
    scores = []
    for _, row in customer_purchases.iterrows():
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            #print(score)
            scores.append((row['product_id'], score))
    top_10 = []
    seen_products = set()
    for product_id, score in sorted(scores, key=lambda x: x[1], reverse=True):
        if product_id not in seen_products:
            top_10.append((product_id, len(top_10) + 1))
            seen_products.add(product_id)
        if len(top_10) == 10:
            break
    predicted_data.extend([(customer_id, product_id, rank) for product_id, rank in top_10])

predicted_df = pd.DataFrame(predicted_data, columns=['customer_id', 'product_id', 'rank'])

# Calculate Hit Rate @10 using new function
hit_rate_at_10 = hitrate_at_k(test_data[['customer_id', 'product_id']], predicted_df, k=10)
print('Hit Rate @10:', hit_rate_at_10)

Processing customers:   0%|          | 0/20001 [00:00<?, ?it/s]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Create submission file from existing predictions
submission_df = predicted_df[predicted_df['customer_id'].isin([f"Household_{i}" for i in range(1, 100001)])]
prediction = submission_df



In [None]:
household_16874_df = test_data[test_data['customer_id'] == 'Household_60015']
household_16874_df3 = submission_df[submission_df['customer_id'] == 'Household_60015']
common_products = set(household_16874_df['product_id']).intersection(set(household_16874_df3['product_id']))
print(f"Number of common products: {len(common_products)}")
similarity_ratio = len(common_products) / len(household_16874_df['product_id']) if len(household_16874_df['product_id']) > 0 else 0
print(f"Similarity Ratio: {similarity_ratio}")

if similarity_ratio > 0.5:  # Adjust threshold as needed
    print("The product IDs in the two dataframes are relatively similar.")
else:
    print("The product IDs in the two dataframes are not very similar.")

In [None]:
prediction

In [None]:
submission_df

In [None]:
# Process and format prediction
def process_and_format_prediction(df):
    # Replace invalid characters in column names
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Clean columns 'customer_id', 'product_id', and 'transaction_id'
    if 'customer_id' in df.columns and df['customer_id'].dtype == 'object':
        df['customer_id'] = df['customer_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'product_id' in df.columns and df['product_id'].dtype == 'object':
        df['product_id'] = df['product_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    df['id'] = df.index
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Group by 'customer_id' and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'id': 'first',
        'product_id': lambda x: ','.join(map(str, x)),
        'rank': lambda x: ','.join(map(str, x))
    }).reset_index()

    # Drop 'id' column if it exists
    if 'id' in prediction_grouped.columns:
        prediction_grouped = prediction_grouped.drop(columns=['id'])

    # Filter data
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


In [None]:
# Create a .csv file to submit on kaggle
# A lancer en local sur votre ordinateur
prediction_grouped.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/submission_list_bert.csv', index=False) ####

In [None]:
predicted_df.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/predicted_df.csv', index=False) ####

In [None]:
# Prepare test set for Hit Rate @10
#customer_ids = test_data['customer_id'].unique()

# Prepare test set for Hit Rate @10

hit_count = 0
customer_total_hits = 0
for customer_id in tqdm(customer_ids, desc='Processing customers'):
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    if not customer_filtered.empty:
        customer_features = customer_filtered[features].iloc[0].values
    else:
        continue
    customer_features = scaler_original.transform([customer_features.astype(float)])
    scores = []
    for _, row in tqdm(customer_purchases.iterrows(), total=len(customer_purchases), desc='Processing products', leave=False):
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            scores.append((row['product_id'], score))
    top_10 = list(dict(sorted(scores, key=lambda x: x[1], reverse=True)).items())[:10]
    purchased_products = customer_purchases['product_id'].values
    hit_count += len(set(purchased_products) & set([product_id for product_id, _ in top_10]))
    customer_total_hits += 1 if len(set(purchased_products) & set([product_id for product_id, _ in top_10])) > 0 else 0

hit_rate_at_10 = customer_total_hits / len(customer_ids)
print('Hit Rate @10:', hit_rate_at_10)



In [None]:
# Create submission file for
submission_data = []
for customer_id in customer_ids:
    scores = []
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    for _, row in customer_purchases.iterrows():
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            scores.append((row['product_id'], score))
    top_10 = sorted(scores, key=lambda x: x[1], reverse=True)[:10]
    submission_data.extend([(customer_id, product_id, rank + 1) for rank, (product_id, _) in enumerate(top_10)])

# Convert to DataFrame and filter
submission_df = pd.DataFrame(submission_data, columns=['customer_id', 'product_id', 'rank'])
prediction = submission_df[submission_df.customer_id.isin([f"Household_{i}" for i in range(1, 100001)])]

# Process and format prediction
def process_and_format_prediction(df):
    # Replace invalid characters in column names
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Clean columns 'customer_id', 'product_id', and 'transaction_id'
    if 'customer_id' in df.columns and df['customer_id'].dtype == 'object':
        df['customer_id'] = df['customer_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'product_id' in df.columns and df['product_id'].dtype == 'object':
        df['product_id'] = df['product_id'].str.extract('(\d+)').fillna(11).astype(int)
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    df['id'] = df.index
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Group by 'customer_id' and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'id': 'first',
        'product_id': lambda x: ','.join(map(str, x)),
        'rank': lambda x: ','.join(map(str, x))
    }).reset_index()

    # Drop 'id' column if it exists
    if 'id' in prediction_grouped.columns:
        prediction_grouped = prediction_grouped.drop(columns=['id'])

    # Filter data
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Verify ranks and duplicates
    for index, row in prediction_grouped.iterrows():
        ranks = list(map(int, row['rank'].split(',')))
        if sorted(ranks) != list(range(1, 11)):
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = row['product_id'].split(',')
        if len(products) != len(set(products)):
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


In [None]:
prediction_grouped.to_csv('submission/submission_list_bert_ftdinaldiaadsads.csv', index=False) ####