## IMPORTS

In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from google.colab import drive
drive.mount('/content/gdrive')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Import necessary libraries

import torch

# Load pre-trained BERT model and tokenizer
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

import os
for dirname, _, filenames in os.walk(f'{trainFolder}'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/test_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_1.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_10.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_2.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_3.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_4.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_5.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_6.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_7.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_8.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_par

In [31]:
# Import libraries
from tqdm import tqdm
import matplotlib.pyplot as plt
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

### Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id***: Product identifier (e.g. `Product_8995`)
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### Load data

* Load the training data (provided as several CSV part files), *products_data.csv* and *test_data.csv* using pandas.

In [32]:
# Read the selected training files ("train_data_all_<part>.csv") from trainFolder
# and concatenate them into a single pandas DataFrame with a fresh 0..n-1 index.
# Edit this list to change which parts are loaded (e.g. add 5).
train_parts = [9, 10]

# The temporary list of per-file frames is not bound to a name, so it can be
# released as soon as the concatenation is done.
train_data = pd.concat(
    [pd.read_csv(f'{trainFolder}/train_data_all_{part}.csv') for part in train_parts],
    ignore_index=True,
)

In [33]:
#train_data = train_data.sample(frac=0.0001, random_state=1)  # frac=0.1 takes 10% of the data

In [34]:
# Load the product catalogue ("products_data.csv") into a pandas DataFrame.
# NOTE(review): the recorded output of this cell echoes the read_csv line, which
# looks like the tail of a pandas warning (possibly mixed column dtypes) — confirm.
products_csv_path = f'{trainFolder}/products_data.csv'
products_data = pd.read_csv(products_csv_path)

  products_data = pd.read_csv(f'{trainFolder}/products_data.csv')


In [35]:
# Load the evaluation purchases ("test_data.csv") into a pandas DataFrame.
test_csv_path = f'{trainFolder}/test_data.csv'
test_data = pd.read_csv(test_csv_path)

In [36]:
# Locations of the precomputed embedding matrix and of the matching product IDs.
embeddings_path = f'{trainFolder}/product_embeddings.npy'
product_ids_path = f'{trainFolder}/product_ids.npy'

# Embedding matrix.
product_embeddings_np = np.load(embeddings_path)

# Product IDs. allow_pickle=True is required when the array holds Python objects
# (e.g. strings); unpickling can execute code, so only load files you trust.
product_ids = np.load(product_ids_path, allow_pickle=True)

# Pair each product ID with the embedding found at the same position.
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

In [37]:
# Display the concatenated training DataFrame (the notebook shows a truncated view).
train_data

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,relevance
0,Household_80212,Product_8995,0.0,1.0,525.0,7.0,377.0,471.0,1.350318,0.038217,1
1,Household_80212,Product_70212,0.0,1.0,309.0,2.0,161.0,471.0,1.350318,0.038217,1
2,Household_80212,Product_13766,0.0,1.0,148.0,8.0,0.0,471.0,1.350318,0.038217,1
3,Household_80212,Product_57942,0.0,2.0,504.0,8.0,356.0,471.0,1.350318,0.038217,1
4,Household_80212,Product_30641,0.0,3.0,525.0,7.0,377.0,471.0,1.350318,0.038217,1
...,...,...,...,...,...,...,...,...,...,...,...
34337719,Household_99999,Product_41346,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
34337720,Household_99999,Product_65052,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
34337721,Household_99999,Product_63126,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
34337722,Household_99999,Product_30377,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0


In [38]:
# Strip the textual prefixes from the ID columns of train_data so that only the
# numeric part is kept (e.g. 'Household_80212' -> '80212', 'Product_8995' -> '8995').
# The resulting values are still strings. test_data is not touched by this cell.
import pandas as pd

# Optional customer subsets (currently disabled):
#customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]
#customer_ids_new = [f'Household_{i}' for i in range(10001, 20000)]
#customer_ids_new = [f'Household_{i}' for i in range(1, 100)]

# regex=False makes the prefix a literal substring on every pandas version
# (the default of `regex` differs between pandas releases).
# Remove "Household_" prefix from 'customer_id' column
train_data['customer_id'] = train_data['customer_id'].str.replace('Household_', '', regex=False)

# Remove "Product_" prefix from 'product_id' column
train_data['product_id'] = train_data['product_id'].str.replace('Product_', '', regex=False)

In [39]:
# prompt: keep in train_data and test_data the rows that have the customer_id in customer_ids

# Filter train_data
#train_data = train_data[train_data['customer_id'].isin(customer_ids_new)]

# Filter test_data
#test_data = test_data[test_data['customer_id'].isin(customer_ids_new)]

In [40]:
# Display train_data again to check the customer_id / product_id columns.
train_data

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,relevance
0,80212,8995,0.0,1.0,525.0,7.0,377.0,471.0,1.350318,0.038217,1
1,80212,70212,0.0,1.0,309.0,2.0,161.0,471.0,1.350318,0.038217,1
2,80212,13766,0.0,1.0,148.0,8.0,0.0,471.0,1.350318,0.038217,1
3,80212,57942,0.0,2.0,504.0,8.0,356.0,471.0,1.350318,0.038217,1
4,80212,30641,0.0,3.0,525.0,7.0,377.0,471.0,1.350318,0.038217,1
...,...,...,...,...,...,...,...,...,...,...,...
34337719,99999,41346,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
34337720,99999,65052,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
34337721,99999,63126,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
34337722,99999,30377,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0


In [41]:
# Display the test DataFrame (the notebook shows a truncated view).
test_data

Unnamed: 0,transaction_id,customer_id,product_id
0,Transaction_2024_1,Household_16874,Product_9790
1,Transaction_2024_1,Household_16874,Product_68295
2,Transaction_2024_1,Household_16874,Product_19494
3,Transaction_2024_1,Household_16874,Product_11109
4,Transaction_2024_4,Household_9247,Product_57151
...,...,...,...
1220701,Transaction_2024_19277,Household_79999,Product_8951
1220702,Transaction_2024_19277,Household_79999,Product_9249
1220703,Transaction_2024_19277,Household_79999,Product_21485
1220704,Transaction_2024_81175,Household_80000,Product_74965


In [42]:
# Re-store every embedding as a float32 NumPy array; the dictionary keys stay as they are.
for product_key, embedding in product_embedding_dict.items():
    product_embedding_dict[product_key] = np.array(embedding, dtype=np.float32)

## Training Data Preprocessing

In [43]:
# Show the last rows of train_data.
train_data.tail()

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,relevance
34337719,99999,41346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337720,99999,65052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337721,99999,63126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337722,99999,30377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
34337723,99999,7822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [44]:
# Count, for one household, how many train_data rows have relevance 1 and how many have relevance 0.
#
# The 'Household_' prefix is stripped from train_data['customer_id'] earlier in
# this notebook, so the filter must compare against the bare number; comparing
# against 'Household_<n>' matches no rows and yields an empty result.
# NOTE(review): the original prompt mentioned household 9247 while the filter
# used 76806; the filter's value is kept here. The displayed head/tail of
# train_data show households 80212 and 99999, so this household may not be part
# of the loaded files — confirm which household is intended.
household_number = '76806'

# The variable keeps its historical name so any later cell that uses it still works.
household_9247_data = train_data[train_data['customer_id'] == household_number]

relevance_counts = household_9247_data['relevance'].value_counts()

# The label is built from the same value as the filter so the two cannot drift apart.
print(f"Relevance counts for Household_{household_number}:")
relevance_counts

Relevance counts for Household_9247:


Unnamed: 0_level_0,count
relevance,Unnamed: 1_level_1


In [45]:
# prompt: print from train_date rows with customer id Household_80001
import pandas as pd

# Create a list of customer IDs within the specified range
#customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]
#customer_ids_new = [f'Household_{i}' for i in range(10001, 20000)]
#customer_ids_new = [f'Household_{i}' for i in range(1, 1500)]

# prompt: keep in train_data and test_data the rows that have the customer_id in customer_ids

# Filter train_data
#train_data = train_data[train_data['customer_id'].isin(customer_ids_new)]

# Filter test_data
#test_data = test_data[test_data['customer_id'].isin(customer_ids_new)]

In [46]:
# One row per (customer, product) pair: summed quantity and the smallest recency
# value (per the original author's note, the minimum is the most recent purchase).
pair_groups = train_data.groupby(['customer_id', 'product_id'])
customer_product_data = pair_groups.agg({'quantity': 'sum', 'recency': 'min'}).reset_index()

# Scale both signals by their global maximum; the recency ratio is flipped so
# that a smaller recency value gives a higher score.
max_quantity = customer_product_data['quantity'].max()
max_recency = customer_product_data['recency'].max()
customer_product_data['quantity_score'] = customer_product_data['quantity'] / max_quantity
customer_product_data['recency_score'] = 1 - (customer_product_data['recency'] / max_recency)

# Weight given to the quantity score; the recency score receives the remainder.
best_alpha = 0.97

# Blend the two scores into the final score.
quantity_part = best_alpha * customer_product_data['quantity_score']
recency_part = (1 - best_alpha) * customer_product_data['recency_score']
customer_product_data['final_score'] = quantity_part + recency_part

# Order the pairs inside each customer by descending final score, breaking ties
# by descending quantity and then ascending product_id, so every rank is unique.
ranking_order = customer_product_data.sort_values(
    by=['customer_id', 'final_score', 'quantity', 'product_id'],
    ascending=[True, False, False, True],
)

# cumcount numbers the rows of each customer from 0 in that order; the result is
# aligned back to customer_product_data through the row index.
customer_product_data['rank'] = ranking_order.groupby('customer_id').cumcount() + 1



## MODELING

In [47]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [48]:
# Columns of train_data used as model inputs ('quantity' is deliberately left out).
features = [
    'customer_id',
    'product_id',
    'is_promo',
    'recency',
    'purchase_frequency',
    'avg_quantity',
    'promo_ratio',
    'month',
]


In [49]:
# Rebuild the embedding dictionary with the 'Product_' text removed from every key,
# then point product_embedding_dict at the rebuilt mapping.
new_product_embedding_dict = {
    product_key.replace('Product_', ''): embedding
    for product_key, embedding in product_embedding_dict.items()
}

product_embedding_dict = new_product_embedding_dict

In [53]:

# Prepare full training data with embeddings using batch processing to avoid memory overload
def batch_process_train_data(train_data, product_embeddings, features, batch_size=1000):
    """Yield batches of combined features from train_data and product embeddings.

    Each batch is built with array operations on the whole slice instead of a
    Python-level loop over DataFrame rows, which is far faster on large inputs.

    Args:
        train_data: DataFrame holding a 'product_id' column and every column
            named in `features`.
        product_embeddings: Mapping from product_id to an embedding array.
        features: Column names whose values are converted to float and placed
            in front of the embedding.
        batch_size: Number of consecutive rows of `train_data` handled per batch.

    Yields:
        A float32 array per batch with one row per input row whose product_id
        is present in `product_embeddings`; each row is the feature values
        followed by the flattened embedding. Rows without an embedding are
        skipped. A batch in which no row has an embedding yields an empty
        float32 array of shape (0,).
    """
    num_rows = len(train_data)
    for start_idx in range(0, num_rows, batch_size):
        batch_rows = train_data.iloc[start_idx:start_idx + batch_size]

        # One dictionary lookup per row; None marks a product without an embedding.
        looked_up = [product_embeddings.get(product_id) for product_id in batch_rows['product_id']]
        found = [embedding for embedding in looked_up if embedding is not None]

        if not found:
            # Same empty result as stacking an empty list of rows.
            yield np.array([], dtype=np.float32)
            continue

        # Positional mask, so it works regardless of the DataFrame's index labels.
        has_embedding = np.array([embedding is not None for embedding in looked_up], dtype=bool)

        # Feature values of the kept rows, converted to float in one call.
        feature_block = batch_rows[features].values[has_embedding].astype(float)

        # One flattened embedding per kept row, in the same row order.
        embedding_block = np.array(
            [np.asarray(embedding).flatten() for embedding in found],
            dtype=np.float32,
        )

        yield np.hstack([feature_block, embedding_block]).astype(np.float32)

# Drain the batch generator, appending every combined feature row to one Python list.
train_data_with_embeddings = []
batch_iterator = batch_process_train_data(train_data, product_embedding_dict, features)
for batch_combined_features in tqdm(batch_iterator, desc='Processing training data in batches'):
    train_data_with_embeddings.extend(batch_combined_features)

Processing training data in batches: 34338it [3:48:08,  2.51it/s]


In [54]:
# Number of combined feature rows collected from all batches.
len(train_data_with_embeddings)

34337724

In [55]:
# Show the first rows of train_data.
train_data.head()

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,relevance
0,80212,8995,0.0,1.0,525.0,7.0,377.0,471.0,1.350318,0.038217,1
1,80212,70212,0.0,1.0,309.0,2.0,161.0,471.0,1.350318,0.038217,1
2,80212,13766,0.0,1.0,148.0,8.0,0.0,471.0,1.350318,0.038217,1
3,80212,57942,0.0,2.0,504.0,8.0,356.0,471.0,1.350318,0.038217,1
4,80212,30641,0.0,3.0,525.0,7.0,377.0,471.0,1.350318,0.038217,1


In [56]:
# Data folder on the mounted Drive. This repeats the value assigned in the setup
# cell — presumably so the saving cells can be run on their own; confirm.
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

In [57]:
import numpy as np

# Turn the list of per-row vectors into a single NumPy array.
train_data_with_embeddings_array = np.array(train_data_with_embeddings)

# Label/value pairs describing the array, printed one per line.
array_report = (
    ("Array type:", type(train_data_with_embeddings_array)),
    ("Array shape:", train_data_with_embeddings_array.shape),
    ("Array size:", train_data_with_embeddings_array.size),
    ("Array dtype:", train_data_with_embeddings_array.dtype),
    ("Array memory size (GB):", train_data_with_embeddings_array.nbytes / (1024**3)),
)
for label, value in array_report:
    print(label, value)


Array type: <class 'numpy.ndarray'>
Array shape: (34337724, 776)
Array size: 26646073824
Array dtype: float32
Array memory size (GB): 99.26436030864716


In [58]:
# Save the combined feature matrix into the data folder.
# The array is handed to np.save as-is: wrapping it in np.array(...) again would
# first build a second full in-memory copy of an already very large array.
np.save(os.path.join(trainFolder, 'train_data_with_embeddings_final_52.npy'), train_data_with_embeddings_array)

In [59]:


# Destination of the saved labels inside the data folder.
file_path = os.path.join(trainFolder, 'relevance_array_5.npy')

# Take the 'relevance' column of train_data as a NumPy array ...
relevance_array = train_data['relevance'].values

# ... and write it to disk.
np.save(file_path, relevance_array)

In [60]:
# Write customer_product_data into the data folder and read it straight back.
# NOTE(review): customer_product_data is produced by a groupby/agg in this
# notebook, i.e. it is a DataFrame rather than a dictionary. np.save stores it as
# a plain array, so the loaded object is presumably an ndarray without the column
# names — confirm before relying on it downstream.

import os
import numpy as np

customer_product_path = os.path.join(trainFolder, 'customer_product_data_5.npy')

# Save
np.save(customer_product_path, customer_product_data)

# Load (allow_pickle=True permits object arrays; only use it on trusted files)
loaded_customer_product_data = np.load(customer_product_path, allow_pickle=True)

