## IMPORTS

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from google.colab import drive
drive.mount('/content/gdrive')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Import necessary libraries

import torch

# Load pre-trained BERT model and tokenizer
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

import os
for dirname, _, filenames in os.walk(f'{trainFolder}'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Mounted at /content/gdrive
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/test_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_1.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_10.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_2.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_3.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_4.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_5.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_6.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_7.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_8.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_9.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_embe.csv
gdrive/My Drive/MLDM - Car

In [2]:
# Import libraries
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

### Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product name
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### Load data

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [3]:
# This code reads the data from CSV files named "train_data_part_i.csv" for all i from 1 to 10
# and concatenates them into a single pandas DataFrame
train_dataframes = []
for i in tqdm(range(1, 11)): ## 1 - 11
    train_dataframes.append(pd.read_csv(f'{trainFolder}/train_data_part_{i}.csv'))
train_data = pd.concat(train_dataframes, ignore_index=True)

# free up memory by deleting the dataframes we no longer need
del train_dataframes

100%|██████████| 10/10 [03:25<00:00, 20.56s/it]


In [4]:
#train_data = train_data.sample(frac=0.1, random_state=1)  # frac=0.1 takes 10% of the data

In [5]:
# This code reads the data from a CSV file named "products_data.csv" into a pandas DataFrame
products_data = pd.read_csv(f'{trainFolder}/products_data.csv')

  products_data = pd.read_csv(f'{trainFolder}/products_data.csv')


In [6]:
# This code reads the data from a CSV file named "test_data.csv" into a pandas DataFrame
test_data = pd.read_csv(f'{trainFolder}/test_data.csv')

In [7]:
# Load product embeddings
product_embeddings_np = np.load(f'{trainFolder}/product_embeddings.npy')

# Load product IDs
product_ids = np.load(f'{trainFolder}/product_ids.npy', allow_pickle=True)  # Use allow_pickle=True if product_ids is an array of objects (e.g., strings)

# Create the product embedding dictionary
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

In [8]:
train_data

Unnamed: 0,date,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel
0,2023-11-15,Transaction_1730125,Household_39,Product_5362,0,Store_2,0,1.0,DRIVE,MOBILE_APP
1,2022-07-20,Transaction_1560535,Household_39,Product_67174,0,Store_2,0,2.0,DRIVE,WEBSITE
2,2022-07-20,Transaction_1560535,Household_39,Product_82254,0,Store_2,0,2.0,DRIVE,WEBSITE
3,2023-11-15,Transaction_1730125,Household_39,Product_3895,0,Store_2,0,1.0,DRIVE,MOBILE_APP
4,2022-07-20,Transaction_1560535,Household_39,Product_34014,0,Store_2,0,1.0,DRIVE,WEBSITE
...,...,...,...,...,...,...,...,...,...,...
87037457,2022-01-07,Transaction_1162379,Household_96742,Product_28756,0,Store_2542,0,1.0,DRIVE,MOBILE_APP
87037458,2023-08-17,Transaction_2306043,Household_96742,Product_62786,0,Store_2542,0,1.0,DRIVE,MOBILE_APP
87037459,2022-10-12,Transaction_1524531,Household_96742,Product_16362,0,Store_2542,0,1.0,DRIVE,MOBILE_APP
87037460,2023-03-10,Transaction_1972306,Household_96742,Product_78870,0,Store_2542,0,1.0,DRIVE,MOBILE_APP


In [9]:
# prompt: print from train_date rows with customer id Household_80001
import pandas as pd

# Create a list of customer IDs within the specified range
#customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]
#customer_ids_new = [f'Household_{i}' for i in range(10001, 20000)]
customer_ids_new = [f'Household_{i}' for i in range(20001, 30000)]

# prompt: keep in train_data and test_data the rows that have the cusomer_id in customer_ids

# Filter train_data
train_data = train_data[train_data['customer_id'].isin(customer_ids_new)]

# Filter test_data
test_data = test_data[test_data['customer_id'].isin(customer_ids_new)]

In [10]:
train_data

Unnamed: 0,date,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel
17514314,2023-02-09,Transaction_318310,Household_20188,Product_51237,0,Store_2,0,1.0,DRIVE,WEBSITE
17514315,2022-05-09,Transaction_238974,Household_20188,Product_32922,0,Store_2,0,1.0,DRIVE,WEBSITE
17514316,2023-02-09,Transaction_318310,Household_20188,Product_45500,0,Store_2,0,1.0,DRIVE,WEBSITE
17514317,2022-05-09,Transaction_238974,Household_20188,Product_16016,0,Store_2,0,1.0,DRIVE,WEBSITE
17514318,2023-03-25,Transaction_1477910,Household_20617,Product_64825,0,Store_2,0,1.0,DRIVE,MOBILE_APP
...,...,...,...,...,...,...,...,...,...,...
26166778,2022-05-24,Transaction_1485371,Household_29503,Product_74143,0,Store_2542,0,1.0,DRIVE,WEBSITE
26166779,2023-12-04,Transaction_2332026,Household_29503,Product_32349,0,Store_2542,0,1.0,DRIVE,WEBSITE
26166780,2023-03-09,Transaction_1339911,Household_23691,Product_3153,0,Store_2544,0,1.0,LEX,PRISE EN COMPTE
26166781,2023-12-27,Transaction_605731,Household_27440,Product_19524,0,Store_2545,0,1.0,CLCV,MOBILE_APP


In [11]:
test_data

Unnamed: 0,transaction_id,customer_id,product_id
25,Transaction_2024_10,Household_21933,Product_55825
26,Transaction_2024_10,Household_21933,Product_9723
27,Transaction_2024_10,Household_21933,Product_3914
28,Transaction_2024_10,Household_21933,Product_59618
29,Transaction_2024_10,Household_21933,Product_30800
...,...,...,...
966102,Transaction_2024_29356,Household_29999,Product_48044
966103,Transaction_2024_29356,Household_29999,Product_49717
966104,Transaction_2024_29356,Household_29999,Product_54686
966105,Transaction_2024_29356,Household_29999,Product_45025


In [12]:
train_data = train_data.drop(columns=['transaction_id','has_loyality_card', 'store_id','format','order_channel'])

## Training Data Preproccesing

In [13]:
train_data['date'] = pd.to_datetime(train_data['date'])

# Add recency attribute
latest_date = train_data['date'].max()  # Find the latest date in the dataset
train_data['recency'] = (latest_date - train_data['date']).dt.days  # Calculate days since last purchase

# Group by customer_id and product_id to calculate quantity and most recent purchase
customer_product_data = train_data.groupby(['customer_id', 'product_id']).agg({
    'quantity': 'sum',
    'recency': 'min'  # Minimum days since purchase (most recent)
}).reset_index()

# Normalize quantity and recency scores
customer_product_data['quantity_score'] = customer_product_data['quantity'] / customer_product_data['quantity'].max()
customer_product_data['recency_score'] = 1 - (customer_product_data['recency'] / customer_product_data['recency'].max())  # Recent = higher score

best_alpha = 0.97

# Calculate the final score with this alpha value
customer_product_data['final_score'] = (
    best_alpha * customer_product_data['quantity_score'] +
    (1 - best_alpha) * customer_product_data['recency_score']
)

# Rank products for each customer with unique ranks
customer_product_data['rank'] = customer_product_data.sort_values(
    by=['customer_id', 'final_score', 'quantity', 'product_id'],  # Add secondary sort keys
    ascending=[True, False, False, True]  # Ensure consistent order
).groupby('customer_id').cumcount() + 1  # Assign unique incremental ranks


# Sort the DataFrame by customer_id and rank
customer_product_data = customer_product_data.sort_values(by=['customer_id', 'rank'])

# Filter out products with a rank greater than 200 for each customer
#customer_product_data = customer_product_data[customer_product_data['rank'] <= 20]


In [14]:
customer_product_data

Unnamed: 0,customer_id,product_id,quantity,recency,quantity_score,recency_score,final_score,rank
167,Household_20001,Product_73736,17.0,93,0.004615,0.872428,0.030649,1
139,Household_20001,Product_64479,10.0,93,0.002714,0.872428,0.028806,2
192,Household_20001,Product_8528,8.0,93,0.002172,0.872428,0.028279,3
123,Household_20001,Product_59115,7.0,93,0.001900,0.872428,0.028016,4
8,Household_20001,Product_12215,6.0,93,0.001629,0.872428,0.027753,5
...,...,...,...,...,...,...,...,...
3328689,Household_29999,Product_44450,2.0,726,0.000543,0.004115,0.000650,809
3328801,Household_29999,Product_55513,2.0,726,0.000543,0.004115,0.000650,810
3328804,Household_29999,Product_55641,2.0,726,0.000543,0.004115,0.000650,811
3328889,Household_29999,Product_62656,2.0,726,0.000543,0.004115,0.000650,812


In [15]:


# Extract temporal features
train_data['month'] = train_data['date'].dt.month
train_data['day_of_week'] = train_data['date'].dt.dayofweek
train_data['is_weekend'] = train_data['day_of_week'].isin([5, 6]).astype(int)

# Calculate recency: days since the last purchase for each customer
train_data['days_since_last_purchase'] = train_data.groupby('customer_id')['date'].transform(
    lambda x: (x.max() - x).dt.days
)

# Drop the 'date' column if it's no longer needed
train_data = train_data.drop(columns=['date'], errors='ignore')

In [16]:
# Aggregate customer-specific features
customer_features = train_data.groupby(['customer_id']).agg(
    purchase_frequency=('product_id', 'count'),  # Total number of purchases
    avg_quantity=('quantity', 'mean'),          # Average quantity purchased
    promo_ratio=('is_promo', 'mean'),           # Proportion of purchases made with promotions
    unique_products=('product_id', 'nunique')   # Number of unique products purchased
).reset_index()

# Merge these features into train_data and test_data
train_data = train_data.merge(customer_features, on='customer_id', how='left')

In [17]:
train_data.tail()

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,day_of_week,is_weekend,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,unique_products
8651412,Household_29503,Product_74143,0,1.0,586,5,1,0,576,570,1.061404,0.021053,209
8651413,Household_29503,Product_32349,0,1.0,27,12,0,0,17,570,1.061404,0.021053,209
8651414,Household_23691,Product_3153,0,1.0,297,3,3,0,292,222,1.490991,0.130631,186
8651415,Household_27440,Product_19524,0,1.0,4,12,2,0,0,1037,1.591128,0.062681,414
8651416,Household_27440,Product_50995,0,3.0,102,9,2,0,98,1037,1.591128,0.062681,414


In [18]:
products_data.tail()

Unnamed: 0,product_id,product_description,department_key,class_key,subclass_key,sector,brand_key,shelf_level1,shelf_level2,shelf_level3,shelf_level4,bio,sugar_free,aspartame_free,gluten_free,halal,casher,eco_friendly,local_french,artificial_coloring_free,taste_enhancer_free,naturality,antibiotic_free,reduced_sugar,vegetarian,pesticide_free,grain_free,no_added_sugar,salt_reduced,nitrite_free,fed_without_ogm,no_added_salt,no_artificial_flavours,porc,vegan,frozen,fat_free,reduced_fats,fresh,alcool,lactose_free,phenylalanine_free,palm_oil_free,ecoscore,produits_du_monde,regional_product,national_brand,first_price_brand,carrefour_brand
82961,Product_18949,857699 BONNET POLAIRE NOIR TU,Department_66,Class_6620,SubClass_66204,TEXTILE,TEX,Mode et Bagagerie,Bagagerie et Maroquinerie,Accessoires de mode,"Chapeaux, Casquettes et Bonnets",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1
82962,Product_66524,857699 BONNET PO BLANC CAS2 TU,Department_66,Class_6620,SubClass_66204,TEXTILE,TEX,Mode et Bagagerie,Bagagerie et Maroquinerie,Accessoires de mode,"Chapeaux, Casquettes et Bonnets",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1
82963,Product_66126,876946 PULL DE NOEL PET ECRU S,Department_66,Class_6635,SubClass_66358,TEXTILE,TEX,Animalerie,Chiens,"Colliers, Laisses et Vêtements",Vêtements et Chaussures,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1
82964,Product_31161,CHARGE BUT 13K CRF,Department_75,Class_7520,SubClass_75201,ACTIVITES PERIPHERIQUES,CARREFOUR,Maison et Décoration,Bouteilles de gaz,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1
82965,Product_80183,CHARGE BUT 6K AZ,Department_75,Class_7520,SubClass_75203,ACTIVITES PERIPHERIQUES,ZZZZZZZZZZ,Maison et Décoration,Bouteilles de gaz,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1


# Identifyng Missing Products

In [19]:
# Convert product_id columns to sets for quick set operations
train_product_ids = set(train_data["product_id"].unique())
products_product_ids = set(products_data["product_id"].unique())

# Find products in products_data that are not in train_data
missing_from_train = products_product_ids - train_product_ids

print(f"Number of products not in train_data: {len(missing_from_train)}")

if len(missing_from_train) > 0:
    print("These product IDs are in products_data but not in train_data:")
    for pid in missing_from_train:
        print(pid)
else:
    print("All products in products_data appear in train_data.")


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
Product_64892
Product_48642
Product_30418
Product_9036
Product_60065
Product_63835
Product_35939
Product_11240
Product_52674
Product_76720
Product_12400
Product_55020
Product_40000
Product_1494
Product_10421
Product_19535
Product_5171
Product_563
Product_20941
Product_80469
Product_6479
Product_75754
Product_16272
Product_81435
Product_60641
Product_74381
Product_1594
Product_15617
Product_71502
Product_37095
Product_55497
Product_17556
Product_82159
Product_15746
Product_44176
Product_1707
Product_68255
Product_73949
Product_57172
Product_1995
Product_43175
Product_23137
Product_74186
Product_74148
Product_618
Product_54773
Product_16163
Product_40172
Product_50322
Product_47821
Product_35111
Product_55435
Product_24213
Product_17094
Product_33797
Product_78848
Product_14935
Product_38242
Product_5676
Product_20416
Product_24000
Product_51397
Product_27096
Product_54044
Product_15583
Product_14576
Product_27796

In [20]:
missing_products_info = products_data[products_data["product_id"].isin(missing_from_train)]

# Display the missing products dataframe
missing_products_info


Unnamed: 0,product_id,product_description,department_key,class_key,subclass_key,sector,brand_key,shelf_level1,shelf_level2,shelf_level3,shelf_level4,bio,sugar_free,aspartame_free,gluten_free,halal,casher,eco_friendly,local_french,artificial_coloring_free,taste_enhancer_free,naturality,antibiotic_free,reduced_sugar,vegetarian,pesticide_free,grain_free,no_added_sugar,salt_reduced,nitrite_free,fed_without_ogm,no_added_salt,no_artificial_flavours,porc,vegan,frozen,fat_free,reduced_fats,fresh,alcool,lactose_free,phenylalanine_free,palm_oil_free,ecoscore,produits_du_monde,regional_product,national_brand,first_price_brand,carrefour_brand
13,Product_51620,33CL LIMONADE LA BEAUCERONNE,Department_10,Class_1000,SubClass_10000,PGC,BEAUCERONN,Boissons,"Colas, Thés glacés, Sirops et Sodas","Limonades, Limes et Tonics",,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,1,0,0
18,Product_39850,75CL LORINA LIMONADE CITRON VT,Department_10,Class_1000,SubClass_10000,PGC,LORINA,Boissons,"Colas, Thés glacés, Sirops et Sodas","Limonades, Limes et Tonics",,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,1,0,0
19,Product_31931,75CL BEAUCERONNE CITRON GLACE,Department_10,Class_1000,SubClass_10000,PGC,BEAUCERONN,Boissons,"Colas, Thés glacés, Sirops et Sodas","Limonades, Limes et Tonics",,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,1,1,0,0
20,Product_69737,0.75 L LIMO GRENADINE 1888,Department_10,Class_1000,SubClass_10000,PGC,BEAUCERONN,Boissons,"Colas, Thés glacés, Sirops et Sodas","Limonades, Limes et Tonics",,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,1,1,0,0
21,Product_1433,BELVOIR CITRONNADE 750ML,Department_10,Class_1000,SubClass_10000,PGC,BELVOIR,Boissons,"Colas, Thés glacés, Sirops et Sodas","Limonades, Limes et Tonics",,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82959,Product_40278,926107 COUSSIN UNI COT ROSE TU,Department_65,Class_6573,SubClass_65730,TEXTILE,TEX HOME,Maison et Décoration,Mobilier et Décoration,Décorations et Parfums d'intérieur,Coussins et galettes de chaise,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1
82960,Product_57956,885558 CUSHION 4 BLANC CAS2 TU,Department_65,Class_6573,SubClass_65730,TEXTILE,TEX HOME,Maison et Décoration,Mobilier et Décoration,Décorations et Parfums d'intérieur,Coussins et galettes de chaise,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1
82961,Product_18949,857699 BONNET POLAIRE NOIR TU,Department_66,Class_6620,SubClass_66204,TEXTILE,TEX,Mode et Bagagerie,Bagagerie et Maroquinerie,Accessoires de mode,"Chapeaux, Casquettes et Bonnets",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1
82963,Product_66126,876946 PULL DE NOEL PET ECRU S,Department_66,Class_6635,SubClass_66358,TEXTILE,TEX,Animalerie,Chiens,"Colliers, Laisses et Vêtements",Vêtements et Chaussures,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,1


In [21]:
# Load product embeddings
product_embeddings_np = np.load(f'{trainFolder}/product_embeddings.npy')

# Load product IDs
product_ids = np.load(f'{trainFolder}/product_ids.npy', allow_pickle=True)  # Use allow_pickle=True if product_ids is an array of objects (e.g., strings)

# Create the product embedding dictionary
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

# Negative Sampling using Embeddings

In [22]:
train_data

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,day_of_week,is_weekend,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,unique_products
0,Household_20188,Product_51237,0,1.0,325,2,3,0,283,1062,1.146893,0.051789,535
1,Household_20188,Product_32922,0,1.0,601,5,0,0,559,1062,1.146893,0.051789,535
2,Household_20188,Product_45500,0,1.0,325,2,3,0,283,1062,1.146893,0.051789,535
3,Household_20188,Product_16016,0,1.0,601,5,0,0,559,1062,1.146893,0.051789,535
4,Household_20617,Product_64825,0,1.0,281,3,5,1,271,252,1.369048,0.119048,113
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8651412,Household_29503,Product_74143,0,1.0,586,5,1,0,576,570,1.061404,0.021053,209
8651413,Household_29503,Product_32349,0,1.0,27,12,0,0,17,570,1.061404,0.021053,209
8651414,Household_23691,Product_3153,0,1.0,297,3,3,0,292,222,1.490991,0.130631,186
8651415,Household_27440,Product_19524,0,1.0,4,12,2,0,0,1037,1.591128,0.062681,414


In [23]:
len(train_data)

8651417

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np
import pandas as pd

# Generate a list of all unique product IDs
all_product_ids = set(product_embedding_dict.keys())

# Prepare for storing negative samples
negative_samples = []

# Group by customer for efficient processing
customer_groups = train_data.groupby('customer_id')

for customer_id, group in tqdm(customer_groups, desc="Generating hard negative samples"):
    # Purchased products for the current customer
    purchased_products = set(group['product_id'])

    # Non-purchased products for the current customer
    non_purchased_products = list(all_product_ids - purchased_products)

    # Skip if there are no products to compare
    if len(non_purchased_products) == 0 or len(purchased_products) == 0:
        continue

    # Get embeddings for purchased and non-purchased products
    purchased_embeddings = np.array([product_embedding_dict[pid] for pid in purchased_products])
    non_purchased_embeddings = np.array([product_embedding_dict[pid] for pid in non_purchased_products])

    # Reshape to ensure 2D arrays
    purchased_embeddings = purchased_embeddings.reshape(-1, purchased_embeddings.shape[-1])
    non_purchased_embeddings = non_purchased_embeddings.reshape(-1, non_purchased_embeddings.shape[-1])

    # Compute cosine similarity between non-purchased and purchased products
    similarity_scores = cosine_similarity(non_purchased_embeddings, purchased_embeddings)

    # Identify the most similar non-purchased products (hard.  negatives)
    most_similar_indices = np.argsort(-similarity_scores, axis=0)[:len(group)]
    #hard_negatives = {non_purchased_products[i] for i in most_similar_indices.flatten()}[:len(group)]
    # Flatten and select up to len(group) unique hard negatives
    hard_negatives = list({non_purchased_products[i] for i in most_similar_indices.flatten()})[:len(group)]


    # Append hard negatives as negative samples
    negative_samples.extend([{
        'customer_id': customer_id,
        'product_id': product_id,
        'quantity': 0  # Indicating negative sample
    } for product_id in hard_negatives])

# Convert negative samples to a DataFrame
negative_samples_df = pd.DataFrame(negative_samples)

# Append negative samples to the original training data
train_data = pd.concat([train_data, negative_samples_df], ignore_index=True)

# Update relevance column (1 for purchased, 0 for negatives)
train_data['relevance'] = (train_data['quantity'] > 0).astype(int)


Generating hard negative samples: 100%|██████████| 9999/9999 [11:53:27<00:00,  4.28s/it]


In [25]:
train_data.tail()

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,day_of_week,is_weekend,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,unique_products,relevance
17302829,Household_29999,Product_34918,,0.0,,,,,,,,,,0
17302830,Household_29999,Product_51138,,0.0,,,,,,,,,,0
17302831,Household_29999,Product_46601,,0.0,,,,,,,,,,0
17302832,Household_29999,Product_42471,,0.0,,,,,,,,,,0
17302833,Household_29999,Product_29559,,0.0,,,,,,,,,,0


In [26]:
# Check for missing values
print("Missing values in train_data:")
print(train_data.isnull().sum())

print("Missing values in test_data:")
print(test_data.isnull().sum())

# Fill remaining missing values with zeros as a safety measure
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

Missing values in train_data:
customer_id                       0
product_id                        0
is_promo                    8651417
quantity                          0
recency                     8651417
month                       8651417
day_of_week                 8651417
is_weekend                  8651417
days_since_last_purchase    8651417
purchase_frequency          8651417
avg_quantity                8651417
promo_ratio                 8651417
unique_products             8651417
relevance                         0
dtype: int64
Missing values in test_data:
transaction_id    0
customer_id       0
product_id        0
dtype: int64


In [27]:
train_data = train_data.drop(columns=['day_of_week','is_weekend','unique_products'])

In [None]:
# prompt: in train_data for household Household_9247 count how manyrelevance 1 and how many are 0.

# Assuming 'train_data' DataFrame is already loaded as in the provided code.

household_9247_data = train_data[train_data['customer_id'] == 'Household_76806']

relevance_counts = household_9247_data['relevance'].value_counts()

print(f"Relevance counts for Household_9247:")
relevance_counts

Relevance counts for Household_9247:


Unnamed: 0_level_0,count
relevance,Unnamed: 1_level_1


In [None]:

train_data.to_csv(os.path.join(trainFolder, 'train_data_all_3.csv'), index=False)

In [None]:
len(train_data)

17302834

In [None]:
train_data

Unnamed: 0,customer_id,product_id,is_promo,quantity,recency,month,days_since_last_purchase,purchase_frequency,avg_quantity,promo_ratio,relevance
0,Household_20188,Product_51237,0.0,1.0,325.0,2.0,283.0,1062.0,1.146893,0.051789,1
1,Household_20188,Product_32922,0.0,1.0,601.0,5.0,559.0,1062.0,1.146893,0.051789,1
2,Household_20188,Product_45500,0.0,1.0,325.0,2.0,283.0,1062.0,1.146893,0.051789,1
3,Household_20188,Product_16016,0.0,1.0,601.0,5.0,559.0,1062.0,1.146893,0.051789,1
4,Household_20617,Product_64825,0.0,1.0,281.0,3.0,271.0,252.0,1.369048,0.119048,1
...,...,...,...,...,...,...,...,...,...,...,...
17302829,Household_29999,Product_34918,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
17302830,Household_29999,Product_51138,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
17302831,Household_29999,Product_46601,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0
17302832,Household_29999,Product_42471,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0


In [None]:
products_data