## IMPORTS

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from google.colab import drive
drive.mount('/content/gdrive')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Import necessary libraries

import torch

# Folder on the mounted Google Drive that holds the project's data files
trainFolder = "gdrive/My Drive/MLDM - Carrefour Project/data-train"

import os
# Print the full path of every file found under the training-data folder
for root, _subdirs, files in os.walk(trainFolder):
    for name in files:
        print(os.path.join(root, name))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Mounted at /content/gdrive
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/test_data.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_1.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_10.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_2.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_3.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_4.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_5.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_6.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_7.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_8.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_9.csv
gdrive/My Drive/MLDM - Carrefour Project/data-train/products_embe.csv
gdrive/My Drive/MLDM - Car

In [2]:
# Import libraries
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

### Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
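  - clcv: *courses livrées chez vous* (groceries delivered to your home)
  - lex: *livraison express* (express delivery)
  - DRIVE: order online, then pick up by car at a Carrefour Drive point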
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product identifier (e.g. `Product_33508`)
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### Load data

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [3]:
# Read the two training extracts "train_data_all_9.csv" and "train_data_all_10.csv"
# and stack them into a single pandas DataFrame with a fresh 0..n-1 index
train_dataframes = [
    pd.read_csv(f'{trainFolder}/train_data_all_9.csv'),
    pd.read_csv(f'{trainFolder}/train_data_all_10.csv'),
]

train_data = pd.concat(train_dataframes, ignore_index=True)

In [4]:
# Collapse the transaction log to one row per (customer, product) pair:
# total quantity bought and the smallest `recency` value seen for that pair.
pair_groups = train_data.groupby(['customer_id', 'product_id'])
customer_product_data = pair_groups.agg(
    quantity=('quantity', 'sum'),
    recency=('recency', 'min'),  # Minimum days since purchase (most recent)
).reset_index()

# Scale both signals by their global maximum
max_quantity = customer_product_data['quantity'].max()
max_recency = customer_product_data['recency'].max()
customer_product_data['quantity_score'] = customer_product_data['quantity'] / max_quantity
# Inverted so that a more recent purchase gets a higher score
customer_product_data['recency_score'] = 1 - (customer_product_data['recency'] / max_recency)

# Weight of the quantity signal in the blended score; recency gets the remainder
best_alpha = 0.97

weighted_quantity = best_alpha * customer_product_data['quantity_score']
weighted_recency = (1 - best_alpha) * customer_product_data['recency_score']
customer_product_data['final_score'] = weighted_quantity + weighted_recency

# Unique 1-based rank per customer: best final_score first, ties broken by
# larger quantity and then by product_id so the order is deterministic.
ordered_pairs = customer_product_data.sort_values(
    by=['customer_id', 'final_score', 'quantity', 'product_id'],
    ascending=[True, False, False, True],
)
# cumcount() keeps the original row index, so the assignment aligns back correctly
customer_product_data['rank'] = ordered_pairs.groupby('customer_id').cumcount() + 1


In [5]:
# Read the product catalogue "products_data.csv" into a pandas DataFrame.
# NOTE(review): the recorded output echoes this line, which looks like a pandas
# warning whose header was stripped (possibly a mixed-dtype warning) -- confirm and
# pass explicit dtypes if so.
products_data = pd.read_csv(f'{trainFolder}/products_data.csv')

  products_data = pd.read_csv(f'{trainFolder}/products_data.csv')


In [6]:
# This code reads the data from a CSV file named "test_data.csv" into a pandas DataFrame
test_data = pd.read_csv(f'{trainFolder}/test_data.csv')

In [7]:
# Load product embeddings
product_embeddings_np = np.load(f'{trainFolder}/product_embeddings.npy')

# Load product IDs
# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects -- only load
# files that were produced by this project.
product_ids = np.load(f'{trainFolder}/product_ids.npy', allow_pickle=True)

# zip() silently stops at the shorter input, so a length mismatch would quietly
# drop products (or pair ids with the wrong vectors); fail loudly instead.
if len(product_ids) != len(product_embeddings_np):
    raise ValueError(
        f"product_ids has {len(product_ids)} entries but product_embeddings_np "
        f"has {len(product_embeddings_np)}; the two files do not match"
    )

# Create the product embedding dictionary: product id -> embedding row
product_embedding_dict = dict(zip(product_ids, product_embeddings_np))

In [8]:

# Create the list of customer IDs to generate predictions for.
# range(80001, 100002) has an exclusive upper bound, so this yields
# 'Household_80001' ... 'Household_100001'.
# Alternative ranges tried earlier:
#customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]
#customer_ids_new = [f'Household_{i}' for i in range(10001, 20000)]
customer_ids_new = [f'Household_{i}' for i in range(80001, 100002)]
#test_data = test_data[test_data['customer_id'].isin(customer_ids_new)]

In [9]:
# Re-cast every stored embedding to a float32 numpy array (values are replaced in
# place; the set of keys is untouched, so iterating while assigning is safe)
for product_key, embedding in product_embedding_dict.items():
    product_embedding_dict[product_key] = np.array(embedding, dtype=np.float32)

In [10]:
import pandas as pd
import numpy as np

# NOTE: this rebinds `customer_product_data` -- previously the DataFrame built by the
# groupby/scoring cell -- to the array stored in customer_product_data_5.npy.
# allow_pickle=True unpickles arbitrary objects: only load files this project wrote.
customer_product_data = np.load(os.path.join(trainFolder, 'customer_product_data_5.npy'), allow_pickle=True)



## Training Data Preprocessing

In [11]:
import pandas as pd
import numpy as np

#customer_product_data = np.load(os.path.join(trainFolder, 'customer_product_data_5.npy'), allow_pickle=True)



In [12]:
#customer_product_data

## MODELING

In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [14]:
# Columns of train_data used as model input features ('quantity' is deliberately left out).
# There are 8 columns here; the saved model's input layer reports 776 units in the
# recorded load error, presumably 8 features + a 768-dim product embedding -- TODO confirm.

features = ['customer_id','product_id','is_promo', 'recency', 'purchase_frequency', 'avg_quantity', 'promo_ratio','month']


# scaler_original is fitted on train_data[features] further down;
# scaler_full is only referenced from commented-out code in this notebook.
scaler_original = StandardScaler()
scaler_full = StandardScaler()

In [15]:
# NOTE(review): in the recorded run this call failed with a TypeError while
# deserializing BatchNormalization ("int() argument must be ... not 'list'", axis=[1]).
# That usually points to a checkpoint saved under a different Keras major version than
# the one installed -- confirm, then pin matching tensorflow/keras versions or re-save
# the model. Every later cell that uses `model` depends on this load succeeding.
model = tf.keras.models.load_model('gdrive/My Drive/MLDM - Carrefour Project/models/model_checkpoint_05.keras')

TypeError: <class 'keras.src.models.sequential.Sequential'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': 'keras', 'class_name': 'Sequential', 'config': {'name': 'sequential', 'layers': [{'module': 'keras.layers', 'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, 776], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'input_1'}, 'registered_name': None}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense', 'trainable': True, 'dtype': 'float32', 'units': 256, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 776]}}, {'module': 'keras.layers', 'class_name': 'Dropout', 'config': {'name': 'dropout', 'trainable': True, 'dtype': 'float32', 'rate': 0.4, 'noise_shape': None, 'seed': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 256]}}, {'module': 'keras.layers', 'class_name': 'BatchNormalization', 'config': {'name': 'batch_normalization', 'trainable': True, 'dtype': 'float32', 'axis': [1], 'momentum': 0.99, 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'gamma_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'moving_mean_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'moving_variance_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 
'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 256]}}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_1', 'trainable': True, 'dtype': 'float32', 'units': 128, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 256]}}, {'module': 'keras.layers', 'class_name': 'Dropout', 'config': {'name': 'dropout_1', 'trainable': True, 'dtype': 'float32', 'rate': 0.4, 'noise_shape': None, 'seed': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 128]}}, {'module': 'keras.layers', 'class_name': 'BatchNormalization', 'config': {'name': 'batch_normalization_1', 'trainable': True, 'dtype': 'float32', 'axis': [1], 'momentum': 0.99, 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'gamma_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'moving_mean_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'moving_variance_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}, 
'registered_name': None, 'build_config': {'input_shape': [None, 128]}}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_2', 'trainable': True, 'dtype': 'float32', 'units': 64, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': {'module': 'keras.regularizers', 'class_name': 'L2', 'config': {'l2': 0.009999999776482582}, 'registered_name': None}, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 128]}}, {'module': 'keras.layers', 'class_name': 'Dense', 'config': {'name': 'dense_3', 'trainable': True, 'dtype': 'float32', 'units': 1, 'activation': 'sigmoid', 'use_bias': True, 'kernel_initializer': {'module': 'keras.initializers', 'class_name': 'GlorotUniform', 'config': {'seed': None}, 'registered_name': None}, 'bias_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 64]}}]}, 'registered_name': None, 'build_config': {'input_shape': [None, 776]}, 'compile_config': {'optimizer': {'module': 'keras.optimizers', 'class_name': 'Adam', 'config': {'name': 'Adam', 'weight_decay': None, 'clipnorm': None, 'global_clipnorm': None, 'clipvalue': None, 'use_ema': False, 'ema_momentum': 0.99, 'ema_overwrite_frequency': None, 'jit_compile': False, 'is_legacy_optimizer': False, 'learning_rate': 0.0010000000474974513, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False}, 'registered_name': None}, 'loss': 
'binary_crossentropy', 'metrics': ['AUC'], 'loss_weights': None, 'weighted_metrics': None, 'run_eagerly': None, 'steps_per_execution': None, 'jit_compile': None}}.

Exception encountered: <class 'keras.src.layers.normalization.batch_normalization.BatchNormalization'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': 'keras.layers', 'class_name': 'BatchNormalization', 'config': {'name': 'batch_normalization', 'trainable': True, 'dtype': 'float32', 'axis': [1], 'momentum': 0.99, 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'gamma_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'moving_mean_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'moving_variance_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}, 'registered_name': None, 'build_config': {'input_shape': [None, 256]}}.

Exception encountered: Error when deserializing class 'BatchNormalization' using config={'name': 'batch_normalization', 'trainable': True, 'dtype': 'float32', 'axis': [1], 'momentum': 0.99, 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'gamma_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'moving_mean_initializer': {'module': 'keras.initializers', 'class_name': 'Zeros', 'config': {}, 'registered_name': None}, 'moving_variance_initializer': {'module': 'keras.initializers', 'class_name': 'Ones', 'config': {}, 'registered_name': None}, 'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}.

Exception encountered: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [None]:
#X_train_full = np.array(train_data_with_embeddings, dtype=np.float32)
#X_train_full = scaler_full.fit_transform(X_train_full)

# Split training and validation data
#X_train, X_val, y_train, y_val = train_test_split(X_train_full, y[:len(X_train_full)], test_size=0.2, random_state=42)

# Build model
#input_dim = X_train_full.shape[1]

## EVALUATION



In [None]:
# Save Recommendations to File
recommendations_file = 'gdrive/My Drive/MLDM - Carrefour Project/data-train/product_recommendations_combined.npy'

# Step 6: Load Recommendations from File
def load_recommendations():
    """Load recommendations from the saved file.

    Reads the .npy file at `recommendations_file` and unwraps the single stored
    Python object with `.item()`.

    SECURITY NOTE: allow_pickle=True unpickles arbitrary objects; only use this on
    files produced by this project.
    """
    return np.load(recommendations_file, allow_pickle=True).item()

recommendations_data = load_recommendations()

# Step 7: Recommend Products from Loaded File
def recommend_from_loaded_file(product_id):
    """Look up the preloaded recommendations for one product.

    Args:
        product_id: key to look up in `recommendations_data`.

    Returns:
        The stored recommendations for `product_id`, or -- when the product is
        unknown -- a human-readable "not found" message *string*. Callers that
        expect a collection must check for the string case.
    """
    if product_id in recommendations_data:
        return recommendations_data[product_id]
    return f"Product ID {product_id} not found in recommendations."

# Example Usage
example_product_id = "Product_33508"
print(f"Recommendations for {example_product_id}: {recommend_from_loaded_file(example_product_id)}")

In [None]:
# Prepare test set for Hit Rate @10
def hitrate_at_k(true_data: pd.DataFrame, predicted_data: pd.DataFrame, k: int = 10) -> float:
    """
    Calculate the hitrate at k for the recommendations.

    For every customer present in `true_data`, the score is
    (number of distinct purchased products that were recommended with rank <= k)
    divided by min(number of distinct purchased products, k).
    The function returns the mean of that score over customers.

    Both inputs are de-duplicated on (customer_id, product_id) first. Without this,
    a product bought in several transactions (or recommended twice) is counted as
    several hits while the denominator only counts distinct products, which can
    push the rate above 1.

    Args:
        true_data: a pandas DataFrame containing the true data
            customer_id: the customer identifier
            product_id: the product identifier that was purchased in the test set
        predicted_data: a pandas DataFrame containing the predicted data
            customer_id: the customer identifier
            product_id: the product identifier that was recommended
            rank: the rank of the recommendation. the rank should be between 1 and k.
        k: the number of recommendations to consider. k should be between 1 and 10.

    Returns:
        The hitrate at k (NaN when `true_data` contains no customers).

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    key_cols = ["customer_id", "product_id"]

    # One row per distinct purchase
    purchases = true_data[key_cols].drop_duplicates()

    # Recommendations within the top k; if a product was recommended more than
    # once for a customer, keep its best (lowest) rank only
    top_k = (
        predicted_data.loc[predicted_data["rank"] <= k, key_cols + ["rank"]]
        .sort_values("rank")
        .drop_duplicates(subset=key_cols, keep="first")
    )

    # Left join: purchases without a matching recommendation get rank = NaN
    matched = purchases.merge(top_k, how="left", on=key_cols)

    per_customer = matched.groupby("customer_id").agg(
        non_null_count=("rank", "count"),                 # hits (count() skips NaN)
        distinct_product_count=("product_id", "nunique"),
    )
    # A customer can get at most k hits, so cap the denominator at k
    denominator = per_customer["distinct_product_count"].clip(upper=k)
    return (per_customer["non_null_count"] / denominator).mean()


In [None]:
# Build a one-column DataFrame holding only 'product_id' from products_data
# (generated from the prompt: "change products_data to dataframe with colum 'product_id'")

products_data2 = pd.DataFrame(products_data, columns=['product_id'])
products_data2

In [None]:
import random

def generate_recommendations_subgroup(customer_purchases):
  """Extend a customer's product list with recommendations for each product.

  For every value in customer_purchases['product_id'], the preloaded
  recommendations are looked up via `recommend_from_loaded_file` and appended
  (as new rows that only have 'product_id' set) after the original rows.

  Args:
    customer_purchases: DataFrame with at least a 'product_id' column.

  Returns:
    A new DataFrame: the original rows followed by the recommended products,
    with duplicate product_ids removed (first occurrence wins, so original
    purchase rows take precedence over recommendation rows).
  """
  recommended_product_ids = []

  for product_id in customer_purchases['product_id']:
    recommendations = recommend_from_loaded_file(product_id)
    # For unknown products the lookup returns a "... not found ..." message string.
    # list.extend() on a string would add its characters one by one as fake
    # product ids, so skip that case.
    if isinstance(recommendations, str):
      continue
    recommended_product_ids.extend(recommendations)

  recommended_df = pd.DataFrame({'product_id': recommended_product_ids})

  combined = pd.concat([customer_purchases, recommended_df], ignore_index=True)
  return combined.drop_duplicates(subset='product_id', keep='first')

In [None]:
customer_product_data

In [None]:
# Strip the 'Household_' prefix so the IDs become bare digit strings (e.g. '80001'),
# the form the later cells compare against customer_product_data_df['customer_id'].
customer_ids_new = [customer_id.replace('Household_', '') for customer_id in customer_ids_new]


In [None]:
type(customer_ids_new[0])

In [None]:
customer_ids_new[0]

In [None]:
type(customer_product_data[0][0])

In [None]:
# Wrap the loaded customer/product array in a DataFrame
customer_product_data_df = pd.DataFrame(customer_product_data)

# Name the 8 columns (assigning .columns raises if the column count is not 8)
customer_product_data_df.columns = ['customer_id', 'product_id', 'quantity', 'recency', 'quantity_score', 'recency_score', 'final_score', 'rank']

# One entry per row of the frame (not de-duplicated); this Series is what the
# legacy evaluation cells near the end of the notebook iterate over.
customer_ids = customer_product_data_df['customer_id']


In [None]:
customer_product_data_df

In [None]:
customer_ids_new

In [None]:
customer_purchases = customer_product_data_df[customer_product_data_df['customer_id'] == '80001']
customer_purchases = customer_purchases[customer_purchases['rank'] <= 15]
customer_purchases

In [None]:
customer_filtered = train_data[train_data['customer_id'] == 'Household_80212']
customer_filtered

In [None]:
train_data

In [None]:
# prompt: remove all letters and underscore from dataframe rows  keep it simple

import pandas as pd
import re

def remove_letters_and_underscore(df):
  """Strip ASCII letters and underscores from every string column.

  Args:
    df: The input pandas DataFrame. It is not modified.

  Returns:
    A copy of `df` in which each column detected as string-typed has been cast
    to str and had all characters matching [a-zA-Z_] removed. Other columns
    are returned unchanged.
  """
  letters_or_underscore = re.compile(r'[a-zA-Z_]')

  def _strip(text):
    return letters_or_underscore.sub('', text)

  # Decide which columns to clean from the *original* frame
  text_columns = [name for name in df.columns if pd.api.types.is_string_dtype(df[name])]

  cleaned = df.copy()
  for name in text_columns:
    cleaned[name] = df[name].astype(str).apply(_strip)

  return cleaned


In [None]:
train_data = remove_letters_and_underscore(train_data)
train_data

In [None]:
# Rebind instead of mutating in place, and ignore a missing column, so that
# re-running this cell does not raise KeyError once the column is already gone.
train_data = train_data.drop(columns=['days_since_last_purchase'], errors='ignore')

In [None]:
customer_product_data_df

In [None]:
# Fit the feature scaler on the selected feature columns of the training data
feature_matrix = train_data[features].values          # raw feature columns
customer_features_train = feature_matrix.astype(float)  # digit strings / ints -> float
scaler_original.fit(customer_features_train)

In [None]:

#train_data = remove_letters_and_underscore(train_data)
# Prepare predicted data for hit rate calculation.
# For each customer: take their 15 best-ranked historical products as candidates,
# score every (customer features + product embedding) vector with the model in a
# single batched predict call, and keep the 10 highest-scoring distinct products.
predicted_data = []
for customer_id in tqdm(customer_ids_new, desc='Processing customers'):
    customer_purchases = customer_product_data_df[customer_product_data_df['customer_id'] == customer_id]
    customer_purchases = customer_purchases[customer_purchases['rank'] <= 15]
    customer_purchases = customer_purchases.drop(columns=['customer_id','quantity_score','recency_score','final_score','rank'])

    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    if customer_filtered.empty:
        continue  # no training rows -> no feature vector for this customer

    # The customer's feature vector is taken from their first training row
    customer_features = customer_filtered[features].iloc[0].values
    customer_features = scaler_original.transform([customer_features.astype(float)])
    customer_vector = customer_features.ravel()

    # Collect one model input row per candidate that has an embedding.
    # ravel() on both parts makes the concatenation independent of whether an
    # embedding is stored as shape (d,) or (1, d).
    candidate_ids = []
    candidate_inputs = []
    for raw_product_id in customer_purchases['product_id']:
        product_key = 'Product_' + raw_product_id
        product_embedding = product_embedding_dict.get(product_key)
        if product_embedding is not None:
            candidate_ids.append(product_key)
            candidate_inputs.append(np.concatenate([customer_vector, np.ravel(product_embedding)]))

    # One predict call per customer instead of one per product; scores are stored
    # as plain floats so they sort unambiguously.
    scores = []
    if candidate_inputs:
        batch_scores = model.predict(np.vstack(candidate_inputs), verbose=0).reshape(-1)
        scores = [(product_key, float(score)) for product_key, score in zip(candidate_ids, batch_scores)]

    # Highest score first; rank is the 1-based position among distinct products
    top_10 = []
    seen_products = set()
    for product_id, _score in sorted(scores, key=lambda item: item[1], reverse=True):
        if product_id not in seen_products:
            top_10.append((product_id, len(top_10) + 1))
            seen_products.add(product_id)
        if len(top_10) == 10:
            break
    predicted_data.extend([(customer_id, product_id, rank) for product_id, rank in top_10])

predicted_df = pd.DataFrame(predicted_data, columns=['customer_id', 'product_id', 'rank'])
# customer_ids_new holds bare digit strings here; restore the prefix used by test_data
predicted_df['customer_id'] = 'Household_' + predicted_df['customer_id'].astype(str)
# Calculate Hit Rate @10 using new function
hit_rate_at_10 = hitrate_at_k(test_data[['customer_id', 'product_id']], predicted_df, k=10)
print('Hit Rate @10:', hit_rate_at_10)

In [None]:
predicted_df

In [None]:
test_data

In [None]:
predicted_df

In [None]:
customer_product_data[8001][0]

In [None]:
scores

In [None]:
product_embedding_dict

In [None]:
# Create submission file from existing predictions:
# keep only rows whose customer_id is 'Household_1' ... 'Household_100000'
# (the range upper bound 100001 is exclusive).
submission_df = predicted_df[predicted_df['customer_id'].isin([f"Household_{i}" for i in range(1, 100001)])]
# NOTE: `prediction` is a second name for the same DataFrame object, not a copy.
prediction = submission_df



In [None]:
# Spot check for one customer: how many of the products they bought in the test
# set also appear in the submission?
# NOTE: despite the "16874" in the variable names, both frames are filtered on Household_5017.
household_16874_df = test_data[test_data['customer_id'] == 'Household_5017']
household_16874_df3 = submission_df[submission_df['customer_id'] == 'Household_5017']
test_products = set(household_16874_df['product_id'])
common_products = test_products.intersection(set(household_16874_df3['product_id']))
print(f"Number of common products: {len(common_products)}")
# common_products is a set, so divide by the number of *distinct* purchased products;
# dividing by the raw row count understates the ratio when a product was bought
# in more than one transaction.
similarity_ratio = len(common_products) / len(test_products) if len(test_products) > 0 else 0
print(f"Similarity Ratio: {similarity_ratio}")

if similarity_ratio > 0.5:  # Adjust threshold as needed
    print("The product IDs in the two dataframes are relatively similar.")
else:
    print("The product IDs in the two dataframes are not very similar.")

In [None]:
prediction

In [None]:
submission_df

In [None]:
# Process and format prediction
def process_and_format_prediction(df):
    """Convert row-per-recommendation predictions into the submission layout.

    Steps:
      1. Repair the '+AF8-' artefact (a mis-encoded underscore) in column names
         and in cell values.
      2. Reduce textual 'customer_id' / 'product_id' / 'transaction_id' values to
         their numeric part (e.g. 'Household_12' -> 12).
      3. Group by customer and join that customer's product ids and ranks into
         comma-separated strings, in row order.

    Rows whose customer_id contains no digits are dropped. Earlier versions marked
    them with the sentinel value 11 and then removed customer 11, which also
    deleted the genuine customer 'Household_11' from the submission.

    Args:
        df: DataFrame with 'customer_id', 'product_id' and 'rank' columns
            (optionally 'transaction_id'). It is not modified.

    Returns:
        DataFrame with columns ['id', 'customer_id', 'product_id', 'rank'], one row
        per customer, where 'id' is a 0-based running number.

    Raises:
        ValueError: if 'customer_id' or 'product_id' is missing.
    """
    def _is_text(column):
        # Covers both classic object columns and pandas' dedicated string dtypes
        return pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)

    # Work on a copy so the caller's frame (and its column labels) stay untouched
    df = df.copy()

    # Replace invalid characters in column names and values
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Clean columns 'customer_id', 'product_id', and 'transaction_id'
    if _is_text(df['customer_id']):
        customer_numbers = pd.to_numeric(
            df['customer_id'].str.extract(r'(\d+)', expand=False), errors='coerce'
        )
        parsable = customer_numbers.notna()
        df = df[parsable].copy()
        df['customer_id'] = customer_numbers[parsable].astype(int)
    if _is_text(df['product_id']):
        # NOTE(review): unparsable product ids still fall back to 11, as before -- confirm
        # whether such rows should be dropped instead.
        product_numbers = pd.to_numeric(
            df['product_id'].str.extract(r'(\d+)', expand=False), errors='coerce'
        )
        df['product_id'] = product_numbers.fillna(11).astype(int)
    if 'transaction_id' in df.columns and _is_text(df['transaction_id']):
        # errors='coerce' turns an id with no digits ('' after stripping) into NaN
        # instead of crashing in int('')
        transaction_numbers = pd.to_numeric(
            df['transaction_id'].str.replace(r'\D', '', regex=True), errors='coerce'
        )
        df['transaction_id'] = transaction_numbers.fillna(11).astype(int)

    def _join_as_text(values):
        return ','.join(map(str, values))

    # Group by 'customer_id' and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'product_id': _join_as_text,
        'rank': _join_as_text,
    }).reset_index()

    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


In [None]:
prediction_grouped

In [None]:
# Create a .csv file to submit on kaggle
# To be run locally on your computer
prediction_grouped.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/finalbitch.csv', index=False) ####

In [None]:
predicted_df.to_csv('gdrive/My Drive/MLDM - Carrefour Project/submission/predicted_df.csv', index=False) ####

In [None]:
predicted_df

## prev


In [None]:
# Legacy evaluation: share of customers with at least one hit in their top 10.
#customer_ids = test_data['customer_id'].unique()

# NOTE(review): points to confirm before trusting this number:
#  * candidates are the customer's own test_data rows, so every scored product is
#    one they bought; any customer with a non-empty top_10 is counted as a hit;
#  * `customer_ids` is iterated as-is; if it is the customer_id column of
#    customer_product_data_df it is not de-duplicated, so customers repeat;
#  * customers skipped via `continue` still count in the final denominator.

hit_count = 0
customer_total_hits = 0
for customer_id in tqdm(customer_ids, desc='Processing customers'):
    customer_purchases = test_data[test_data['customer_id'] == customer_id]
    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    # Customer feature vector = first matching training row; skip customers with none
    if not customer_filtered.empty:
        customer_features = customer_filtered[features].iloc[0].values
    else:
        continue
    customer_features = scaler_original.transform([customer_features.astype(float)])
    scores = []
    for _, row in tqdm(customer_purchases.iterrows(), total=len(customer_purchases), desc='Processing products', leave=False):
        product_embedding = product_embedding_dict.get(row['product_id'])
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            # One predict call per product; `score` is the raw array returned by predict
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            scores.append((row['product_id'], score))
    # dict() collapses repeated product ids (the last one in sorted order wins), then keep 10
    top_10 = list(dict(sorted(scores, key=lambda x: x[1], reverse=True)).items())[:10]
    purchased_products = customer_purchases['product_id'].values
    # hit_count accumulates the overlap size but is not used in the final rate below
    hit_count += len(set(purchased_products) & set([product_id for product_id, _ in top_10]))
    customer_total_hits += 1 if len(set(purchased_products) & set([product_id for product_id, _ in top_10])) > 0 else 0

hit_rate_at_10 = customer_total_hits / len(customer_ids)
print('Hit Rate @10:', hit_rate_at_10)



In [None]:
# Create the long-format submission: up to 10 ranked products per customer.
submission_data = []
for customer_id in customer_ids:
    customer_purchases = test_data[test_data['customer_id'] == customer_id]

    # Rebuild this customer's scaled feature vector. Previously the loop never
    # assigned `customer_features`, so every customer was scored with the
    # vector left over from the last iteration of the Hit Rate cell.
    customer_filtered = train_data[train_data['customer_id'] == customer_id]
    if customer_filtered.empty:
        # Same rule as the Hit Rate cell: without training rows there is no
        # feature vector, so no prediction is produced for this customer.
        continue
    customer_features = scaler_original.transform(
        [customer_filtered[features].iloc[0].values.astype(float)]
    )

    scores = []
    # Score each distinct product once. Repeated purchases would otherwise
    # put the same product several times in the top 10, which the duplicate
    # check in process_and_format_prediction rejects.
    for product_id in customer_purchases['product_id'].unique():
        product_embedding = product_embedding_dict.get(product_id)
        if product_embedding is not None:
            input_features = np.hstack([customer_features, product_embedding])
            score = model.predict(np.array(input_features).reshape(1, -1), verbose=0)
            # Plain float sort key (presumably the model returns a (1, 1) array - confirm).
            scores.append((product_id, float(np.ravel(score)[0])))
    top_10 = sorted(scores, key=lambda item: item[1], reverse=True)[:10]
    submission_data.extend([(customer_id, product_id, rank + 1) for rank, (product_id, _) in enumerate(top_10)])

# Convert to DataFrame and keep only the customers named Household_1 .. Household_100000.
submission_df = pd.DataFrame(submission_data, columns=['customer_id', 'product_id', 'rank'])
# .copy() makes `prediction` an independent frame rather than a slice of submission_df.
prediction = submission_df[submission_df.customer_id.isin([f"Household_{i}" for i in range(1, 100001)])].copy()

# Process and format prediction
def process_and_format_prediction(df):
    """Turn a long prediction table into one validated row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'customer_id', 'product_id' and 'rank', one row per
        (customer, product) prediction. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'id' (0..n-1), 'customer_id', 'product_id' and 'rank', where
        the last two are comma-joined strings per customer. Returns None,
        after printing a message, as soon as one customer does not have
        exactly the ranks 1..10 or has a repeated product.

    Raises
    ------
    ValueError
        If 'customer_id' or 'product_id' is missing after column clean-up.

    Notes
    -----
    NOTE(review): 11 is used both as the placeholder for ids with no digits
    and as the customer id filtered out at the end, so a genuine customer 11
    is removed as well. This is kept unchanged because the expected
    submission layout may rely on it - confirm.
    """
    # Work on a copy: assigning `df.columns` directly would rename the
    # columns of the caller's frame as a side effect.
    df = df.copy()

    # '+AF8-' is the UTF-7 encoding of '_'; undo it in column names and values.
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Keep only the first run of digits of textual ids ('Household_12' -> 12).
    # Raw strings avoid the invalid '\d' escape, and expand=False yields a
    # Series rather than a one-column DataFrame.
    for id_column in ('customer_id', 'product_id'):
        if id_column in df.columns and df[id_column].dtype == 'object':
            df[id_column] = (
                df[id_column].str.extract(r'(\d+)', expand=False).fillna(11).astype(int)
            )
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # One row per customer; products and ranks are joined in row order.
    # The former temporary 'id' column was dropped right after aggregation,
    # so it is no longer created.
    prediction_grouped = df.groupby('customer_id').agg({
        'product_id': lambda values: ','.join(map(str, values)),
        'rank': lambda values: ','.join(map(str, values))
    }).reset_index()

    # Drop the placeholder customer, then number the remaining rows.
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11].copy()
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Verify ranks and duplicates, customer by customer.
    for rank_csv, product_csv in zip(prediction_grouped['rank'], prediction_grouped['product_id']):
        ranks = [int(value) for value in rank_csv.split(',')]
        if sorted(ranks) != list(range(1, 11)):
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = product_csv.split(',')
        if len(products) != len(set(products)):
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped

# Format and validate the filtered predictions, then print the result.
# `prediction_grouped` is None when the rank/duplicate check fails.
prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


In [None]:
# Write the submission CSV (without the row index) to a path relative to the working directory.
# NOTE(review): process_and_format_prediction returns None when its validation fails,
# in which case this call raises AttributeError.
prediction_grouped.to_csv('submission/submission_list_bert_ft.csv', index=False) ####