In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from huggingface_hub import HfApi
import gc
import os
from torch.utils.data import IterableDataset

## Data Description

This project uses three datasets:

#### `train_data.csv`:
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***order_channel***: Indicates whether the online order was placed through the website (`WEBSITE`) or the mobile app (`MOBILE_APP`)

#### `products_data.csv`:
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product ID (e.g. `Product_53110`)
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`:
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

## Load data

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [None]:
# Mount Google Drive at /content/drive so the CSV files referenced below can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the selected "train_data_part_<i>.csv" shards and concatenate them into a
# single pandas DataFrame. Only the shard indices covered by the range below are
# loaded (currently just part 8), not every part of the training set.

data_path = '/content/drive/MyDrive/Colab_Notebooks/MLDM_Project/data-train/'

train_dataframes = []
for i in tqdm(range(8, 9)): # widen to range(8, 11) to also load parts 9 and 10; so far only part 8 has been used
    train_dataframes.append(pd.read_csv(f'{data_path}train_data_part_{i}.csv'))
train_data = pd.concat(train_dataframes, ignore_index=True)

# free up memory by deleting the dataframes we no longer need
del train_dataframes

100%|██████████| 1/1 [00:23<00:00, 23.99s/it]


In [None]:
# Read the product catalogue ("products_data.csv") into a pandas DataFrame.
# NOTE(review): the captured output shows pandas emitted a warning for this call
# (presumably a mixed-dtype warning) — confirm and consider passing explicit dtypes.
products_data = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MLDM_Project/data-train/products_data.csv')

  products_data = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MLDM_Project/data-train/products_data.csv')


In [None]:
# Read the held-out purchases ("test_data.csv") into a pandas DataFrame.
# These rows are used later as the ground truth when scoring recommendations.
test_data = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MLDM_Project/data-train/test_data.csv')

In [None]:
# Preview five random training rows (no random_state is set, so the rows differ between runs).
train_data.sample(5)

Unnamed: 0,date,transaction_id,customer_id,product_id,has_loyality_card,store_id,is_promo,quantity,format,order_channel
6053173,2023-04-30,Transaction_2795927,Household_71252,Product_42689,0,Store_2339,0,1.0,CLCV,WEBSITE
5217956,2022-01-31,Transaction_2423118,Household_74792,Product_26296,0,Store_1791,1,1.0,DRIVE,WEBSITE
3185447,2022-02-20,Transaction_1852666,Household_73582,Product_5836,0,Store_2466,0,1.0,DRIVE,WEBSITE
17182,2022-05-30,Transaction_2573260,Household_73234,Product_74487,0,Store_167,0,1.0,DRIVE,WEBSITE
5518457,2023-05-22,Transaction_731947,Household_73622,Product_62786,0,Store_2186,0,1.0,CLCV,MOBILE_APP


In [None]:
# Preview three random catalogue rows (no random_state is set, so the rows differ between runs).
products_data.sample(3)

Unnamed: 0,product_id,product_description,department_key,class_key,subclass_key,sector,brand_key,shelf_level1,shelf_level2,shelf_level3,...,alcool,lactose_free,phenylalanine_free,palm_oil_free,ecoscore,produits_du_monde,regional_product,national_brand,first_price_brand,carrefour_brand
72322,Product_53110,130G ASSORT ITALIEN PR CRF CL,Department_25,Class_2541,SubClass_25411,PGC,CRF CLASS,Charcuterie et Traiteur,Charcuterie,Jambons crus et Charcuteries tranchées,...,0,0,0,0,,0,0,0,0,1
9003,Product_54273,CADUM DCH HYPOALLERGENIQUE FP7,Department_12,Class_1211,SubClass_12114,PGC,CADUM,Hygiène et Beauté,Corps,Gels douche,...,0,0,0,0,,0,0,1,0,0
21774,Product_11975,1KG LASAGNES GEANTES TEZIER,Department_14,Class_1451,SubClass_14511,PGC,TEZIER,Epicerie salée,Pâtes,Lasagnes et Crozets,...,0,0,0,0,,0,1,1,0,0


In [None]:
# Preview three random test rows (no random_state is set, so the rows differ between runs).
test_data.sample(3)

Unnamed: 0,transaction_id,customer_id,product_id
24741,Transaction_2024_6056,Household_42335,Product_61412
522812,Transaction_2024_87517,Household_22626,Product_48038
59112,Transaction_2024_14640,Household_41888,Product_57858


## Preprocess the data

Use mutual information to identify the most informative features, while keeping the original (un-encoded) columns intact for prompt creation.

In [None]:
# Rank transaction/product attributes by their mutual information with purchase
# frequency, using a 10,000-row sample of part 1 so the cell stays cheap to run.

# Load sample data
sample_data = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MLDM_Project/data-train/train_data_part_1.csv', nrows=10000)
sample_data = sample_data.merge(products_data, on='product_id', how='left')

# Calculate purchase frequency for sample data (number of sampled rows per product)
frequency_data = sample_data.groupby('product_id').size().reset_index(name='purchase_frequency')
sample_data = sample_data.merge(frequency_data, on='product_id', how='left')

# Drop identifiers and the target itself, then separate features and target
X = sample_data.drop(columns=['purchase_frequency', 'date', 'transaction_id', 'customer_id', 'product_id'])
y = sample_data['purchase_frequency']

# Encode categorical features: label-encode columns with few unique values,
# one-hot encode the rest (threshold of 10 unique values is an example cut-off)
categorical_features = X.select_dtypes(include=['object']).columns
low_cardinality = [col for col in categorical_features if X[col].nunique() < 10]
high_cardinality = [col for col in categorical_features if col not in low_cardinality]

for col in low_cardinality:
    X[col] = LabelEncoder().fit_transform(X[col])

# A single get_dummies call yields the same columns as encoding one column at a
# time, without re-copying the (growing) frame for every column.
if high_cardinality:
    X = pd.get_dummies(X, columns=high_cardinality, drop_first=True)

# Calculate mutual information. The estimator is randomized, so the seed is
# fixed to make the ranking reproducible across runs.
mutual_info = mutual_info_regression(X, y, random_state=42)
mutual_info_series = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)

# Display top features
top_features = mutual_info_series.head(10)
print("Top features based on mutual information:\n", top_features)

Top features based on mutual information:
 ecoscore                                      0.079769
shelf_level1_Crèmerie et Produits laitiers    0.059322
department_key_Department_22                  0.057195
shelf_level2_Légumes                          0.057161
carrefour_brand                               0.055642
sector                                        0.054353
department_key_Department_25                  0.051316
brand_key_ZZZZZZZZZZ                          0.045082
national_brand                                0.042777
class_key_Class_2231                          0.039390
dtype: float64


In [None]:
# Product attributes kept for prompt building. They are the source columns behind
# the highest-ranked entries of the mutual-information output above.
top_feature_columns = [
    'ecoscore',
    'shelf_level1',
    'department_key',
    'shelf_level2',
    'carrefour_brand',
    'sector',
    'brand_key',
    'national_brand',
    'class_key',
]

# Slim product table: the selected attributes plus the join key
reduced_products_data = products_data[[*top_feature_columns, 'product_id']]

# Slim transaction table (identifiers + quantity) enriched with the selected
# product attributes; the left join keeps every transaction row.
reduced_train_data = pd.merge(
    train_data[['date', 'transaction_id', 'customer_id', 'product_id', 'quantity']],
    reduced_products_data,
    on='product_id',
    how='left',
)

# Quick look at the first rows of the compact dataset
print(reduced_train_data.head())

         date       transaction_id      customer_id     product_id  quantity  \
0  2023-05-01  Transaction_1016059  Household_70602  Product_80441       1.0   
1  2023-09-04  Transaction_1070016  Household_71431  Product_58948       2.0   
2  2023-04-03  Transaction_2796335  Household_71431  Product_63692       1.0   
3  2023-11-02  Transaction_2390894  Household_71431  Product_79339       1.0   
4  2023-05-01  Transaction_1853829  Household_71431  Product_61922       1.0   

  ecoscore            shelf_level1 department_key  \
0      NaN  Entretien et Nettoyage  Department_11   
1      NaN              Animalerie  Department_14   
2        A         Epicerie sucrée  Department_14   
3      NaN         Epicerie sucrée  Department_14   
4        C                Boissons  Department_10   

                          shelf_level2  carrefour_brand sector   brand_key  \
0                 Nettoyants vaisselle                1    PGC  CRF EXPERT   
1                                Chats      

### Sectioning the dataset

In [None]:
# Convert the date column to datetime if not already done
reduced_train_data['date'] = pd.to_datetime(reduced_train_data['date'])

# Filter data for 2023
data_2023 = reduced_train_data[reduced_train_data['date'].dt.year == 2023]

# Split the 2023 data into 10 roughly equal, contiguous sections.
# The split is done on row positions instead of calling np.array_split on the
# DataFrame itself, which emits a pandas FutureWarning. Section sizes follow the
# same rule as np.array_split: the first `remainder` sections get one extra row.
n_sections = 10
base_size, remainder = divmod(len(data_2023), n_sections)
data_2023_sections = []
section_start = 0
for section_index in range(n_sections):
    section_size = base_size + (1 if section_index < remainder else 0)
    data_2023_sections.append(data_2023.iloc[section_start:section_start + section_size])
    section_start += section_size

# Display the number of rows in each section
for i, section in enumerate(data_2023_sections):
    print(f"Section {i+1}: {len(section)} rows")

# Free the large intermediate frames; only the sections are kept from here on
del reduced_train_data
del data_2023

  return bound(*args, **kwds)


Section 1: 477088 rows
Section 2: 477088 rows
Section 3: 477088 rows
Section 4: 477088 rows
Section 5: 477087 rows
Section 6: 477087 rows
Section 7: 477087 rows
Section 8: 477087 rows
Section 9: 477087 rows
Section 10: 477087 rows


## GPT

### Initial Test

### Taking only 50 customers

In [None]:
# `reduced_train_data` and `data_2023` were deleted at the end of the sectioning
# cell to free memory, so the 2023 transactions are rebuilt from the saved
# sections (their `date` column was already converted to datetime there).
data_2023 = pd.concat(data_2023_sections)

# Randomly select 50 customers from the training data (fixed seed for reproducibility)
selected_customers = data_2023['customer_id'].drop_duplicates().sample(n=50, random_state=42)

# Filter the training and test sets for these customers
filtered_train_data = data_2023[data_2023['customer_id'].isin(selected_customers)]
filtered_test_data = test_data[test_data['customer_id'].isin(selected_customers)]

# print("Filtered Training Data:\n", filtered_train_data.head())
# print("Filtered Test Data:\n", filtered_test_data.head())

The prompts in this first test use no product features, just the list of product IDs in each transaction:


In [None]:
# Collapse line items into baskets: one row per (customer, date, transaction)
# whose `product_id` cell holds the list of products bought in that transaction,
# ordered by customer and then by date.
grouped_data = (
    filtered_train_data
    .groupby(['customer_id', 'date', 'transaction_id'])
    .agg({'product_id': list})
    .reset_index()
    .sort_values(by=['customer_id', 'date'])
)

# Show the first baskets
print("Grouped Training Data:\n", grouped_data.head())

Grouped Training Data:
        customer_id       date       transaction_id  \
0  Household_70176 2023-01-18   Transaction_405354   
1  Household_70176 2023-02-14   Transaction_549363   
2  Household_70176 2023-04-05   Transaction_154183   
3  Household_70176 2023-08-23  Transaction_2614460   
4  Household_70176 2023-10-05  Transaction_1122708   

                                          product_id  
0  [Product_31374, Product_59935, Product_81818, ...  
1  [Product_16304, Product_39217, Product_56031, ...  
2                                    [Product_66063]  
3  [Product_2054, Product_12679, Product_23153, P...  
4  [Product_73607, Product_57478, Product_33872, ...  


### Training GPT for 50 customers

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)

Device: cuda


In [None]:
class PurchaseDataset(Dataset):
    """Next-basket language-modelling examples built from per-transaction baskets.

    For every customer, each pair of consecutive transactions (ordered by date)
    becomes one text example: the earlier basket is the context and the later
    basket is the continuation the model should learn to produce.

    Args:
        df: DataFrame with `customer_id`, `date` and `product_id` columns, one row
            per transaction, where each `product_id` cell is an iterable of
            product ids.
        tokenizer: Callable tokenizer returning `input_ids` and `attention_mask`
            tensors of shape (1, max_length) when called with
            `padding='max_length'` and `return_tensors="pt"`.
        max_length: Fixed token length every example is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.inputs = []
        # Iterate through each customer of the frame that was passed in
        # (not a notebook-level global, so the dataset reflects its argument).
        for customer_id, customer_data in tqdm(df.groupby('customer_id'), desc="Processing customers"):

            customer_data = customer_data.sort_values(by='date')

            # Exclude the very last purchase (since it has no "next" purchase to predict).
            # Customers with a single transaction therefore contribute no example.
            for i in range(len(customer_data) - 1):
                current_row = customer_data.iloc[i]
                next_row = customer_data.iloc[i + 1]
                current_product_ids = current_row['product_id']
                next_product_ids = next_row['product_id']

                # Create the prompt for the sequence of purchases
                prompt = f"Customer {customer_id} purchased: {', '.join(map(str, current_product_ids))} on {current_row['date']}. These are the products they will buy next:\n" \
                         f"{', '.join(map(str, next_product_ids))}."

                # Tokenize inside the pair loop so that every consecutive pair
                # yields an example, not only the last pair of each customer.
                tokenized = tokenizer(prompt, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
                input_ids = tokenized["input_ids"].squeeze(0)
                attention_mask = tokenized["attention_mask"].squeeze(0)

                # Labels mirror the inputs, except that padded positions are set
                # to -100 so they are ignored by the language-modelling loss.
                labels = input_ids.clone()
                labels[attention_mask == 0] = -100

                self.inputs.append({
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "labels": labels
                })

    def __len__(self):
        """Number of (current basket, next basket) examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the tokenized example at position `idx`."""
        return self.inputs[idx]

In [None]:
# class PurchaseDataset(Dataset):
#     def __init__(self, df, tokenizer, max_length=512):
#         self.inputs = []
#         # Iterate over the dataframe rows with tqdm for progress tracking
#         for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing Dataset"):
#             # Ensure 'product_id' is a list; it should already be grouped
#             product_ids = row['product_id']

#             # Generate the prompt with all products
#             prompt = f"Customer {row['customer_id']} purchased: {', '.join(map(str, product_ids))} on {row['date']}."

#             #print(prompt)


#             """
#             prompt = f"Customer: {row['customer_id']} made a purchase on {row['date']} for product name: {row['product_id']} with features:\n" \
#                                 f"- Quantity: {row['quantity']}, EcoScore: {row['ecoscore']}, Shelf Level 1: {row['shelf_level1']}, Shelf Level 2: {row['shelf_level2']}, " \
#                                 f"- Department: {row['department_key']}, Sector: {row['sector']}, Brand: {row['brand_key']}, (Carrefour Brand: {row['carrefour_brand']}), " \
#                                 f"- National Brand: {row['national_brand']}, and Class: {row['class_key']}.\n"

#             """
#             # Tokenize the prompt
#             tokenized = tokenizer(prompt, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

#             # Use input_ids as labels
#             self.inputs.append({
#                 "input_ids": tokenized["input_ids"].squeeze(),
#                 "attention_mask": tokenized["attention_mask"].squeeze(),
#                 "labels": tokenized["input_ids"].squeeze()
#             })

#     def __len__(self):
#         return len(self.inputs)

#     def __getitem__(self, idx):
#         return self.inputs[idx]

In [None]:
def fine_tune_gpt(data, model, output_dir):
    """Fine-tune `model` in place on the purchase prompts derived from `data`.

    `data` is wrapped in a PurchaseDataset (tokenized with the notebook-level
    `tokenizer`) and trained with the Hugging Face Trainer. Checkpoint saving
    and external loggers are disabled; the loss is logged every 10 steps.

    Args:
        data: Frame handed to PurchaseDataset.
        model: Model to train; it is updated in place.
        output_dir: Directory passed to TrainingArguments.
    """
    train_dataset = PurchaseDataset(data, tokenizer)

    trainer_settings = {
        "output_dir": output_dir,
        "per_device_train_batch_size": 4,
        "num_train_epochs": 10,  # earlier experiments used 1
        "logging_strategy": "steps",
        "logging_steps": 10,  # report the loss every 10 steps
        "save_strategy": "no",
        "report_to": [],  # no W&B / TensorBoard
        "log_level": "error",
        "disable_tqdm": False,  # keep the progress bar
    }

    Trainer(
        model=model,
        args=TrainingArguments(**trainer_settings),
        train_dataset=train_dataset,
    ).train()

In [None]:
# Initialize GPT tokenizer and model from the base "gpt2" checkpoint
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Alternative: tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Reuse the end-of-sequence token as the padding token so padded batches can be built
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# Alternative: checkpoint = "gpt2"; model = AutoModelForCausalLM.from_pretrained(checkpoint)

# For further tuning, start from the previously uploaded model instead:
#model = GPT2LMHeadModel.from_pretrained(hub_repo_name).to(device)

# Disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Fine-tune GPT-2 on the grouped 2023 baskets of the sampled customers.
# Nothing is uploaded here; the push to the Hugging Face Hub happens in a later cell.
fine_tune_gpt(grouped_data, model, output_dir="./gpt_recommend")

Processing customers: 100%|██████████| 50/50 [00:00<00:00, 110.34it/s]


Step,Training Loss
10,2.0185
20,1.4686
30,1.3851
40,1.1824
50,1.2965
60,1.0579
70,1.335
80,1.1957
90,1.1604
100,1.1788


### Upload in Hugging Face

In [None]:
# Hugging Face Hub details
hub_repo_name = ""  # target repo id ("<user>/<model-name>"); must be filled in before running

# Read the access token from the HF_TOKEN environment variable rather than
# hard-coding a credential in the notebook. When it is unset, None is passed and
# the Hub client falls back to whatever login is already configured.
access_token = os.environ.get("HF_TOKEN") or None

# Push model and tokenizer to Hugging Face Hub
# (`token=` replaces the deprecated `use_auth_token=` argument)
model.push_to_hub(hub_repo_name, token=access_token)
tokenizer.push_to_hub(hub_repo_name, token=access_token)



model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MoGP/recom_gpt_50_samples/commit/ca68764ef98e72d6a61a9d748d24192fb1b5fd7c', commit_message='Upload tokenizer', commit_description='', oid='ca68764ef98e72d6a61a9d748d24192fb1b5fd7c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MoGP/recom_gpt_50_samples', endpoint='https://huggingface.co', repo_type='model', repo_id='MoGP/recom_gpt_50_samples'), pr_revision=None, pr_num=None)

In [None]:
# Collect unreachable Python objects first so that any tensors they hold are
# returned to PyTorch's caching allocator, then release the cached blocks.
gc.collect()
torch.cuda.empty_cache()

# NOTE(review): the allocator configuration is presumably read when CUDA is first
# initialised, so setting it this late may only affect a fresh session — confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

### Load the model from hub


In [None]:
# Use the GPU when available (here `device` is a torch.device, not a string as in the earlier cell)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The tokenizer is loaded from the base "gpt2" checkpoint, not from the uploaded repo
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Hub repo id of the fine-tuned model; blank here and must be filled in before running
checkpoint = ""
# model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
model = GPT2LMHeadModel.from_pretrained(checkpoint).to(device)

print(device)

config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

cuda


### Recommend using last purchase


In [None]:
# Step 1: Sort the grouped data by customer_id and date
grouped_data = grouped_data.sort_values(by=['customer_id', 'date'])

# Step 2: Get the last purchase for each customer
# NOTE(review): GroupBy.last() picks the last non-null value per column, which
# equals the last row only when the columns contain no missing values — confirm.
last_purchases = grouped_data.groupby('customer_id').last().reset_index()

# Step 3 (disabled): retain only customer_id and product_id columns.
# The date and transaction_id columns are currently kept.
#last_purchases = last_purchases[['customer_id', 'product_id']]

# Display the result
print(last_purchases.head())

       customer_id       date       transaction_id  \
0  Household_70176 2023-12-28   Transaction_475209   
1  Household_70587 2023-12-25  Transaction_2691181   
2  Household_70623 2023-11-27  Transaction_2653565   
3  Household_70635 2023-12-29  Transaction_2438728   
4  Household_70639 2023-12-29   Transaction_244868   

                                          product_id  
0  [Product_77092, Product_50010, Product_20885, ...  
1  [Product_42233, Product_7633, Product_3443, Pr...  
2  [Product_37251, Product_52185, Product_37251, ...  
3  [Product_17102, Product_53104, Product_17102, ...  
4  [Product_67343, Product_69911, Product_29076, ...  


In [None]:
import re

def extract_product_names(output_text):
    """Return every `Product_<digits>` id found in `output_text`.

    Matches are returned in order of appearance and duplicates are kept.
    """
    return [found.group(0) for found in re.finditer(r"Product_\d+", output_text)]


In [None]:
def recommend_products(model, customer_id):
    """Generate up to 10 next-purchase product recommendations for one customer.

    The prompt is built from the customer's row in the notebook-level
    `last_purchases` frame (its `product_id` list and `date`) and uses the same
    wording as the fine-tuning examples, so the model is asked to continue a
    text it was actually trained on. Only the newly generated tokens are parsed
    for product ids.

    Args:
        model: Causal language model exposing `generate` and `config`.
        customer_id: Customer whose last basket is used as context.

    Returns:
        A list of at most 10 distinct `Product_<digits>` ids in generation
        order; an empty list when the customer has no row in `last_purchases`
        or the model produced no product id.
    """
    max_new_tokens = 100
    num_recommendations = 10

    customer_rows = last_purchases[last_purchases['customer_id'] == customer_id]
    if customer_rows.empty:
        # Unknown customer: nothing to condition on
        return []
    last_row = customer_rows.iloc[0]

    # Same template as the training prompt, stopping right where the list of
    # next products is expected to begin.
    input_text = (
        f"Customer {customer_id} purchased: {', '.join(map(str, last_row['product_id']))} "
        f"on {last_row['date']}. These are the products they will buy next:\n"
    )

    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Leave room for the generated tokens inside the model's context window.
    # Over-long prompts are cut from the left so the closing cue is preserved.
    max_prompt_tokens = getattr(model.config, "n_positions", 1024) - max_new_tokens
    input_ids = input_ids[:, -max_prompt_tokens:].to(device)
    # Single, unpadded sequence: every position is a real token
    attention_mask = torch.ones_like(input_ids)

    # Greedy decoding; sampling parameters are omitted because they do not apply
    # when do_sample=False.
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
    )

    # Decode only the continuation so product ids from the prompt are never
    # mistaken for recommendations.
    generated_tokens = output[0][input_ids.shape[1]:]
    decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Extract product ids, drop repeats while keeping order, then cap the list
    recommendations = extract_product_names(decoded_output)
    return list(dict.fromkeys(recommendations))[:num_recommendations]


In [None]:
def hitrate_at_k(recommendations: list, actual_purchases: list, k: int = 10) -> float:
    """
    Calculate the hitrate@k for a single customer's recommendations.

    Both inputs are treated as sets of products: repeated recommendations are
    collapsed (keeping first-seen order) before the top-k cut, and repeated
    purchases count once, so a recommendation list covering every distinct
    purchased product scores 1.0.

    Args:
        recommendations: List of recommended product IDs, best first.
        actual_purchases: List of actual product IDs purchased by the customer.
        k: Number of recommendations to consider (default 10).

    Returns:
        hits / min(N, k), where N is the number of distinct purchased products;
        0.0 when that denominator is not positive.
    """
    # Ensure recommendations are unique and limited to top-k
    top_k = list(dict.fromkeys(recommendations))[:k]

    # Distinct purchases: avoids inflating the denominator with repeats and
    # makes each membership check constant-time.
    purchased = set(actual_purchases)

    # Compute denominator as min(N, k)
    denominator = min(len(purchased), k)

    # Avoid division by zero (no purchases, or a non-positive k)
    if denominator <= 0:
        return 0.0

    hits = sum(1 for rec in top_k if rec in purchased)
    return hits / denominator


In [None]:
# Per-customer hit rates, collected in a list so they can be averaged afterwards
hit_rates = []

for customer_id in selected_customers:
    # Generate recommendations using the fine-tuned model
    recommendations = recommend_products(model, customer_id=customer_id)

    # All of this customer's purchases in the test set (every transaction, not only the next basket)
    actual_purchases = filtered_test_data[filtered_test_data['customer_id'] == customer_id]['product_id'].tolist()

    # Show what was recommended next to what was actually bought
    print(f"Recommendations for customer {customer_id}: {recommendations}")
    print(f"Actual purchases of customer {customer_id}: {actual_purchases}")

    # Compute hitrates
    hit_rate = hitrate_at_k(recommendations, actual_purchases, k=10)
    print(f"Hit Rate@10 for customer {customer_id}: {hit_rate:.2%}\n")
    hit_rates.append(hit_rate)

Recommendations for customer Household_73432: []
Actual purchases of customer Household_73432: ['Product_3010', 'Product_42815', 'Product_11982', 'Product_56083', 'Product_44153', 'Product_41865', 'Product_42748', 'Product_82444']
Hit Rate@10 for customer Household_73432: 0.00%

Recommendations for customer Household_78366: []
Actual purchases of customer Household_78366: ['Product_21039', 'Product_18913', 'Product_22099', 'Product_17016', 'Product_75270', 'Product_53553', 'Product_66427', 'Product_61011', 'Product_78558', 'Product_63570', 'Product_1911', 'Product_43639', 'Product_17879', 'Product_24230', 'Product_8420', 'Product_25311']
Hit Rate@10 for customer Household_78366: 0.00%

Recommendations for customer Household_72892: []
Actual purchases of customer Household_72892: ['Product_35646', 'Product_12052', 'Product_8872', 'Product_13467', 'Product_61011', 'Product_46089', 'Product_54216', 'Product_40480', 'Product_69027', 'Product_36725', 'Product_9428', 'Product_56031']
Hit Rat

In [None]:
# Compute overall hit rate as the mean of the per-customer hit rates.
# Falls back to 0.0 when no customer was evaluated, instead of dividing by zero.
overall_hit_rate = sum(hit_rates) / len(hit_rates) if hit_rates else 0.0
print(f"Overall Hit Rate@10: {overall_hit_rate:.2%}")

Overall Hit Rate@10: 0.00%


## Actual Process

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# This rebinds `device` to a plain string for the cells below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", device)

In [None]:
# # Hugging Face Hub details
# hub_repo_name = ""
# access_token = ""

# # Prepare Data for Fine-Tuning GPT with tqdm progress bar
# class PurchaseDataset(Dataset):
#     def __init__(self, df, tokenizer, max_length=512):
#         self.inputs = []
#         for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing Dataset"):
#             prompt = f"Customer {row['customer_id']} made a purchase on {row['date']} for product {row['product_id']} with features:\n" \
#                      f"- Quantity: {row['quantity']}, EcoScore: {row['ecoscore']}, Shelf Level 1: {row['shelf_level1']}, Shelf Level 2: {row['shelf_level2']}, " \
#                      f"- Department: {row['department_key']}, Sector: {row['sector']}, Brand: {row['brand_key']}, (Carrefour Brand: {row['carrefour_brand']}), " \
#                      f"- National Brand: {row['national_brand']}, and Class: {row['class_key']}.\n"

#             self.inputs.append(tokenizer(prompt, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"))

#     def __len__(self):
#         return len(self.inputs)

#     def __getitem__(self, idx):
#         return self.inputs[idx]

# # Initialize GPT tokenizer and model
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token
# model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# # For Further tuning:
# #model = GPT2LMHeadModel.from_pretrained(hub_repo_name).to(device)

# # Disable Weights & Biases
# os.environ["WANDB_DISABLED"] = "true"

# def fine_tune_gpt(data, output_dir):
#     dataset = PurchaseDataset(data, tokenizer)
#     training_args = TrainingArguments(
#         output_dir=output_dir,
#         per_device_train_batch_size=4,
#         num_train_epochs=1,
#         logging_strategy="no",  # Completely disable logging
#         save_strategy="no",  # Avoid intermediate saves
#         report_to=[],  # Disable all loggers like wandb and tensorboard
#         log_level="error"  # Suppress detailed logs
#     )
#     trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
#     trainer.train()

#     # Save locally
#     model.save_pretrained(output_dir)
#     tokenizer.save_pretrained(output_dir)

#     # Push model and tokenizer to Hugging Face Hub
#     model.push_to_hub(hub_repo_name, use_auth_token=access_token)
#     tokenizer.push_to_hub(hub_repo_name, use_auth_token=access_token)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Hugging Face Hub details. Both values are blank in this notebook and must be
# supplied before anything can be pushed.
# NOTE(review): avoid committing a real access token in this cell; prefer reading
# it from an environment variable or the Colab secrets store.
hub_repo_name = ""
access_token = ""

# Prepare Data for Fine-Tuning GPT with tqdm progress bar
class PurchaseDataset(IterableDataset):
    """Stream purchase rows as tokenized causal-language-model examples.

    Every row of ``df`` is rendered into a text prompt and tokenized lazily
    while iterating, so the tokenized corpus is never materialised in memory.

    Args:
        df: DataFrame with one purchase per row. It must provide every column
            referenced by the prompt template in ``__iter__``.
        tokenizer: Callable invoked as ``tokenizer(prompt, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")`` whose
            result exposes ``"input_ids"`` and ``"attention_mask"`` tensors
            with a leading batch dimension of 1.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __iter__(self):
        """Yield one ``{"input_ids", "attention_mask", "labels"}`` dict per row."""
        for _, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Preparing Dataset"):
            prompt = f"Customer {row['customer_id']} made a purchase on {row['date']} for product {row['product_id']} with features:\n" \
                     f"- Quantity: {row['quantity']}, EcoScore: {row['ecoscore']}, Shelf Level 1: {row['shelf_level1']}, Shelf Level 2: {row['shelf_level2']}, " \
                     f"- Department: {row['department_key']}, Sector: {row['sector']}, Brand: {row['brand_key']}, (Carrefour Brand: {row['carrefour_brand']}), " \
                     f"- National Brand: {row['national_brand']}, and Class: {row['class_key']}.\n"

            # Tokenize on-the-fly
            tokenized = self.tokenizer(prompt, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")

            # Drop only the batch dimension; a bare squeeze() would also drop
            # the sequence dimension when max_length == 1.
            input_ids = tokenized["input_ids"].squeeze(0)
            attention_mask = tokenized["attention_mask"].squeeze(0)

            # Labels are the inputs, except on padding: those positions are set
            # to -100 so the loss ignores them. The mask is taken from
            # attention_mask rather than the token id because the notebook
            # reuses the EOS token as the padding token.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            yield {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            }


# Turn off the Weights & Biases integration
os.environ["WANDB_DISABLED"] = "true"

# Load the pretrained "gpt2" tokenizer and reuse its end-of-sequence token as
# the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained "gpt2" language model and move it onto `device`.
# To continue training from a checkpoint stored on the Hub, load this instead:
#model = GPT2LMHeadModel.from_pretrained(hub_repo_name).to(device)
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(device)

def fine_tune_gpt(data, output_dir, batch_size=4):
    """Fine-tune the notebook-level ``model`` on ``data`` for one pass and save it.

    Uses the module-level ``model``, ``tokenizer``, ``hub_repo_name`` and
    ``access_token``.

    Args:
        data: DataFrame of purchases, wrapped in a ``PurchaseDataset``.
        output_dir: Directory used by the Trainer and for the saved model and
            tokenizer.
        batch_size: Per-device training batch size. Defaults to 4, the value
            that used to be hard-coded.

    Raises:
        ValueError: If ``data`` has no rows or ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(data) == 0:
        raise ValueError("data must contain at least one row to fine-tune on")

    dataset = PurchaseDataset(data, tokenizer)

    # PurchaseDataset is an IterableDataset without __len__, so the number of
    # optimisation steps has to be given explicitly. Round up so the final,
    # partially filled batch is still consumed and at least one step runs.
    # NOTE(review): this assumes a single training device — confirm if several
    # GPUs are used, since the batch size below is per device.
    max_steps = (len(data) + batch_size - 1) // batch_size  # 1 epoch

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        num_train_epochs=1,
        max_steps=max_steps,  # Specify max_steps for IterableDataset
        logging_strategy="no",  # No training logs
        save_strategy="no",  # No intermediate checkpoints
        report_to=[],  # No external loggers
        log_level="error"
    )

    trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
    trainer.train()

    # Save locally
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Push model and tokenizer to Hugging Face Hub. Without a repository name
    # the push cannot succeed, so skip it instead of failing after training.
    if hub_repo_name:
        model.push_to_hub(hub_repo_name, use_auth_token=access_token)
        tokenizer.push_to_hub(hub_repo_name, use_auth_token=access_token)
    else:
        print("hub_repo_name is empty; skipping push to the Hugging Face Hub.")


In [None]:
# Fine-tune GPT on the first section of the 2023 data; saving and uploading
# are handled inside fine_tune_gpt.
# NOTE(review): this comment used to say "2022 Data" while the call passes
# data_2023_sections[0] — confirm which year is intended.
fine_tune_gpt(data_2023_sections[0], output_dir="./gpt_recommend")

Preparing Dataset:   1%|▏         | 6600/477088 [17:43<20:59:37,  6.23it/s]

In [None]:
# Collect unreachable Python objects first so that any tensors they still hold
# are released, then return the now-unused cached blocks to the CUDA driver.
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): the CUDA caching allocator presumably reads this variable when
# it is first initialised, so setting it here may only take effect in a fresh
# kernel — consider moving it to the first cell, before any CUDA work.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

### Recommend without history

In [None]:
def recommend_products(model, customer_id):
    """Generate recommendations for a customer from a fixed instruction prompt.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Model exposing ``generate``; called with the encoded prompt.
        customer_id: Identifier interpolated into the prompt.

    Returns:
        list[str]: The generated continuation split on ``", "``, with
        surrounding whitespace stripped and empty items removed. The prompt
        itself is not part of the result.
    """
    input_text = f"Recommend products for customer {customer_id} based on their previous purchases."
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output = model.generate(
        input_ids,
        max_length=100,
        num_return_sequences=1,
        # Explicit padding id: avoids the per-call fallback warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens; keep only what the
    # model produced after them, otherwise the prompt text would be returned
    # as the first "recommendation".
    generated_ids = output[0][input_ids.shape[-1]:]
    recommendations = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return [item.strip() for item in recommendations.split(", ") if item.strip()]


### Recommend with history

In [None]:
# Use Fine-tuned Model for Predictions on Customers
# Is there a history?
# Dunno yet
def recommend_products(model, customer_id, history):
    """Print the model's continuation of a purchase-history prompt.

    Uses the module-level ``tokenizer`` and ``device``. This definition has the
    same name as the two-argument ``recommend_products`` defined in an earlier
    cell and replaces it once this cell has run.

    Args:
        model: Model exposing ``generate``; called with the encoded prompt.
        customer_id: Identifier interpolated into the prompt.
        history: Text describing what the customer bought, interpolated into
            the prompt as-is.

    Returns:
        None. The generated text is printed after the label
        ``Recommendations:``.
    """
    input_text = f"Customer {customer_id} has bought {history}\n"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output = model.generate(
        input_ids,
        max_length=100,
        num_return_sequences=1,
        # Explicit padding id: avoids the per-call fallback warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens; decode only the
    # newly generated part so the prompt is not printed as a recommendation.
    generated_ids = output[0][input_ids.shape[-1]:]
    recommendations = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    print("Recommendations:", recommendations)

# Smoke test of the history-based prompt with a made-up customer ID and a
# short comma-separated purchase history.
recommend_products(
    model,
    customer_id=12345,
    history="milk, bread, eggs",
)


# Create submission file

The goal of this part is to provide a function that encodes your predictions in a format that Kaggle can read when you submit them. In particular, the function checks that each customer has 10 distinct products and that the ranks are distinct integers from 1 to 10.

In [None]:
# Select the rows that go into the submission file: the top-10 recommendations
# of Household_80001 through Household_100000 (both included).
submission_households = [f"Household_{number}" for number in range(80001, 100001)]
is_submission_household = top_10_recommendations.customer_id.isin(submission_households)
prediction = top_10_recommendations[is_submission_household]

# Preview the first rows of the selection
prediction.head()

In [None]:
def process_and_format_prediction(df):
    """Turn per-product predictions into one validated row per customer.

    The input is left untouched; all cleaning happens on a copy.

    Args:
        df: DataFrame with at least the columns ``customer_id``,
            ``product_id`` and ``rank`` (one row per customer/product pair).
            Text identifiers such as ``"Household_80001"`` are reduced to
            their first run of digits. The sequence ``+AF8-`` is replaced by
            ``_`` in column names and in cell values.

    Returns:
        A DataFrame with columns ``id`` (0..n-1), ``customer_id`` (int),
        ``product_id`` and ``rank`` (comma-separated strings), or ``None``
        (after printing a message) when a customer's ranks are not exactly
        1..10 or its products are not all different.

    Raises:
        ValueError: If ``customer_id`` or ``product_id`` is missing.
    """
    # Placeholder given to identifiers that contain no digits; customers
    # carrying it are dropped from the result.
    # NOTE(review): a genuine customer whose ID is 11 is dropped as well —
    # kept as-is so the output stays identical to the previous behaviour.
    missing_id = 11
    expected_ranks = list(range(1, 11))

    def _is_text(column):
        # Covers both classic object columns and pandas string dtypes.
        return pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)

    # Replace invalid characters in column names and in the values. The copy
    # keeps the caller's DataFrame (including its column labels) unchanged.
    df = df.copy()
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Fail fast, before any of the required columns is used.
    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Clean the 'customer_id', 'product_id' and 'transaction_id' columns.
    for id_column in ('customer_id', 'product_id'):
        if _is_text(df[id_column]):
            first_digits = df[id_column].str.extract(r'(\d+)', expand=False)
            df[id_column] = first_digits.fillna(str(missing_id)).astype(int)
    if 'transaction_id' in df.columns and _is_text(df['transaction_id']):
        digits_only = df['transaction_id'].str.replace(r'\D', '', regex=True)
        # A value without any digit becomes '' here; treat it like a missing
        # value instead of letting the integer conversion fail.
        digits_only = digits_only.mask(digits_only == '')
        df['transaction_id'] = digits_only.fillna(str(missing_id)).astype(int)

    # Group by customer_id and concatenate the products and the ranks.
    prediction_grouped = (
        df.groupby('customer_id')
        .agg({
            'product_id': lambda x: ','.join(map(str, x)),
            'rank': lambda x: ','.join(map(str, x)),
        })
        .reset_index()
    )

    # Drop customers whose ID could not be parsed, then number the rows.
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != missing_id].copy()
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Check ranks and duplicated products, customer by customer.
    for rank_csv, product_csv in zip(prediction_grouped['rank'], prediction_grouped['product_id']):
        ranks = [int(value) for value in rank_csv.split(',')]
        if sorted(ranks) != expected_ranks:  # ranks must be exactly 1..10
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = product_csv.split(',')
        if len(products) != len(set(products)):  # duplicated product
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped
# Build the submission table from the filtered predictions and show the result
# (None is shown when the validation inside the function failed).
prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


In [None]:
# process_and_format_prediction returns None when its checks fail; stop with a
# clear message instead of an AttributeError on None.
if prediction_grouped is None:
    raise ValueError("prediction_grouped is None: the prediction did not pass validation, nothing to export")

# to_csv does not create missing folders, so make sure the target one exists.
os.makedirs('submission', exist_ok=True)
prediction_grouped.to_csv('submission/submission_list.csv', index=False)