In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Read the ten training shards (parts 1 to 10) and stack them into one frame.
train_dataframes = [
    pd.read_csv(f'/kaggle/input/product-re-purchase-prediction/data-train/data-train/train_data_part_{i}.csv')
    for i in tqdm(range(1, 11))
]
train_data = pd.concat(train_dataframes, ignore_index=True)

# The individual shards are no longer needed once concatenated.
del train_dataframes

# Product catalogue and test set live in the same input directory.
products_data = pd.read_csv(
    '/kaggle/input/product-re-purchase-prediction/data-train/data-train/products_data.csv',
    low_memory=False,
)
test_data = pd.read_csv(
    '/kaggle/input/product-re-purchase-prediction/data-train/data-train/test_data.csv'
)

In [None]:
# Hitrate@10 evaluation function
def hitrate_at_k(true_data: pd.DataFrame,
                 predicted_data: pd.DataFrame,
                 k: int = 10) -> float:
    """
    Compute the mean per-customer hit rate at k.

    For every customer present in ``true_data`` the score is

        (# distinct purchased products recommended with rank <= k)
        / min(# distinct purchased products, k)

    and the function returns the average of that score over customers.
    Customers with no hit contribute 0.

    Args:
        true_data: a pandas DataFrame containing the true data
            customer_id: the customer identifier
            product_id: the product identifier that was purchased in the test set
        predicted_data: a pandas DataFrame containing the predicted data
            customer_id: the customer identifier
            product_id: the product identifier that was recommended
            rank: the rank of the recommendation. the rank should be between 1 and 10.
        k: the number of recommendations to consider. k should be between 1 and 10.

    Returns:
        The hitrate at k
    """
    # Left join keeps every purchased (customer, product) row; "rank" is NaN
    # for purchases that were never recommended.
    merged = pd.merge(left=true_data, right=predicted_data, how="left",
                      on=["customer_id", "product_id"])

    # Distinct products each customer actually bought.
    purchased_per_customer = merged.groupby("customer_id")["product_id"].nunique()

    # Distinct purchased products that appear in the top-k recommendations.
    # Counting distinct products (instead of merged rows) keeps the numerator
    # consistent with the denominator when either input holds repeated
    # (customer, product) rows. NaN <= k is False, so misses drop out here.
    hits_per_customer = (
        merged[merged["rank"] <= k]
        .groupby("customer_id")["product_id"]
        .nunique()
        .reindex(purchased_per_customer.index, fill_value=0)
    )

    # A customer cannot score more than k hits, so cap the denominator at k.
    denominator = purchased_per_customer.clip(upper=k)

    return (hits_per_customer / denominator).mean()

In [None]:
# Parse the purchase dates, then express the age of every transaction in whole
# days relative to the newest transaction in the training data.
train_data['date'] = pd.to_datetime(train_data['date'])
latest_date = train_data['date'].max()
train_data['recency'] = (latest_date - train_data['date']).dt.days

# One row per (customer, product): total quantity bought and the smallest
# recency, i.e. the number of days since the most recent purchase.
customer_product_data = train_data.groupby(
    ['customer_id', 'product_id'], as_index=False
).agg({'quantity': 'sum', 'recency': 'min'})

# Scale quantity to [0, 1] by its maximum; invert recency so that a more
# recent purchase yields a higher score.
customer_product_data['quantity_score'] = customer_product_data['quantity'].div(
    customer_product_data['quantity'].max()
)
customer_product_data['recency_score'] = 1 - customer_product_data['recency'].div(
    customer_product_data['recency'].max()
)

# Purchase frequency: number of non-null transaction ids per (customer, product).
frequency = (
    train_data.groupby(['customer_id', 'product_id'])['transaction_id']
    .count()
    .rename('frequency')
    .reset_index()
)
customer_product_data = pd.merge(
    customer_product_data, frequency, how='left', on=['customer_id', 'product_id']
)

# Pairs without a frequency after the left join are treated as 0.
customer_product_data['frequency'] = customer_product_data['frequency'].fillna(0)
customer_product_data['frequency_score'] = customer_product_data['frequency'].div(
    customer_product_data['frequency'].max()
)

# Hand-picked set of products flagged as popular.
popular_items = {
    'Product_23971',
    'Product_28633',
    'Product_39751',
    'Product_20421',
    'Product_63301',
    'Product_57942',
}

# 1 when the product belongs to the popular set, 0 otherwise.
customer_product_data['is_popular'] = (
    customer_product_data['product_id'].isin(popular_items).astype(int)
)

# Weights of the score components: quantity, frequency, recency, popularity bonus.
Best_alpha = 0.03
Best_beta = 0.87
Best_gamma = 0.1
Best_leverage = 0.0019395677472984205

# Weighted sum of the normalized components plus the bonus for popular items.
customer_product_data['final_score'] = (
    Best_alpha * customer_product_data['quantity_score']
    + Best_beta * customer_product_data['frequency_score']
    + Best_gamma * customer_product_data['recency_score']
    + Best_leverage * customer_product_data['is_popular']
)

# Per-customer rank starting at 1: highest final_score first, ties broken by
# larger quantity and then by product_id, so every rank is unique. The ranks
# are written back by index alignment, so the frame itself is not reordered.
customer_product_data['rank'] = (
    customer_product_data
    .sort_values(
        by=['customer_id', 'final_score', 'quantity', 'product_id'],
        ascending=[True, False, False, True],
    )
    .groupby('customer_id')
    .cumcount()
    .add(1)
)

# Keep at most the ten best-ranked products of each customer ...
top_10_per_customer = customer_product_data.loc[customer_product_data['rank'].le(10)]

# ... ordered by customer and then by rank.
top_10_recommendations = top_10_per_customer.sort_values(by=['customer_id', 'rank'])

In [None]:
# Build the prediction frame for the submission: keep only the recommendations
# of households Household_80001 to Household_100000 (inclusive).
prediction = top_10_recommendations.loc[
    top_10_recommendations['customer_id'].isin(
        [f"Household_{i}" for i in range(80001, 100001)]
    )
]

# Preview the first rows of the selection
prediction.head()

In [None]:
def process_and_format_prediction(df):
    """
    Turn long-format recommendations into one submission row per customer.

    String identifiers are reduced to their first run of digits
    (e.g. "Household_80001" -> 80001). Rows whose ``customer_id`` contains no
    digits are dropped. The input frame is left untouched.

    Args:
        df: DataFrame with one row per recommended product and at least the
            columns ``customer_id``, ``product_id`` and ``rank``.

    Returns:
        A DataFrame with the columns ``id`` (0..n-1), ``customer_id``,
        ``product_id`` and ``rank``, where the last two are comma-separated
        strings listing the customer's products and ranks in input row order.
        Returns None (after printing a message) when a customer's ranks are
        not exactly 1..10 or when a customer has a repeated product.

    Raises:
        ValueError: if ``customer_id`` or ``product_id`` is missing.
    """
    # Work on a copy so the caller's DataFrame (including its column labels)
    # is never modified.
    df = df.copy()

    # Repair the '+AF8-' sequence (a mis-encoded underscore) in column names
    # and in cell values.
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("df must contain 'customer_id' and 'product_id' columns")

    # customer_id: keep the numeric part. Rows without any digit are removed
    # explicitly instead of being tagged with a sentinel id, so a genuine
    # customer whose number equals the old sentinel (11) is no longer lost.
    if df['customer_id'].dtype == 'object':
        customer_digits = df['customer_id'].str.extract(r'(\d+)', expand=False)
        parsable = customer_digits.notna()
        # Positional (numpy) assignment avoids any index-alignment surprises.
        df = df.loc[parsable.to_numpy()].copy()
        df['customer_id'] = customer_digits[parsable].astype(int).to_numpy()

    # product_id / transaction_id: unchanged legacy behavior, missing values
    # fall back to 11.
    if df['product_id'].dtype == 'object':
        df['product_id'] = (
            df['product_id'].str.extract(r'(\d+)', expand=False).fillna(11).astype(int)
        )
    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = (
            df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)
        )

    # One row per customer; products and ranks become comma-separated strings
    # that stay aligned with each other because both follow the row order.
    prediction_grouped = df.groupby('customer_id').agg({
        'product_id': lambda x: ','.join(map(str, x)),
        'rank': lambda x: ','.join(map(str, x)),
    }).reset_index()
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Validate every customer: ranks must be exactly 1..10 and the products
    # must all be different.
    expected_ranks = list(range(1, 11))
    for rank_csv, product_csv in zip(prediction_grouped['rank'],
                                     prediction_grouped['product_id']):
        ranks = sorted(int(r) for r in rank_csv.split(','))
        if ranks != expected_ranks:
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = product_csv.split(',')
        if len(products) != len(set(products)):
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped
# Collapse the per-product recommendations into one row per customer and
# echo the result (None is printed when the formatter rejected the input).
prediction_grouped = process_and_format_prediction(df=prediction)
print(prediction_grouped)

In [None]:
# Write the submission as a CSV file (without the index column) for upload to Kaggle.
# Original note: meant to be run locally on your own computer; the path below
# points at the Kaggle working directory.
prediction_grouped.to_csv(
    path_or_buf='/kaggle/working/submission_list.csv',
    index=False,
)