In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Predict Next Purchase


In [None]:
def predict_next_purchases(model, transaction_history, customer_ids, arr_search=None):
    """Build a ranked product-recommendation table for a list of customers.

    Parameters
    ----------
    model
        Object exposing ``predict(transaction_history)``. It must return one
        embedding per customer (the original author's note gives the shape as
        ``(n, 384)`` -- TODO confirm against the actual model).
    transaction_history
        Model input, handed to ``model.predict`` unchanged.
    customer_ids
        Customer identifiers, in the same order as the predicted embeddings.
        The two sequences are paired with ``zip``, so surplus entries on
        either side are ignored.
    arr_search
        Search structure passed as first argument to
        ``get_product_ids_for_embeddings``. When omitted, the notebook-level
        variable ``arr`` is used instead.

    Returns
    -------
    pandas.DataFrame
        One row per (customer, recommended product) with the columns
        ``customer_id``, ``product_id`` and ``rank`` (rank starts at 1 and
        follows the order returned by the search function). The columns are
        present even when there is nothing to recommend.
    """
    # NOTE(review): the fallback on the global `arr` keeps the existing
    # three-argument call working; prefer passing `arr_search` explicitly.
    search_index = arr if arr_search is None else arr_search

    predicted_embeddings = model.predict(transaction_history)

    recommendations = []
    for customer_id, embedding in zip(customer_ids, predicted_embeddings):
        # Anna's search function receives the predicted embedding and returns
        # the matching products, best match first.
        top_products = get_product_ids_for_embeddings(search_index, embedding)
        for rank, product_id in enumerate(top_products, start=1):
            recommendations.append({
                'customer_id': customer_id,
                'product_id': product_id,
                'rank': rank
            })

    # Explicit columns keep the schema stable when `recommendations` is empty.
    return pd.DataFrame(recommendations, columns=['customer_id', 'product_id', 'rank'])

# Hit Rate
Write a function that evaluates the top-10 recommendations with the average hit rate at 10.

In [None]:
def average_hitrate_at_10(
    recommendations_list: list,
    actual_purchases_list: list,
    k: int = 10,
) -> float:
    """Average hit rate at ``k`` over all customers.

    For each customer the recommendations are de-duplicated (first occurrence
    wins) and cut to the first ``k`` entries. The customer's score is the
    number of those entries found in the actual purchases, divided by
    ``min(len(actual_purchases), k)``. Customers without any purchase score 0.

    Parameters
    ----------
    recommendations_list
        One ordered list of recommended products per customer.
    actual_purchases_list
        One list of purchased products per customer, aligned by position with
        ``recommendations_list``. Duplicates are counted in the denominator.
    k
        Cut-off applied to both the recommendations and the denominator.
        Defaults to 10, the value the metric is named after.

    Returns
    -------
    float
        Sum of the per-customer scores divided by
        ``len(recommendations_list)``; ``0.0`` when that list is empty.

    Notes
    -----
    The two lists are paired with ``zip``: if ``actual_purchases_list`` is
    shorter, the unmatched customers contribute 0 to the average.
    """
    num_customers = len(recommendations_list)
    if num_customers == 0:
        return 0.0

    total_hitrate = 0.0
    for recommendations, actual_purchases in zip(recommendations_list, actual_purchases_list):
        denominator = min(len(actual_purchases), k)
        if denominator <= 0:
            # Nothing purchased (or non-positive k): the customer scores 0.
            continue

        # dict.fromkeys removes duplicates while preserving the ranking order.
        top_k = list(dict.fromkeys(recommendations))[:k]
        purchased = set(actual_purchases)  # constant-time membership checks
        hits = sum(1 for rec in top_k if rec in purchased)
        total_hitrate += hits / denominator

    return total_hitrate / num_customers

In [None]:
# Sanity check: the per-customer hit rates are 7/10, 7/7 and 6/10,
# so the printed average should be about 0.7667.
recommendations = [
    [
        "Product_25427", "Product_78280", "Product_38437", "Product_58122", "Product_45349",
        "Product_42347", "Product_47295", "Product_50007", "Product_37070", "Product_43475",
    ],
    [
        "Product_25427", "Product_78280", "Product_38437", "Product_58122", "Product_45349",
        "Product_42347", "Product_47295", "Product_50007", "Product_37070", "Product_43475",
    ],
    [
        "Product_25427", "Product_78280", "Product_38437", "Product_58123", "Product_45349",
        "Product_42347", "Product_47295", "Product_50007", "Product_37070", "Product_43475",
    ],
]

actual_purchases = [
    [
        "Product_23849", "Product_5776", "Product_64197", "Product_58122", "Product_45349",
        "Product_42347", "Product_47295", "Product_50007", "Product_37070", "Product_43475",
    ],
    [
        "Product_58122", "Product_45349", "Product_42347", "Product_47295", "Product_50007",
        "Product_37070", "Product_43475",
    ],
    [
        "Product_23849", "Product_5776", "Product_64197", "Product_50007", "Product_37070",
        "Product_58122", "Product_45349", "Product_42347", "Product_47295", "Product_50007",
        "Product_37070", "Product_43475",
    ],
]
print(average_hitrate_at_10(recommendations, actual_purchases))

0.7666666666666666


# Create submission file

In [None]:
# Rank products for every customer known to the model
top_10_recommendations = predict_next_purchases(model, transaction_history, customer_ids)

# The submission only covers Household_80001 through Household_100000 (inclusive)
prediction = top_10_recommendations[
    top_10_recommendations["customer_id"].isin(
        list(map("Household_{}".format, range(80001, 100001)))
    )
]

# Preview the first rows of the selection
prediction.head()

In [None]:
def _is_text_column(column):
    """Tell whether ``column`` holds text (object dtype or a pandas string dtype)."""
    return pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)


def _digits_to_int(digit_strings, missing_id):
    """Turn digit strings into ints; blanks and missing values become ``missing_id``."""
    return pd.to_numeric(digit_strings, errors='coerce').fillna(missing_id).astype(int)


def _join_as_csv(values):
    """Join the values of one group into a single comma-separated string."""
    return ','.join(map(str, values))


def process_and_format_prediction(df):
    """Clean a long-format prediction table and collapse it to one row per customer.

    Steps:
      1. Replace the literal ``+AF8-`` by ``_`` in column names and in cell
         values.
      2. Reduce textual ``customer_id`` / ``product_id`` values to their first
         run of digits and textual ``transaction_id`` values to all of their
         digits, as integers. Values without digits become 11.
      3. Group by ``customer_id`` and join each customer's ``product_id`` and
         ``rank`` values into comma-separated strings.
      4. Drop the rows whose ``customer_id`` equals 11 and add a running ``id``
         column (0, 1, 2, ...) in first position.
      5. Check that every customer has exactly the ranks 1..10 and no repeated
         product.

    The input frame is left untouched; all work happens on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``customer_id``, ``product_id`` and ``rank`` columns
        (after the ``+AF8-`` replacement).

    Returns
    -------
    pandas.DataFrame or None
        Frame with the columns ``id``, ``customer_id``, ``product_id`` and
        ``rank``, sorted by ``customer_id``. ``None`` is returned, after
        printing a message, when the rank or duplicate-product check fails.

    Raises
    ------
    ValueError
        If ``customer_id`` or ``product_id`` is missing.
    """
    # Placeholder given to identifiers that contain no digits.
    # NOTE(review): rows with this customer_id are removed further down, so a
    # genuine customer whose number is 11 is dropped as well -- confirm intended.
    missing_id = 11

    # `replace` returns a new frame, so renaming its columns afterwards does
    # not modify the DataFrame owned by the caller.
    cleaned = df.replace(r'\+AF8-', '_', regex=True)
    cleaned.columns = cleaned.columns.str.replace('+AF8-', '_', regex=False)

    # customer_id / product_id: keep the first run of digits only.
    for id_column in ('customer_id', 'product_id'):
        if id_column in cleaned.columns and _is_text_column(cleaned[id_column]):
            first_digits = cleaned[id_column].str.extract(r'(\d+)', expand=False)
            cleaned[id_column] = _digits_to_int(first_digits, missing_id)

    # transaction_id: strip every non-digit character and keep what remains.
    if 'transaction_id' in cleaned.columns and _is_text_column(cleaned['transaction_id']):
        all_digits = cleaned['transaction_id'].str.replace(r'\D', '', regex=True)
        cleaned['transaction_id'] = _digits_to_int(all_digits, missing_id)

    if 'customer_id' not in cleaned.columns or 'product_id' not in cleaned.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # One row per customer; products and ranks become comma-separated strings.
    prediction_grouped = (
        cleaned.groupby('customer_id')
        .agg({'product_id': _join_as_csv, 'rank': _join_as_csv})
        .reset_index()
    )

    # Remove the placeholder customer, then number the remaining rows.
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != missing_id]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))

    # Validate ranks and product uniqueness for every customer.
    expected_ranks = list(range(1, 11))
    for rank_csv, product_csv in zip(prediction_grouped['rank'], prediction_grouped['product_id']):
        ranks = [int(rank) for rank in rank_csv.split(',')]
        if sorted(ranks) != expected_ranks:  # ranks must be exactly 1..10
            print("Doublon détecté. Les rangs doivent être distincts (de 1 à 10) pour chacun des 10 produits prédits pour un client.\n")
            return None
        products = product_csv.split(',')
        if len(products) != len(set(products)):  # a product appears twice
            print("Doublon détecté. Il doit y avoir 10 produits différents par client.\n")
            return None

    return prediction_grouped
# Collapse the filtered predictions to one row per customer and show the result
prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)

In [None]:
# Create a .csv file to submit on Kaggle.
# to_csv does not create missing folders, so make sure the target one exists.
submission_path = Path('submission/submission_list.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
prediction_grouped.to_csv(submission_path, index=False)