In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import random
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Step 1: Load Data

In [3]:
train_dataframes = []
for i in tqdm(range(1, 11)): ## 1 - 11
    train_dataframes.append(pd.read_csv(f'gdrive/My Drive/MLDM - Carrefour Project/data-train/train_data_part_{i}.csv'))
train_data = pd.concat(train_dataframes, ignore_index=True)

100%|██████████| 10/10 [02:37<00:00, 15.73s/it]


In [4]:
product_ids = np.load('gdrive/My Drive/MLDM - Carrefour Project/data-train/product_ids.npy', allow_pickle=True)
product_embeddings = np.load('gdrive/My Drive/MLDM - Carrefour Project/data-train/product_embeddings.npy', allow_pickle=True)

In [5]:
# Step 2: Prepare Product Embeddings
# Ensure embeddings are 2D
if product_embeddings.ndim == 3:
    product_embeddings = product_embeddings.reshape(product_embeddings.shape[0], -1)

# Create a mapping of product_id to embedding
product_embedding_dict = {product_id: embedding for product_id, embedding in zip(product_ids, product_embeddings)}

# Step 3: Build Historical Co-Occurrence Matrix
# Group by transaction_id to get co-purchased products
def build_cooccurrence_matrix(data):
    product_transactions = data.groupby('transaction_id')['product_id'].apply(list)
    cooccurrence = {}

    for products in product_transactions:
        for i, product in enumerate(products):
            if product not in cooccurrence:
                cooccurrence[product] = {}
            for co_product in products:
                if product != co_product:
                    cooccurrence[product][co_product] = cooccurrence[product].get(co_product, 0) + 1

    return cooccurrence

cooccurrence_matrix = build_cooccurrence_matrix(train_data)

# Step 4: Combine Embedding Similarity with Historical Data
# Compute cosine similarity for product embeddings
similarity_matrix = cosine_similarity(product_embeddings)
product_index_map = {product_id: idx for idx, product_id in enumerate(product_ids)}

def get_combined_recommendations(product_id, n=2):
    """Get recommendations by combining embedding similarity and historical co-purchase data."""
    if product_id not in cooccurrence_matrix:
        return []

    # Historical co-occurrence scores
    co_products = cooccurrence_matrix[product_id]

    # Embedding similarity scores
    if product_id in product_index_map:
        product_idx = product_index_map[product_id]
        embedding_scores = {
            other_product: similarity_matrix[product_idx][product_index_map[other_product]]
            for other_product in co_products.keys()
            if other_product in product_index_map
        }
    else:
        embedding_scores = {}

    # Combine scores
    combined_scores = {
        product: co_products[product] + embedding_scores.get(product, 0)
        for product in co_products
    }

    # Sort by combined score
    sorted_recommendations = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return [product for product, _ in sorted_recommendations[:n]]

# Step 5: Generate Recommendations for All Products
all_recommendations = {}
for idx, product_id in enumerate(product_ids):
    all_recommendations[product_id] = get_combined_recommendations(product_id, n=2)
    if (idx + 1) % 10 == 0 or (idx + 1) == len(product_ids):
        print(f"Processed {idx + 1}/{len(product_ids)} products...")



[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
Processed 32980/82966 products...
Processed 32990/82966 products...
Processed 33000/82966 products...
Processed 33010/82966 products...
Processed 33020/82966 products...
Processed 33030/82966 products...
Processed 33040/82966 products...
Processed 33050/82966 products...
Processed 33060/82966 products...
Processed 33070/82966 products...
Processed 33080/82966 products...
Processed 33090/82966 products...
Processed 33100/82966 products...
Processed 33110/82966 products...
Processed 33120/82966 products...
Processed 33130/82966 products...
Processed 33140/82966 products...
Processed 33150/82966 products...
Processed 33160/82966 products...
Processed 33170/82966 products...
Processed 33180/82966 products...
Processed 33190/82966 products...
Processed 33200/82966 products...
Processed 33210/82966 products...
Processed 33220/82966 products...
Processed 33230/82966 products...
Processed 33240/82966 products...
Process

In [7]:
# Save Recommendations to File
recommendations_file = 'gdrive/My Drive/MLDM - Carrefour Project/data-train/product_recommendations_combined.npy'
np.save(recommendations_file, all_recommendations)

# Step 6: Load Recommendations from File
def load_recommendations():
    """Load recommendations from the saved file."""
    return np.load(recommendations_file, allow_pickle=True).item()

recommendations_data = load_recommendations()

# Step 7: Recommend Products from Loaded File
def recommend_from_loaded_file(product_id):
    """Get recommendations for a product ID from the preloaded recommendations."""
    try:
        return recommendations_data[product_id]
    except KeyError:
        return f"Product ID {product_id} not found in recommendations."

# Example Usage
example_product_id = "Product_33508"
print(f"Recommendations for {example_product_id}: {recommend_from_loaded_file(example_product_id)}")

Recommendations for Product_33508: ['Product_1370', 'Product_42748']
