In [15]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle

In [2]:
# Replace 'your_product_file.csv' and 'your_order_file.csv' with your actual file names or DataFrame variables
products_df = pd.read_csv('~/Downloads/model/products.csv')
orders_df = pd.read_csv('~/Downloads/model/order_products__train.csv')

In [17]:
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


In [4]:
orders_df=orders_df.groupby('order_id')['product_id'].agg(list).reset_index()

In [5]:
# Creating a list of tuples that contain (order_id, product_id)
interaction_data = []
for index, row in orders_df.iterrows():
    for product_id in row['product_id']:
        interaction_data.append((row['order_id'], product_id))

# Convert the list of tuples into a DataFrame
interaction_df = pd.DataFrame(interaction_data, columns=['order_id', 'product_id'])

# Map order and product to integer indices
order_indices = {order_id: i for i, order_id in enumerate(orders_df['order_id'].unique())}
product_indices = {product_id: i for i, product_id in enumerate(products_df['product_id'].unique())}

# Map the order and product IDs to the corresponding indices
interaction_df['order_id'] = interaction_df['order_id'].map(order_indices)
interaction_df['product_id'] = interaction_df['product_id'].map(product_indices)

# Create a sparse matrix
interaction_sparse = csr_matrix((np.ones(interaction_df.shape[0]), 
                                 (interaction_df['order_id'], interaction_df['product_id'])), 
                                 shape=(len(order_indices), len(product_indices)))

In [6]:
cosine_sim_sparse = cosine_similarity(interaction_sparse, dense_output=False)

In [7]:
del interaction_df,interaction_sparse,orders_df,interaction_data,order_indices

In [8]:
# Create a DataFrame for the cosine similarity matrix
# cosine_sim_df = pd.DataFrame(cosine_sim, index=interaction_matrix.columns, columns=interaction_matrix.columns)

In [9]:
# Choose K for the KNN model
K = 10

# Initialize the NearestNeighbors model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=K, n_jobs=-1)

# Fit the model on the cosine similarity matrix
model_knn.fit(cosine_sim_sparse)

In [12]:
def recommend_products(product_id, data, model, n_recommendations, product_indices, products_df):
    # Check if the product_id exists in the product_indices mapping
    if product_id not in product_indices:
        return f"Product ID {product_id} not found in product_indices."

    # Find the index of the product
    product_idx = product_indices[product_id]

    # Find the K-nearest neighbors of the product
    distances, indices = model.kneighbors(data[product_idx], n_neighbors=n_recommendations + 1)

    # Reverse the product indices mapping
    index_product = {v: k for k, v in product_indices.items()}

    # Get product recommendations
    recommendations = []
    for i in range(1, len(distances.flatten())):
        idx = indices.flatten()[i]
        # Check if the index is in the reverse mapping
        if idx in index_product:
            product_id = index_product[idx]
            # Check if the product_id is in the products_df
            if product_id in products_df['product_id'].values:
                product_name = products_df[products_df['product_id'] == product_id]['product_name'].iloc[0]
                recommendations.append((product_name, distances.flatten()[i]))
            else:
                recommendations.append((f"Product ID {product_id} not found in products_df", distances.flatten()[i]))
        else:
            recommendations.append((f"Index {idx} not found in index_product mapping", distances.flatten()[i]))

    return recommendations

In [13]:
recommended_products = recommend_products(1, cosine_sim_sparse, model_knn, 5, product_indices, products_df)
print(recommended_products)

[('Seafood Medley Value Size Stay Fresh Pack', 0.07448454790893833), ('Index 119796 not found in index_product mapping', 0.07507327044414902), ('Index 55190 not found in index_product mapping', 0.07568245570389176), ('Index 54210 not found in index_product mapping', 0.07568484574075618), ('Three Grain Bread', 0.07691677201659564)]


In [18]:
# Save the KNN model
with open('knn_model.pkl', 'wb') as file:
    pickle.dump(model_knn, file)

# Save the product indices mapping
with open('product_indices.pkl', 'wb') as file:
    pickle.dump(product_indices, file)

# If you need to save the cosine similarity matrix
with open('cosine_sim_sparse.pkl', 'wb') as file:
    pickle.dump(cosine_sim_sparse, file)

In [19]:
# Load the KNN model
with open('knn_model.pkl', 'rb') as file:
    model_knn = pickle.load(file)

# Load the product indices mapping
with open('product_indices.pkl', 'rb') as file:
    product_indices = pickle.load(file)

# Load the cosine similarity matrix if needed
with open('cosine_sim_sparse.pkl', 'rb') as file:
    cosine_sim_sparse = pickle.load(file)