In [71]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys
from sentence_transformers import SentenceTransformer, util

In [2]:
# Product table queried by find_aisle / find_department to resolve a product's aisle and department.
df = pd.read_csv('mma_mart.csv')
# Candidate pool for substitutions: get_similar_products picks same-aisle products from this frame.
# NOTE(review): presumably the 1000 best-selling items (name + total_quantity_sold column) — confirm.
top_1000 = pd.read_csv('items_1000.csv')

In [102]:
# Case-insensitive, literal (regex=False) substring search for 'banana' in the top_1000 product names.
rows_with_x = top_1000[top_1000['product_name'].str.contains('banana', case=False, regex=False)]

In [50]:
# Case-insensitive, literal substring search for 'orange', this time over df rather than top_1000.
# NOTE(review): this reuses the name rows_with_x that the 'banana' search cell also assigns, so
# whichever search cell ran last decides what a later `rows_with_x` display shows — consider distinct names.
rows_with_x = df[df['product_name'].str.contains('orange', case=False, regex=False)]

In [103]:
# Display the most recently assigned search result (the saved output shows the 'banana' matches from top_1000).
rows_with_x

Unnamed: 0,product_name,total_quantity_sold,order_id,product_id,aisle_id,aisle,department_id,department,frozen,refrigerated
126,Baby Food Stage 2 Pumpkin Banana,141,163,14211,92,baby food formula,18,babies,False,False
127,"Peach, Apricot & Banana Stage 2 Baby Food",136,1457,5114,92,baby food formula,18,babies,False,False
588,Banana,14494,10,24852,24,fresh fruits,4,produce,False,False
589,Bag of Organic Bananas,11694,1,13176,24,fresh fruits,4,produce,False,False
683,Organic Banana,644,210,37067,24,fresh fruits,4,produce,False,False
991,Dark Chocolate Covered Banana,131,75,43889,119,frozen dessert,1,frozen,True,False


### Helper Functions

In [88]:
def get_similar_products(product_name: str) -> list:
    """
    Get substitution candidates for product_name: the other products in the same aisle.

    The aisle is resolved with find_aisle() (which also deals with unknown
    products); candidates are then read from the module-level top_1000 frame,
    keeping its row order.

    Parameters:
    - product_name (str): Exact name of the product to find alternatives for.

    Returns:
    - list: product_name values from top_1000 that share the aisle, with every
      occurrence of product_name itself removed. Empty when top_1000 holds no
      other product in that aisle.
    """
    aisle = find_aisle(product_name)

    same_aisle_names = top_1000.loc[top_1000['aisle'] == aisle, 'product_name'].tolist()

    # Filter rather than list.remove(): remove() drops only the first match, so a
    # duplicated catalogue row would leave the unavailable product in its own
    # candidate list and it could be recommended as a substitute for itself.
    return [name for name in same_aisle_names if name != product_name]

In [45]:
def find_aisle(product_name: str) -> str:
    """
    Look up which aisle a product belongs to.

    Parameters:
    - product_name (str): Exact product name to search for in df.

    Returns:
    - str: the 'aisle' value of the first df row whose product_name matches.

    If no row matches, a message is printed and the run is stopped via sys.exit().
    """
    matches = df.loc[df['product_name'] == product_name]

    # Guard clause: nothing to look up, so report it and stop the run.
    if matches.empty:
        print(f"No information found for the product: {product_name}. Canceling the run.")
        sys.exit()

    first_match = matches.iloc[0]
    return first_match['aisle']
    

In [9]:
def find_department(product_name: str) -> str:
    """
    Find the department of product_name

    Parameters:
    - product_name (str): Exact product name to search for in df.

    Returns:
    - str: the 'department' value of the first df row whose product_name matches.

    Raises:
    - IndexError: if no row of df has this exact product_name.
    """
    matches = df[df['product_name'] == product_name]

    if matches.empty:
        # .iloc[0] on an empty frame raises a bare "single positional indexer is
        # out-of-bounds"; keep the same exception type but say which product was missing.
        raise IndexError(f"No information found for the product: {product_name}")

    return matches.iloc[0]['department']

### Substitution with Term Frequency

In [80]:
def substitution_term_freq(top_1000: pd.DataFrame, product_name: str) -> str:
    """
    Recommend a substitute by TF-IDF similarity between product names.

    The unavailable product's name and the names of the candidate products
    (from get_similar_products) are vectorized together with TF-IDF, and the
    candidate whose vector has the highest cosine similarity to the
    unavailable product is returned.

    Parameters:
    - top_1000 (pd.DataFrame): Kept for call compatibility; not read here.
      Candidates come from get_similar_products(), which uses the
      module-level top_1000 frame.
    - product_name (str): Name of the unavailable item.

    Returns:
    - str: The recommended product (first candidate on ties).

    Raises:
    - ValueError: if there are no candidate products to choose from.
    """
    prod_name_list = get_similar_products(product_name)
    if not prod_name_list:
        raise ValueError(f"No substitution candidates found for the product: {product_name}")

    # Fit on candidates + target together so they share one vocabulary;
    # the target is the last row of the matrix.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(prod_name_list + [product_name])

    # Compare the target row against the candidate rows only. Ranking the
    # target's similarities to *all* rows and taking position -2 assumes the
    # target itself sorts strictly last; when a candidate ties with it (same
    # tokens once stop words are dropped) or the target vector is all zeros,
    # position -2 can be the target's own index, which is out of range for
    # prod_name_list. It also avoids building the full n x n matrix.
    n_candidates = len(prod_name_list)
    similarity_scores = cosine_similarity(tfidf_matrix[n_candidates:], tfidf_matrix[:n_candidates])[0]

    most_similar_index = int(similarity_scores.argmax())
    return prod_name_list[most_similar_index]

In [110]:
# Demo: TF-IDF substitute for an item from the fresh-fruits aisle.
print(substitution_term_freq(top_1000, 'Bag of Organic Bananas'))

Organic Avocado


In [82]:
# Demo: TF-IDF substitute for a juice product.
print(substitution_term_freq(top_1000, '100% Juice No Added Sugar Orange Tangerine'))

Organic Orange Juice


### Substitution with Sentence Transformer

In [86]:
# Loaded SentenceTransformer instances keyed by model name, so repeated
# recommendations do not rebuild the model on every call.
_SENTENCE_MODEL_CACHE = {}


def _load_sentence_model(model_name: str):
    """Return the SentenceTransformer for model_name, constructing it only on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def substitution_seq_trans(top_1000: pd.DataFrame, product_name: str,
                           model_name: str = 'paraphrase-MiniLM-L6-v2') -> str:
    """
    Recommend a substitute by sentence-embedding similarity between product names.

    The unavailable product's name and the candidate names (from
    get_similar_products) are encoded with a pre-trained SentenceTransformer;
    the candidate with the highest cosine similarity to the unavailable
    product is returned.

    Parameters:
    - top_1000 (pd.DataFrame): Kept for call compatibility; not read here.
      Candidates come from get_similar_products(), which uses the
      module-level top_1000 frame.
    - product_name (str): Name of the unavailable item.
    - model_name (str): SentenceTransformer model to use. Defaults to the
      model this notebook has always used.

    Returns:
    - str: The recommended product

    Raises:
    - ValueError: if there are no candidate products to choose from.
    """
    prod_name_list = get_similar_products(product_name)
    if not prod_name_list:
        # Otherwise argmax below would run on an empty similarity tensor.
        raise ValueError(f"No substitution candidates found for the product: {product_name}")

    model = _load_sentence_model(model_name)

    # Encode candidates and target in one batch; the target is the last embedding.
    description_embeddings = model.encode(prod_name_list + [product_name], convert_to_tensor=True)

    # Cosine similarity of the target (last row) against every candidate row.
    similarities = util.pytorch_cos_sim(description_embeddings[-1:], description_embeddings[:-1])

    most_similar_index = similarities.argmax().item()
    return prod_name_list[most_similar_index]

In [111]:
# Demo: same query as the TF-IDF example, run through the embedding-based recommender for comparison.
print(substitution_seq_trans(top_1000, 'Bag of Organic Bananas'))

Organic Banana
