In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys
from sentence_transformers import SentenceTransformer, util

In [2]:
# Load the order-line table that the aggregations, filters and lookup helpers in this notebook read from.
df = pd.read_csv('data/mma_mart.csv')

In [3]:
# Count the non-null order_id entries recorded for each product and keep the
# result as a one-column frame ('sales') indexed by product_name.
df_by_sales = (
    df.groupby("product_name")["order_id"]
    .count()
    .to_frame(name='sales')
)
df_by_sales.head()

Unnamed: 0_level_0,sales
product_name,Unnamed: 1_level_1
#2 Coffee Filters,24
#2 Cone White Coffee Filters,1
#4 Natural Brown Coffee Filters,4
& Go! Hazelnut Spread + Pretzel Sticks,5
+Energy Black Cherry Vegetable & Fruit Juice,1


In [4]:
# Keep every order line whose product is among the 1000 best sellers.
# The ranking is computed inline so that top_1000 only ever refers to the
# filtered order-line frame.
top_1000 = df[
    df['product_name'].isin(
        df_by_sales.sort_values(by=['sales'], ascending=False).head(1000).index
    )
]
top_1000.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
5,1,13176,Bag of Organic Bananas,24,fresh fruits,4,produce
6,1,47209,Organic Hass Avocado,24,fresh fruits,4,produce
7,1,22035,Organic Whole String Cheese,21,packaged cheese,16,dairy eggs


In [5]:
# Scratch lookup: best-seller rows whose product name contains "banana"
# (case-insensitive, literal match). The next cell rebinds rows_with_x.
rows_with_x = top_1000.loc[
    top_1000['product_name'].str.contains('banana', case=False, regex=False)
]

In [6]:
# Scratch lookup over the full table: rows whose product name contains
# "orange" (case-insensitive, literal match).
rows_with_x = df.loc[
    df['product_name'].str.contains('orange', case=False, regex=False)
]

In [7]:
# Display the current rows_with_x frame (pandas truncates the rendering of long frames).
rows_with_x

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
35,4,25146,Original Orange Juice,31,refrigerated,7,beverages
50,5,48002,Biscuits Orange Pim's,61,cookies cakes,19,snacks
67,7,34050,Orange Juice,31,refrigerated,7,beverages
111,12,3164,100% Juice No Added Sugar Orange Tangerine,98,juice nectars,7,beverages
437,51,4493,Italian Sparkling Pomegranate And Orange Soda,115,water seltzer sparkling water,7,beverages
...,...,...,...,...,...,...,...
986976,99970,34050,Orange Juice,31,refrigerated,7,beverages
987018,99975,8174,Organic Navel Orange,24,fresh fruits,4,produce
987037,99978,8174,Organic Navel Orange,24,fresh fruits,4,produce
987071,99981,15607,Sesame Ginger with Mandarin Orange Juice Marinade,5,marinades meat preparation,13,pantry


### Helper Functions

In [8]:
def get_similar_products(product_name: str) -> list:
    """
    Get the distinct products that share an aisle with product_name.

    Candidates come from ``top_1000``, which holds one row per order line, so
    the same product name occurs many times there. The names are de-duplicated
    (first-seen order is kept) and every occurrence of ``product_name`` is
    dropped, so the unavailable product is never offered as its own substitute.

    Parameters:
    - product_name (str): Name of the product to find alternatives for.

    Returns:
    - list: Unique product names from the same aisle, excluding product_name.
    """
    aisle = find_aisle(product_name)

    same_aisle_names = top_1000.loc[top_1000['aisle'] == aisle, 'product_name']

    # Series.unique() preserves order of first appearance. Filtering (instead
    # of list.remove) removes the target no matter how often it appears.
    return [name for name in same_aisle_names.unique() if name != product_name]

In [9]:
def find_aisle(product_name: str) -> str:
    """
    Look up the aisle recorded for product_name in the global ``df``.

    Parameters:
    - product_name (str): Exact product name to search for.

    Returns:
    - str: The 'aisle' value of the first matching row.

    If no row matches, a message is printed and the run is stopped through
    ``sys.exit()``.
    """
    matching_rows = df[df['product_name'] == product_name]

    # Guard clause: nothing to look up, so report it and abort.
    if matching_rows.empty:
        print(f"No information found for the product: {product_name}. Canceling the run.")
        sys.exit()

    first_match = matching_rows.iloc[0]
    return first_match['aisle']


In [10]:
def find_department(product_name: str) -> str:
    """
    Look up the department recorded for product_name in the global ``df``.

    Parameters:
    - product_name (str): Exact product name to search for.

    Returns:
    - str: The 'department' value of the first matching row.

    Raises:
    - IndexError: If no row in ``df`` has this product name. The exception
      type matches what the unchecked positional lookup raised before, but the
      message now names the missing product.
    """
    departments = df.loc[df['product_name'] == product_name, 'department']

    # Check explicitly so the failure says which product was not found rather
    # than surfacing as an out-of-bounds positional index.
    if departments.empty:
        raise IndexError(f"No information found for the product: {product_name}")

    return departments.iloc[0]

### Substitution with Term Frequency

In [11]:
def substitution_term_freq(product_name: str) -> str:
    """
    Recommend a substitute by TF-IDF similarity of product names.

    The unavailable product and the candidate products from the same aisle are
    vectorized together with TF-IDF (English stop words removed). The candidate
    whose vector has the highest cosine similarity to the unavailable product
    is returned.

    Parameters:
    - product_name (str): String parameter for unavailable item

    Returns:
    - str: The recommended product

    Raises:
    - ValueError: If there is no candidate other than product_name itself.
    """
    # De-duplicate (keeping first-seen order) and drop the target itself so it
    # can never be returned as its own substitute.
    candidates = [
        name
        for name in dict.fromkeys(get_similar_products(product_name))
        if name != product_name
    ]
    if not candidates:
        raise ValueError(f"No substitute candidates found for the product: {product_name}")

    # Fit on candidates plus the target so they share one vocabulary; the
    # target occupies the last row of the matrix.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(candidates + [product_name])

    # Compare the target row against the candidate rows only. This avoids the
    # full pairwise matrix and removes the need to skip the target's own score.
    n_candidates = len(candidates)
    similarity_scores = cosine_similarity(
        tfidf_matrix[n_candidates:], tfidf_matrix[:n_candidates]
    )

    # The scores have shape (1, n_candidates), so the flat argmax is the
    # candidate index; ties resolve to the earliest candidate.
    return candidates[int(similarity_scores.argmax())]

In [12]:
%%time
# print(substitution_term_freq('Bag of Organic Bananas'))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs


In [13]:
%%time
# print(substitution_term_freq('100% Juice No Added Sugar Orange Tangerine'))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


### Substitution with Sentence Transformer

In [14]:
# Loaded SentenceTransformer instances keyed by model path, so repeated
# recommendations do not reload the checkpoint from disk.
_SEQ_TRANS_MODELS = {}


def _load_seq_trans_model(model_path: str):
    """Return the SentenceTransformer for model_path, loading it on first use only."""
    if model_path not in _SEQ_TRANS_MODELS:
        _SEQ_TRANS_MODELS[model_path] = SentenceTransformer(model_path)
    return _SEQ_TRANS_MODELS[model_path]


def substitution_seq_trans(product_name: str) -> str:
    """
    Recommend a substitute by sentence-embedding similarity of product names.

    The unavailable product and the candidate products from the same aisle are
    encoded with a SentenceTransformer model. The candidate whose embedding has
    the highest cosine similarity to the unavailable product is returned.

    Parameters:
    - product_name (str): String parameter for unavailable item

    Returns:
    - str: The recommended product

    Raises:
    - ValueError: If there is no candidate other than product_name itself.
    """
    # De-duplicate (keeping first-seen order) and drop the target itself so it
    # is neither encoded repeatedly nor returned as its own substitute.
    candidates = [
        name
        for name in dict.fromkeys(get_similar_products(product_name))
        if name != product_name
    ]
    if not candidates:
        raise ValueError(f"No substitute candidates found for the product: {product_name}")

    # Local checkpoint; pass 'paraphrase-MiniLM-L6-v2' here instead to use the
    # pre-trained model.
    model = _load_seq_trans_model('substitute_classifier/st_checkpoint_final')

    # Encode the candidates with the target appended as the last entry.
    description_embeddings = model.encode(candidates + [product_name], convert_to_tensor=True)

    # Cosine similarity of the target (last row) against every candidate row.
    similarities = util.pytorch_cos_sim(description_embeddings[-1:], description_embeddings[:-1])

    # The similarities have shape (1, n_candidates), so the flat argmax is the
    # candidate index.
    most_similar_index = similarities.argmax().item()
    return candidates[most_similar_index]

In [15]:
# %%time
# print(substitution_seq_trans('Bag of Organic Bananas'))

In [None]:
%%time
# Load the SentenceTransformer checkpoint from the local path into a global `model`.
# substitution_seq_trans builds its own instance from the same path, so this
# variable is not read by that function.
model = SentenceTransformer('substitute_classifier/st_checkpoint_final')