# Rotman Data Science Competition
### Section 6: Picking Top 1000 Products

## 0. Imports


In [1]:
import pandas as pd

## 1. Load Data


In [2]:
# Make dummy substitute data file
def make_dummy_substitute_data() -> bool:
    """ Write a placeholder "data with substitutes" CSV built from the raw data.

    Reads data/mma_mart.csv and adds three placeholder columns: every product
    is its own "substitute", and every row is marked as neither "frozen" nor
    "refrigerated". The result is written to
    data/data_with_substitutes_dummy.csv without the DataFrame index.

    Returns:
    True once the file has been written.
    """
    source_path = "data/mma_mart.csv"
    target_path = "data/data_with_substitutes_dummy.csv"

    orders = pd.read_csv(source_path)
    n_rows = len(orders)

    # Each product stands in for itself until real substitute groups exist
    orders["substitute"] = orders["product_id"]

    # Storage flags are all False in the dummy file
    for flag in ("frozen", "refrigerated"):
        orders[flag] = [False] * n_rows

    orders.to_csv(target_path, index=False)
    return True
# Run the generator unconditionally: a call wrapped in `assert` is stripped
# under `python -O`, which would silently skip creating the dummy file.
if not make_dummy_substitute_data():
    raise RuntimeError("Failed to create the dummy substitute data file")

In [3]:
# Load the substitute-augmented dataset (same path the dummy generator writes to)
DATA_WTH_SUBS_PATH = "data/data_with_substitutes_dummy.csv"
data = pd.read_csv(DATA_WTH_SUBS_PATH)
# Preview the first rows
data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department,substitute,frozen,refrigerated
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs,49302,False,False
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs,11109,False,False
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce,10246,False,False
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce,49683,False,False
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods,43633,False,False


## 2. Baseline Model

In [4]:
def baseline(data: pd.DataFrame, k: int, use_substitutes=True) -> pd.DataFrame:
    """ Return the top k items by sales

    "Sales" is the number of orders an item appears in. With use_substitutes
    the items are the values of the "substitute" column and each order is
    counted once per substitute; otherwise the items are the "product_id"
    values and every row counts as one order.

    Args:
    data: order lines with "order_id" and "substitute" / "product_id" columns
    k: number of items to return
    use_substitutes: group by "substitute" (True) or by "product_id" (False)

    Returns:
    A DataFrame with the item id column and a "sales" column, sorted by sales
    in descending order and truncated to k rows. Items with equal sales keep
    ascending-id order, so the result is deterministic.

    Raises:
    ValueError: if k is negative or larger than the number of items

    Precondition:
    0 <= k <= number of items in the dataset
    """
    # A negative k would silently slice "all but the last |k| rows" below
    if k < 0:
        raise ValueError("k must be non-negative")

    key = "substitute" if use_substitutes else "product_id"
    orders_per_item = data.groupby(key)["order_id"]

    # Unique orders when substitutes merge several products; otherwise each
    # row is already one (order, product) pair, so a plain count is enough.
    counts = orders_per_item.nunique() if use_substitutes else orders_per_item.count()

    # A stable sort keeps ties in groupby (ascending id) order, so the cut at
    # position k does not depend on the sort implementation.
    sales = (
        counts.rename("sales")
        .sort_values(ascending=False, kind="mergesort")
        .reset_index()
    )

    # Catch errors for debugging purposes
    if k > len(sales):
        raise ValueError("k must not exceed the number of items in the dataset")

    # Return top k items by sales (i.e. top k items with most orders)
    return sales.iloc[:k]

In [5]:
# Test baseline model
k = 1000
# Run the baseline once and reuse the result for both the id list and the preview
baseline_top_k = baseline(data, k, use_substitutes=True)
baseline_aisle = baseline_top_k["substitute"].tolist()
baseline_top_k.head()

Unnamed: 0,substitute,sales
0,24852,14494
1,13176,11694
2,21137,8081
3,21903,7369
4,47209,6411


## 3. Processing Data For Metric B Optimization


In [6]:
def add_met_b_column(data, use_substitutes=True) -> None:
    """ Add a "met_b_score" column holding each item's metric b impact score, calculated over the full dataset

    Every row contributes 1 / (number of rows in its order). An item's score
    is the sum of those contributions over all of its rows, so an item that
    makes up a large share of many orders scores highly.

    Args:
    data: order lines with "order_id" and "substitute" / "product_id" columns.
        Modified in place: the "met_b_score" column is added or overwritten.
    use_substitutes: score per "substitute" id (True) or per "product_id" (False)

    Returns:
    None. The result is written to data["met_b_score"].
    """
    key = "substitute" if use_substitutes else "product_id"

    # Share of its order that each row represents
    order_size = data.groupby("order_id")["order_id"].transform("count")
    portion_of_order = 1 / order_size

    # Total share collected by each item
    id_to_met_b_score = portion_of_order.groupby(data[key]).sum()

    # Map back through the same column the scores were grouped by; mapping
    # substitute-level scores through product_id would give wrong or missing
    # scores whenever a product's substitute id differs from its own id.
    data["met_b_score"] = data[key].map(id_to_met_b_score)
    return None

In [7]:
# Adds the "met_b_score" column to `data` in place (the function returns None)
add_met_b_column(data)
data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department,substitute,frozen,refrigerated,met_b_score
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs,49302,False,False,0.559483
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs,11109,False,False,13.6205
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce,10246,False,False,63.502058
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce,49683,False,False,255.053694
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods,43633,False,False,1.889912


## 4. Optimization and Finding Top 1000 Products


In [8]:
# Trim data: drop the aisle and department id/name columns; every other column is kept
my_data = data.drop(columns=["aisle_id", "aisle", "department_id", "department"])
my_data.head()

Unnamed: 0,order_id,product_id,product_name,substitute,frozen,refrigerated,met_b_score
0,1,49302,Bulgarian Yogurt,49302,False,False,0.559483
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,11109,False,False,13.6205
2,1,10246,Organic Celery Hearts,10246,False,False,63.502058
3,1,49683,Cucumber Kirby,49683,False,False,255.053694
4,1,43633,Lightly Smoked Sardines in Olive Oil,43633,False,False,1.889912


In [9]:
def get_max_aisle(data: pd.DataFrame, use_substitutes=True, k: int = 1000) -> list:
    """ Return a greedily selected aisle of size k (1000 by default).

    Delegates to _max_aisle_recur_helper, starting from an empty aisle.

    Args:
    data: order lines that already contain a "met_b_score" column
    use_substitutes: treat "substitute" ids (True) or "product_id" values
        (False) as the items
    k: number of items to put in the aisle

    Returns:
    The list of selected item ids, in the order they were picked.
    """
    return _max_aisle_recur_helper(data, [], k, use_substitutes=use_substitutes)

In [10]:
def get_top_product_by_sales(data: pd.DataFrame, use_substitutes=True) -> int:
    """ Return the id of the top item by sales in data.

    "Sales" is the number of orders an item appears in. With use_substitutes
    the items are the "substitute" ids and each order is counted once per
    substitute; otherwise the items are the "product_id" values and every row
    counts as one order.

    Args:
    data: order lines with "order_id" and "substitute" / "product_id" columns
    use_substitutes: rank "substitute" ids (True) or "product_id" values (False)

    Returns:
    A single item id (the value stored in the id column, not a list). If
    several items tie for the most sales, the smallest id is returned.

    Raises:
    ValueError: if data has no rows
    """
    if data.empty:
        raise ValueError("data must contain at least one row")

    key = "substitute" if use_substitutes else "product_id"
    orders_per_item = data.groupby(key)["order_id"]

    # Unique orders when substitutes merge several products; otherwise each
    # row is already one (order, product) pair, so a plain count is enough.
    sales = orders_per_item.nunique() if use_substitutes else orders_per_item.count()

    # groupby sorts the ids and idxmax returns the label of the first maximum,
    # so a tie always resolves to exactly one item.
    return sales.idxmax()

In [11]:
def _max_aisle_recur_helper(data: pd.DataFrame, aisle: list, k: int, use_substitutes=True) -> list:
    """ Return the aisle of size k by the process of picking the item with the highest metric b score, removing all orders that contain that item, and repeating until the aisle is of size k.

    The "met_b_score" values are read as they are in data; they are not
    recalculated after orders are removed. Ties on the score are broken by
    get_top_product_by_sales on the tied rows.

    The selection runs as a loop rather than by recursion, so the call depth
    does not grow with k. The name is kept for existing callers.

    Args:
    data: order lines with "order_id", "met_b_score" and the item id column
    aisle: items already selected; extended in place and returned
    k: target number of items in the aisle
    use_substitutes: treat "substitute" ids (True) or "product_id" values
        (False) as the items, for both tie-breaking and order removal

    Returns:
    aisle, once it holds at least k items.

    Raises:
    ValueError: if no scored rows remain before the aisle reaches k items

    Precondition:
    - aisle must be a subset of the test data
    """
    # The picked id lives in this column, so covered orders must be looked up
    # through it as well (not always through product_id).
    key = "substitute" if use_substitutes else "product_id"
    remaining = data

    while len(aisle) < k:
        # Get top items by metric b score
        best_score = remaining["met_b_score"].max()
        top_items = remaining[remaining["met_b_score"] == best_score]

        # Empty when no rows are left (or every remaining score is NaN)
        if top_items.empty:
            raise ValueError(
                f"Ran out of scored rows after selecting {len(aisle)} of {k} items"
            )

        # If there is a tie, break it by picking the item with the highest sales
        top_item = get_top_product_by_sales(top_items, use_substitutes=use_substitutes)

        # Add top item to aisle
        aisle.append(top_item)

        # Keep only the orders that do not contain the top item
        covered_orders = remaining.loc[remaining[key] == top_item, "order_id"]
        remaining = remaining[~remaining["order_id"].isin(covered_orders)]

    return aisle

In [12]:
# Get the Aisle
# Select the aisle ids from the trimmed data (which carries the met_b_score column)
my_max_aisle = get_max_aisle(my_data, use_substitutes=True)
# Rows of the full dataset whose substitute id was selected for the aisle
my_aisle_df = data[data["substitute"].isin(my_max_aisle)]

## 5. Combining Code

In [13]:
def make_1000_items(data: pd.DataFrame, use_substitutes=True) -> list:
    """ Make our aisle of 1000 items

    Works on a copy of data, so the caller's DataFrame is left unchanged:
    the metric b score column is added to the copy and the aisle is selected
    from it.

    Args:
    data: order lines with "order_id", "product_id" and "substitute" columns
    use_substitutes: forwarded to both the scoring and the selection step, so
        that the two steps use the same item id column

    Returns:
    The list of selected item ids.
    """
    data_copy = data.copy()
    add_met_b_column(data_copy, use_substitutes=use_substitutes)
    return get_max_aisle(data_copy, use_substitutes=use_substitutes)

## 6. Evaluation


In [14]:
def metric_a(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ Return the number of orders that utilize the in-aisle items as a percentage of the total number of orders in the test_data dataset.

    An order utilizes the aisle when at least one of its rows has a
    "product_id" contained in insta_aisle.

    Args:
    test_data: order lines with "order_id" and "product_id" columns
    insta_aisle: the product ids in the aisle

    Returns:
    A percentage between 0 and 100.

    Raises:
    ValueError: if test_data contains no orders, or if insta_aisle does not
        hold exactly 1000 ids

    Precondition:
    len(insta_aisle) == 1000
    len(test_data) > 0
    """
    # Validate before dividing, so empty test data is reported as a
    # ValueError and not as a ZeroDivisionError from the division below.
    total_orders = test_data["order_id"].nunique()
    if total_orders <= 0:
        raise ValueError("There must be at least one order in the test data")

    if len(insta_aisle) != 1000:
        raise ValueError("insta_aisle must contain 1000 product ids")

    # Count the orders that include at least one item in the aisle
    is_in_aisle = test_data["product_id"].isin(insta_aisle)
    orders_in_aisle = test_data.loc[is_in_aisle, "order_id"].nunique()

    # Return the percentage of orders that utilize in-aisle items
    return (orders_in_aisle / total_orders) * 100


In [15]:
def metric_b(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ Return the average, over all orders, of the percentage of each order's items that are in the aisle.

    An item counts as in the aisle when its "product_id" is contained in
    insta_aisle. test_data itself is not modified.

    Raises:
    ValueError: if insta_aisle does not hold exactly 1000 ids

    Precondition:
    len(insta_aisle) == 1000
    """
    # Guard against a malformed aisle before doing any work
    if len(insta_aisle) != 1000:
        raise ValueError("There should be 1000 items in the aisle")

    # 1 for rows whose product is in the aisle, 0 otherwise
    in_aisle_flag = test_data["product_id"].isin(insta_aisle).astype(int)

    # Group the flags by the order each row belongs to
    flags_by_order = in_aisle_flag.groupby(test_data["order_id"])

    # Per order: utilized items divided by total items
    utilized_per_order = flags_by_order.sum()
    items_per_order = flags_by_order.count()
    share_per_order = utilized_per_order / items_per_order

    # Average share over all orders, expressed as a percentage
    return share_per_order.mean() * 100


In [16]:
def metric_average(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ Return the mean of the metric a and metric b scores for insta_aisle on test_data

    Precondition:
    len(insta_aisle) == 1000
    """
    score_a = metric_a(test_data, insta_aisle)
    score_b = metric_b(test_data, insta_aisle)
    return (score_a + score_b) / 2

In [17]:
# Average of metric a and metric b for the greedy metric-b aisle.
# NOTE: scored on `data`, the same dataset the aisle was selected from.
metric_average(data, my_max_aisle)

73.77741557105233

In [21]:
# Average of metric a and metric b for the top-k-by-sales baseline aisle, scored on `data`
metric_average(data, baseline_aisle)

72.91187584176839