# Rotman Data Science Competition
## Section 2.1: Baselines & Evaluation Metrics
Some parts of this code may be slow, but everything should run in less than 5 minutes.

### 0. Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def load_competition_data() -> pd.DataFrame:
    """ Read the competition purchase data from the csv file at DATA_PATH and return it as a DataFrame. """
    # Location of the competition csv file on disk
    DATA_PATH = "data/mma_mart.csv" # <- change this to your data path
    return pd.read_csv(DATA_PATH)

In [3]:
# Load the full purchase table once; the cells below reuse it as mma_data
mma_data = load_competition_data()
mma_data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods


#### 0.1 Split Data into Train and Test Sets

In [4]:
# Split settings: a fixed seed keeps the split reproducible, and 20% of the orders are held out for testing.
TRAIN_TEST_SEED = 42
TEST_SIZE = 0.2

# The split is done on order ids (not on rows), so every purchase of an order lands on the same side
mma_orders = mma_data["order_id"].unique()
train_orders, test_orders = train_test_split(
    mma_orders,
    test_size=TEST_SIZE,
    random_state=TRAIN_TEST_SEED,
)

# Select the purchase rows that belong to each set of orders
mma_train = mma_data.loc[mma_data["order_id"].isin(train_orders)]
mma_test = mma_data.loc[mma_data["order_id"].isin(test_orders)]

## 1. Evaluation Metrics
### a) The number of orders that utilize the in-aisle items

In [5]:
def metric_a(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ Return the number of orders that utilize the in-aisle items as a percentage of the total number of orders in the test_data dataset.

    Parameters:
    - test_data: purchases, one row per purchased item, with the columns "order_id" and "product_id"
    - insta_aisle: the product ids that make up the aisle

    Returns:
    (number of distinct orders containing at least one product of insta_aisle) / (number of distinct orders in test_data) * 100

    Raises:
    ValueError if test_data contains no orders, or if insta_aisle does not contain exactly 1000 product ids

    Precondition:
    len(insta_aisle) == 1000
    len(test_data) > 0
    """
    # Validate the inputs before doing any arithmetic, so that invalid input is reported
    # with the messages below instead of a ZeroDivisionError from the division.
    total_orders = test_data["order_id"].nunique()
    if total_orders <= 0:
        raise ValueError("There must be at least one order in the test data")

    if len(insta_aisle) != 1000:
        raise ValueError("insta_aisle must contain 1000 product ids")

    # Keep only the purchases of in-aisle products
    purchases_in_aisle = test_data[test_data["product_id"].isin(insta_aisle)]

    # An order utilizes the aisle as soon as one of its purchases is an in-aisle product
    orders_in_aisle = purchases_in_aisle["order_id"].nunique()

    # Return the percentage of orders that utilize in-aisle items
    return (orders_in_aisle / total_orders) * 100

### (b) Average % of items in each order that utilize in-aisle items

In [6]:
def metric_b(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ For every order, compute the share of its items that are in-aisle items; return the mean of these shares as a percentage.

    Raises a ValueError if insta_aisle does not contain exactly 1000 product ids.

    Precondition:
    len(insta_aisle) == 1000
    """
    if len(insta_aisle) != 1000:
        raise ValueError("There should be 1000 items in the aisle")

    # 1 for a purchase of an in-aisle product, 0 otherwise.
    # This is a new Series, so test_data itself is never modified.
    in_aisle_flag = test_data["product_id"].isin(insta_aisle).astype(int)

    # Group the flags by the order each purchase belongs to
    flags_by_order = in_aisle_flag.groupby(test_data["order_id"])

    # (in-aisle items in the order) / (all items in the order), one value per order
    share_by_order = flags_by_order.sum() / flags_by_order.count()

    # Average the per-order shares and express the result as a percentage
    return share_by_order.mean() * 100

### (c) Average of Metric a and Metric b (Do not use on large datasets)

In [7]:
def metric_average(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ Return the mean of metric a and metric b for insta_aisle over test_data.

    Precondition:
    len(insta_aisle) == 1000
    """
    score_a = metric_a(test_data, insta_aisle)
    score_b = metric_b(test_data, insta_aisle)
    return (score_a + score_b) / 2

### (d) Average % of items in each order that utilize in-aisle items accounting for any identified substitutes.

In [8]:
# Placeholder: metric (d) is not implemented in this cell.
pass

## 2. Baseline Approach
Identify top 1000 items sold

In [9]:
def baseline(data: pd.DataFrame, k: int) -> pd.DataFrame:
    """ Return the k best selling products of data.

    The result has the columns "product_id" and "sales" (the number of purchase rows of the product), sorted from most to least sold.

    Raises a ValueError if k is larger than the number of distinct products in data.

    Precondition:
    k is less than the number of items in the dataset
    """
    # One row per product, holding how many purchase rows it has
    purchases_per_product = data.groupby("product_id")["order_id"].count()

    # Turn the counts into a "sales" column, rank from best to worst seller and
    # move product_id out of the index into a regular column
    ranking = (
        purchases_per_product.to_frame(name="sales")
        .sort_values(by="sales", ascending=False)
        .reset_index()
    )

    # Catch errors for debugging purposes
    if k > len(ranking):
        raise ValueError("k must be less than the number of items in the dataset")

    # The first k rows of the ranking are the top k products
    return ranking.head(k)

#### Evaluate the baseline approach

In [10]:
# Get the top 1000 items by sales
# Rank the products on the training orders only and keep the 1000 best sellers (columns: product_id, sales)
baseline_pred = baseline(mma_train, 1000)
baseline_pred.head()

Unnamed: 0,product_id,sales
0,24852,11623
1,13176,9335
2,21137,6466
3,21903,5947
4,47209,5081


In [11]:
# Extract the product ids of baseline_pred as a plain list; this list is the baseline aisle
baseline_insta_aisle = baseline_pred["product_id"].tolist()

# Check that there are 1000 items in the aisle (both metrics reject any other size)
if len(baseline_insta_aisle) != 1000:
    raise ValueError("There should be 1000 items in the aisle")

# Score the baseline aisle (built from mma_train) on the held-out test orders
met_a_base = metric_a(mma_test, baseline_insta_aisle)
met_b_base = metric_b(mma_test, baseline_insta_aisle)
print(f"Metric a: {met_a_base}, Metric b: {met_b_base}, Avg Score: {(met_a_base + met_b_base) / 2}")

Metric a: 92.42091276128176, Metric b: 53.77052784045304, Avg Score: 73.0957203008674


## 3. Optimizing Metric a
Create an algorithm that optimizes directly over the test set, to serve as a theoretical maximum for how well our approaches can do. (I personally call these psychic algorithms, because you would need superpowers to apply them to the test set: in real life, the test set exists only in the future.)

In [12]:
def max_metric_a_aisle(data: pd.DataFrame) -> list:
    """ Return the aisle that maximizes metric a over the data. Do this by first getting the item with most sales, then getting the item with most sales in orders that do not contain the first item, and so on. For why this produces max list, see proof writeup.
    """
    return _max_metric_a_recur_helper(data, [])

def _max_metric_a_recur_helper(data: pd.DataFrame, aisle: list) -> list:
    """ Return the aisle that maximizes metric a over the data. Do this by first getting the item with most sales, then getting the item with most sales in orders that do not contain the first item, and so on.

    Precondition:
    - data must be a subset of the test data
    - aisle must be a subset of the top 1000 items by sales
    """
    # Base case
    if len(aisle) == 1000:
        return aisle

    else:
        # Get the top item by sales in data
        top_item = get_top_product(data)

        # Add top item to aisle
        aisle.append(top_item)

        # Filter data to only include orders that do not contain the top item
        top_item_orders = data[data["product_id"] == top_item]
        data_excluding_top_item = data[~data["order_id"].isin(top_item_orders["order_id"])]

        # Recursively call helper function
        return _max_metric_a_recur_helper(data_excluding_top_item, aisle)

def get_top_product(data: pd.DataFrame) -> list:
    """ Return the top item by sales in data. """
    # Count number of orders for each product
    sales = pd.DataFrame(data.groupby("product_id")["order_id"].count())

    # Rename column to sales
    sales.rename(columns = {"order_id": "sales"}, inplace=True)

    # Get top item by sales
    sales.reset_index(inplace=True) # Reset index so we don't get multiple items if there is a tie
    top_item =  sales.loc[sales["sales"].idxmax()]["product_id"]

    return top_item

In [13]:
%%time
# Max Metric a for the test set
# The aisle is built from mma_test and then scored on mma_test, so this is an optimistic reference score rather than a held-out evaluation
met_a_top_1000 = max_metric_a_aisle(mma_test)
print(f"Max Metric a: {metric_a(mma_test, met_a_top_1000)}")

Max Metric a: 96.78029335105023
CPU times: user 3.42 s, sys: 476 ms, total: 3.9 s
Wall time: 3.95 s


## 4) Optimizing Metric b
### a) Add more columns to the data
Optimizing metric b is more complicated. We will start by adding an "order size" column to the data.


In [14]:
%%time
# Number of purchase rows per order, indexed by order_id
order_id_to_order_size = mma_data.groupby("order_id")["order_id"].count()
# Work on a copy so mma_data itself keeps its original columns
mma_data_aug = mma_data.copy()
# Give every purchase row the size of the order it belongs to
mma_data_aug["order_size"] = mma_data_aug["order_id"].map(order_id_to_order_size)

CPU times: user 63.8 ms, sys: 28.9 ms, total: 92.7 ms
Wall time: 90.9 ms


#### Calculate the proportion of each product in its current order and add it as a column to the data

In [15]:
# Each purchase row is one item of its order, so its share of the order is 1 / the order size
mma_data_aug.loc[:, "portion_of_order"] = 1/mma_data_aug["order_size"]

#### Get the same train test split from mma_data_aug

In [16]:
# Get the same train test split from mma_data_aug by reusing the order ids of the original split
mma_train_aug = mma_data_aug[mma_data_aug["order_id"].isin(train_orders)]
mma_test_aug = mma_data_aug[mma_data_aug["order_id"].isin(test_orders)]

# Compare the original columns ("order_id" through "department") of each augmented frame with its non-augmented counterpart
data_is_same = mma_data_aug.loc[:, "order_id" : "department"].equals(mma_data)
train_is_same = mma_train_aug.loc[:, "order_id" : "department"].equals(mma_train)
test_is_same = mma_test_aug.loc[:, "order_id" : "department"].equals(mma_test)

# Stop here if any augmented frame differs from its non-augmented counterpart once the augmented columns are left out
if not (data_is_same and train_is_same and test_is_same):
    raise ValueError("the new train test split is not the same as the old ones")

### c) Demonstrate an alternative way of calculating metric b that would allow us to optimize over it

#### First way of calculating metric b (the one from Case pdf)

In [17]:
def metric_b_method_1(data, aisle) -> float:
    """ Calculate metric b over data with aisle using the 1st process clearly represented in the mathematical proof

    For every order b_k, compute |b_k intersect A| / |b_k|, sum these over all orders, divide by the number of orders and express the result as a percentage.

    Parameters:
    - data: purchases with the columns "order_id" and "product_id"
    - aisle: the product ids in the aisle (A in the proof)

    Warning:
    This loops over the orders one by one to imitate the math closely. This should only be used for demonstration purposes.
    """

    A = aisle
    B_size = data["order_id"].nunique()

    met_b = 0
    # groupby always yields each order as a DataFrame, so an order with a single item needs
    # no special case and the product_id dtype does not matter.
    # sort=False visits the orders in order of first appearance in data.
    for _, b_k in data.groupby("order_id", sort=False):
        b_k_intersect_A_size = int(b_k["product_id"].isin(A).sum())
        b_k_size = b_k.shape[0]

        met_b += b_k_intersect_A_size / b_k_size

    return (met_b / B_size) * 100

#### Second way of calculating metric b (the one from my mathematical proof)
The alternate way is written in more detail in the proof write up

In [18]:
def metric_b_impact_score(data_aug, product_id) -> float:
    """ Return the metric b impact score of product_id: the sum of "portion_of_order" over all of its purchase rows in data_aug. """
    is_product = data_aug["product_id"] == product_id
    return data_aug.loc[is_product, "portion_of_order"].sum()

In [19]:
def metric_b_method_2(data_aug, aisle) -> float:
    """ Calculate metric b over data_aug with aisle using the second process of the mathematical proof: add up the metric b impact score of every product in the aisle, divide by the number of orders and express the result as a percentage.

    Warning:
    The per-product loop is kept on purpose, to show that it reproduces the math; it is not meant to be fast.

    Precondition:
    - data_aug must have the column "portion_of_order"
    """
    number_of_orders = data_aug["order_id"].nunique()

    # Accumulate the impact scores one aisle product at a time
    total_impact = 0
    for aisle_product in aisle:
        total_impact += metric_b_impact_score(data_aug, aisle_product)

    return (total_impact / number_of_orders) * 100

#### Show that the alternate ways of calculating metric b are equivalent via code

In [20]:
%%time
# Calculate metric b on the baseline aisle with method 1 (per-order loop)
met_b_method_1 = metric_b_method_1(mma_test_aug, baseline_insta_aisle)
print(met_b_method_1)

53.770527840453155
CPU times: user 9.62 s, sys: 35.9 ms, total: 9.66 s
Wall time: 9.67 s


In [21]:
%%time
# Calculate metric b on the baseline aisle with method 2 (sum of per-product impact scores)
met_b_method_2 = metric_b_method_2(mma_test_aug, baseline_insta_aisle)
print(met_b_method_2)

53.7705278404531
CPU times: user 750 ms, sys: 25.9 ms, total: 776 ms
Wall time: 775 ms


In [22]:
# Check that the two methods agree to the 10th decimal place. We do not compare every decimal place, because the two methods add the same terms in a different order and floating point rounding can differ. In fact, 10 decimal places is probably overkill.
round(met_b_method_1, 10) == round(met_b_method_2, 10)

True

In [23]:
# Recompute metric b with the metric_b function on the augmented test set, and print that
# fresh value (met_b) next to the two demonstration methods so all three can be compared.
met_b = metric_b(mma_test_aug, baseline_insta_aisle)
print(f"Method 1: {round(met_b_method_1, 3)}, Method 2: {round(met_b_method_2, 3)}, Metric b Function: {round(met_b, 3)}")

Method 1: 53.771, Method 2: 53.771, Metric b Function: 53.771


#### Create psychic algorithm for metric b

In [24]:
def max_metric_b_aisle(data_aug: pd.DataFrame, aisle_size: int = 1000) -> list:
    """ Return the aisle that maximizes metric b over the data. This is run on the test set rather than the train set.

    The metric b score of a product is the sum of "portion_of_order" over all of its purchase rows; the aisle is made of the aisle_size products with the highest scores.

    Parameters:
    - data_aug: purchases with the columns "product_id" and "portion_of_order"
    - aisle_size: number of products to return (defaults to the 1000 used by the metrics)

    Returns:
    The product ids sorted from highest to lowest score. Products with equal scores are ordered by ascending product id. The list is shorter than aisle_size if data_aug has fewer distinct products.

    Precondition:
    - data_aug must contain the column "portion_of_order"
    """
    # One score per product; the result is indexed by product_id in ascending order
    id_to_met_b_score = data_aug.groupby("product_id")["portion_of_order"].sum()

    # Rank the products directly instead of sorting every purchase row.
    # mergesort is stable, which makes the order of tied products deterministic.
    ranked = id_to_met_b_score.sort_values(ascending=False, kind="mergesort")

    # Get the top metric b items
    return ranked.index[:aisle_size].tolist()

In [25]:
# Build the aisle from mma_test_aug and score it on the same orders, so this is an optimistic reference score rather than a held-out evaluation
met_b_top_1000 = max_metric_b_aisle(mma_test_aug)
print(f"Max Metric b: {metric_b(mma_test_aug, met_b_top_1000)}")

Max Metric b: 55.050803135830165


### d) Checkpoint to save our improved dataset

In [26]:
%%time
# Add a column to the data that is the metric b impact score for each product calculated over the full dataset
# (the score of a product is the sum of "portion_of_order" over all of its purchase rows)
id_to_met_b_score = mma_data_aug.groupby("product_id")["portion_of_order"].sum()
# Same mapping as a plain dict (product id -> score)
id_to_met_b_score_dict = id_to_met_b_score.to_dict()

# Give every purchase row the full-dataset score of its product
mma_data_aug["b_score (full dataset)"] = mma_data_aug["product_id"].map(id_to_met_b_score)
mma_data_aug.head()

CPU times: user 70 ms, sys: 16.6 ms, total: 86.6 ms
Wall time: 84.8 ms


Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department,order_size,portion_of_order,b_score (full dataset)
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs,8,0.125,0.559483
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs,8,0.125,13.6205
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce,8,0.125,63.502058
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce,8,0.125,255.053694
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods,8,0.125,1.889912


In [27]:
AUGMENTED_DATA_SAVE_PATH = "./data/mma_mart_augmented.csv" # <- change this to your augmented data path
SAVE_MMA_DATA_AUG = True # <- change this to False if you do not want to save the augmented data
LOAD_AUGMENTED_MMA_DATA = False # <- change this to True if you already saved the augmented data (only used when SAVE_MMA_DATA_AUG is False)

if SAVE_MMA_DATA_AUG:
    # Save the augmented data so we don't have to do all of that again (could be slow on larger datasets)
    mma_data_aug.to_csv(AUGMENTED_DATA_SAVE_PATH, index=False)
    # Reset the same flag that was tested above
    SAVE_MMA_DATA_AUG = False
elif LOAD_AUGMENTED_MMA_DATA:
    # Load the augmented data
    mma_data_aug = pd.read_csv(AUGMENTED_DATA_SAVE_PATH)
    LOAD_AUGMENTED_MMA_DATA = False
