# Rotman Data Science Competition
## Section 1.3: Experiments on Approaches
## 0. Load Data


In [276]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [277]:
# Graph-display flag. Not referenced in this section; presumably read by
# plotting cells elsewhere in the notebook (TODO confirm).
SHOW_GRAPHS = False

In [278]:
def load_competition_data() -> pd.DataFrame:
    """ Return the competition purchases table read from disk.

    The CSV is read from the relative path "data/mma_mart.csv", so the
    result depends on the current working directory.
    """
    return pd.read_csv("data/mma_mart.csv")

# Relative directory for graph output. Not referenced in this section;
# presumably used by plotting cells elsewhere in the notebook (TODO confirm).
GRAPH_OUT_PATH = "graphs/"

In [279]:
# Load the full purchases table once; the cells below split it by order id.
mma_data = load_competition_data()
# Preview the first rows (shown as this cell's output).
mma_data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods


In [280]:
# Split on order ids rather than on rows, so that every purchase belonging to
# one order ends up on the same side of the split.
mma_orders = mma_data["order_id"].unique()
train_orders, test_orders = train_test_split(
    mma_orders,
    test_size=0.2,
    random_state=42,  # Specify random state 42 for reproducibility
)

# Select the purchase rows that belong to each group of orders.
mma_train = mma_data.loc[mma_data["order_id"].isin(train_orders)]
mma_test = mma_data.loc[mma_data["order_id"].isin(test_orders)]

## 1. Evaluation Metrics
### (a) The number of orders that utilize the in-aisle items

In [281]:
def metric_a(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ Return the percentage of orders in test_data that contain at least one in-aisle item.

    test_data must provide the columns "order_id" and "product_id", one row
    per purchased item. insta_aisle holds the product ids stocked in the
    aisle. The result is a value between 0 and 100.

    Precondition:
    len(insta_aisle) == 1000
    test_data contains at least one order

    Raises:
    ValueError if test_data contains no orders
    ValueError if insta_aisle does not contain exactly 1000 product ids
    """
    # Orders with at least one purchased item that is stocked in the aisle
    in_aisle_mask = test_data["product_id"].isin(insta_aisle)
    num_orders_served = test_data.loc[in_aisle_mask, "order_id"].nunique()

    # All orders, whether or not they touch the aisle
    num_orders = test_data["order_id"].nunique()

    if num_orders <= 0:
        raise ValueError("There must be at least one order in the test data")

    if len(insta_aisle) != 1000:
        raise ValueError("insta_aisle must contain 1000 product ids")

    served_fraction = num_orders_served / num_orders
    return served_fraction * 100

### (b) Average % of items in each order that utilize in-aisle items

In [282]:
def metric_b(test_data: pd.DataFrame, insta_aisle: list) -> float:
    """ Return the average, over all orders, of the percentage of each order's items that are in-aisle items.

    test_data must provide the columns "order_id" and "product_id", one row
    per purchased item. For every order the percentage
    (in-aisle items / items in the order) * 100 is computed, and the
    unweighted mean of these per-order percentages is returned, so a small
    order counts as much as a large one.

    Precondition:
    len(insta_aisle) == 1000
    test_data contains at least one order

    Raises:
    ValueError if insta_aisle does not contain exactly 1000 product ids
    ValueError if test_data contains no orders
    """
    # Check that the aisle holds exactly 1000 items
    if len(insta_aisle) != 1000:
        raise ValueError("There should be 1000 items in the aisle")

    # Flag every purchased item that is stocked in the aisle
    in_aisle = test_data["product_id"].isin(insta_aisle)

    # Group the flags by order in a single pass instead of looking each order
    # up individually. The keys are passed as a plain array so that no index
    # alignment takes place, and sort=False keeps the orders in order of first
    # appearance.
    flags_by_order = in_aisle.groupby(test_data["order_id"].to_numpy(), sort=False)
    order_percentages = (flags_by_order.sum() / flags_by_order.size()) * 100

    # Without any order the mean is undefined; fail loudly like metric_a does
    if order_percentages.empty:
        raise ValueError("There must be at least one order in the test data")

    return np.mean(order_percentages.to_numpy())

def _get_order_in_aisle_percentage(order: "pd.DataFrame | pd.Series", insta_aisle: list) -> float:
    """ Return the percentage of items in the order that are in-aisle items.

    order holds the purchases of a single order id. It is normally a
    dataframe with one row per item and a "product_id" column. When the order
    has only one item, a label-based lookup hands back that single row as a
    series whose labels are the column names; that form is accepted too.
    insta_aisle should contain 1000 product ids.

    Preconditions:
    - len(insta_aisle) == 1000
    - order contains one order id only
    - order contains at least one item

    Raises:
    ValueError if order is empty
    """
    # Check that there is at least one item in the order
    if order.shape[0] <= 0:
        raise ValueError("Order must contain at least one item")

    # Single-item order delivered as one row: the item is either in the aisle
    # or it is not
    if isinstance(order, pd.Series):
        return 100.0 if order.loc["product_id"] in insta_aisle else 0.0

    # Share of the order's rows whose product is stocked in the aisle
    num_items_found_in_aisle = int(order["product_id"].isin(insta_aisle).sum())
    total_items_in_order = order.shape[0]
    return (num_items_found_in_aisle / total_items_in_order) * 100

### (c) Average % of items in each order that utilize in-aisle items accounting for any identified substitutes.

In [283]:
# TODO: metric (c) is not implemented yet; this cell is only a placeholder.
pass

## 2. Baseline Approach
Identify the top 1000 items sold.

In [284]:
def baseline(data: pd.DataFrame, k: int) -> pd.DataFrame:
    """ Return the top k items by sales.

    data must provide the columns "product_id", "product_name" and
    "order_id", one row per purchased item. The sales of an item are the
    number of rows with a non-null order id for its
    (product_id, product_name) pair.

    The result has the columns "product_id", "product_name" and "sales",
    sorted by descending sales, with a fresh 0..k-1 index.

    Precondition:
    0 <= k <= number of distinct items in the dataset

    Raises:
    ValueError if k is negative
    ValueError if k exceeds the number of distinct items in the dataset
    """
    # A negative k would make the final slice drop rows from the end instead
    # of selecting the top rows, so reject it explicitly
    if k < 0:
        raise ValueError("k must be non-negative")

    sales = (
        data.groupby(["product_id", "product_name"])["order_id"]
        .count()
        .to_frame(name="sales")
        .sort_values(by="sales", ascending=False)
        .reset_index()
    )

    if k > sales.shape[0]:
        raise ValueError("k must not exceed the number of items in the dataset")

    return sales.iloc[:k]

#### Evaluate the baseline approach

In [285]:
# Rank products using the training orders only and keep the 1000 best sellers.
baseline_pred = baseline(mma_train, 1000)
# Preview the highest-selling products (shown as this cell's output).
baseline_pred.head()

Unnamed: 0,product_id,product_name,sales
0,24852,Banana,11623
1,13176,Bag of Organic Bananas,9335
2,21137,Organic Strawberries,6466
3,21903,Organic Baby Spinach,5947
4,47209,Organic Hass Avocado,5081


In [286]:
# Use the product ids selected by the baseline as the in-aisle assortment.
baseline_insta_aisle = baseline_pred["product_id"].tolist()
# Fail fast if the assortment is not exactly 1000 items (the metric functions
# enforce the same size themselves).
if len(baseline_insta_aisle) != 1000:
    raise ValueError("There should be 1000 items in the aisle")
# Score the assortment on the held-out test orders.
met_a = metric_a(mma_test, baseline_insta_aisle)
met_b = metric_b(mma_test, baseline_insta_aisle)

Metric a: 92.42091276128176, Metric b: 53.77052784045304


In [290]:
# Report both metrics together with their unweighted average.
print(f"Metric a: {met_a}, Metric b: {met_b}, Avg Score: {(met_a + met_b) / 2}")

Metric a: 92.42091276128176, Metric b: 53.77052784045304, Avg Score: 73.0957203008674
