# Rotman Data Science Competition
### Section 4.3: Using sentence similarity to calculate similarity score between different substitute products
## 0. Imports


In [1]:
import pandas as pd

## 1. Data Preprocessing

In [2]:
data = pd.read_csv("data/mma_mart_augmented.csv")
data.drop(columns=['portion_of_order', 'b_score (full dataset)'], inplace=True)
data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department,order_size
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs,8
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs,8
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce,8
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce,8
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods,8


In [3]:
id_to_products = data.groupby("order_id")["product_id"].apply(set)
id_to_aisles = data.groupby("order_id")["aisle_id"].apply(set)
id_to_departments = data.groupby("order_id")["department_id"].apply(set)

In [4]:
orders = pd.DataFrame({"product_ids": id_to_products, "aisle_ids": id_to_aisles, "department_ids": id_to_departments})
order_id_to_order_size = data.groupby("order_id")["order_id"].count()
orders["order_size"] = orders.index.map(order_id_to_order_size)
orders.head()

Unnamed: 0_level_0,product_ids,aisle_ids,department_ids,order_size
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"{11109, 10246, 47209, 43633, 49683, 22035, 493...","{108, 83, 21, 120, 24, 95}","{16, 4, 15}",8
2,"{33120, 17794, 40141, 9327, 30035, 43668, 2898...","{104, 105, 17, 19, 83, 86, 88, 123}","{16, 4, 13}",9
3,"{17668, 24838, 17704, 46667, 21903, 17461, 326...","{35, 91, 112, 83, 120, 123}","{16, 3, 4, 12}",8
4,"{26434, 32645, 10054, 21351, 22598, 39758, 348...","{64, 3, 107, 11, 78, 48, 125, 93, 31}","{3, 7, 11, 14, 19}",13
5,"{48002, 45698, 18569, 37011, 15005, 8479, 9633...","{1, 131, 4, 21, 24, 32, 33, 45, 49, 54, 61, 78...","{4, 6, 7, 9, 11, 12, 13, 16, 17, 19, 20}",26


In [5]:
orders.sort_values(by="order_size", ascending=True, inplace=True)
orders = orders[orders["order_size"] < 10]
orders = orders[orders["order_size"] > 2]
orders.shape

(45181, 4)

In [6]:
%%time

from time import time


cur_time = time()
count = 0

similar_orders = []
for i in orders.index:

    if count % 10 == 0:
        time_elapsed_for_10_iters = time() - cur_time
        print(f"Finished {count} orders in {time_elapsed_for_10_iters} seconds")
        cur_time = time()

        # Print expected time to finish
        num_orders_left = orders.shape[0] - count
        expected_time_left = (num_orders_left / 10) * time_elapsed_for_10_iters
        print(f"Expected time left: {expected_time_left} seconds")

    count += 1

    order_i = orders.loc[i]
    i_size = len(order_i["product_ids"])
    for j in orders.index:
        order_j = orders.loc[j]
        j_size = len(order_j["product_ids"])
        if i <= j:
            continue
        elif j_size < i_size - 1:
            continue
        elif j_size > i_size + 1:
            break
        else:
            product_intersection = order_i["product_ids"].intersection(order_j["product_ids"])
            percent_prod_shared = len(product_intersection) / i_size

            is_prod_shared = percent_prod_shared > 0.5
            if is_prod_shared:
                similar_orders.append((i, j))
                break

Finished 0 orders in 1.8835067749023438e-05 seconds
Expected time left: 0.0850987195968628 seconds
Finished 10 orders in 7.91727089881897 seconds
Expected time left: 35763.10437705517 seconds
Finished 20 orders in 7.700314044952393 seconds
Expected time left: 34775.388258409504 seconds
Finished 30 orders in 8.669085264205933 seconds
Expected time left: 39141.78687641621 seconds
Finished 40 orders in 10.215246200561523 seconds
Expected time left: 46112.64287395478 seconds
Finished 50 orders in 8.47151517868042 seconds
Expected time left: 38232.7951529026 seconds
Finished 60 orders in 11.255697250366211 seconds
Expected time left: 50786.83156337738 seconds
Finished 70 orders in 10.209503889083862 seconds
Expected time left: 46056.092994046216 seconds
Finished 80 orders in 10.036381006240845 seconds
Expected time left: 45265.08197624684 seconds
Finished 90 orders in 8.824197053909302 seconds
Expected time left: 39789.18693578243 seconds
Finished 100 orders in 10.871473789215088 seconds
Ex

KeyboardInterrupt: 

In [9]:
count

157

In [10]:
len(similar_orders)

35

In [22]:
similar_orders[:10]

[(60365, 50921),
 (15991, 6068),
 (84571, 76816),
 (84043, 50312),
 (60095, 15201),
 (84846, 69424),
 (14368, 7563),
 (36623, 15598),
 (14955, 8230),
 (35963, 5815)]

In [14]:
orders.head()

Unnamed: 0_level_0,product_ids,aisle_ids,department_ids,order_size
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60366,"{274, 22667, 31509}","{43, 52, 87}","{17, 3, 1}",3
60365,"{9896, 18531, 27845}","{106, 84, 53}","{16, 12}",3
60096,"{15200, 30720, 11365}","{32, 83}",{4},3
61999,"{3856, 35762, 24518}","{96, 3, 52}","{1, 19, 20}",3
15991,"{28626, 16797, 6615}","{24, 21, 111}","{16, 17, 4}",3


In [19]:
orders.loc[[60365, 50921]]

Unnamed: 0_level_0,product_ids,aisle_ids,department_ids,order_size
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60365,"{9896, 18531, 27845}","{106, 84, 53}","{16, 12}",3
50921,"{21938, 18531, 27845}","{83, 84, 53}","{16, 4}",3


In [23]:
data[data["order_id"] == 84043]

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department,order_size
828930,84043,34126,Organic Italian Parsley Bunch,16,fresh herbs,4,produce,3
828931,84043,42701,Organic Sour Cream,108,other creams cheeses,16,dairy eggs,3
828932,84043,34358,Garlic,83,fresh vegetables,4,produce,3


In [24]:
data[data["order_id"] == 50312]

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department,order_size
496687,50312,34126,Organic Italian Parsley Bunch,16,fresh herbs,4,produce,3
496688,50312,34358,Garlic,83,fresh vegetables,4,produce,3
496689,50312,13198,85% Lean Ground Beef,122,meat counter,12,meat seafood,3
