# Rotman Data Science Competition
## Section 2: Experimentation
### Part 1. Case Studies
#### a) Testing Viability of Phrasing Task as Common NLP Tasks

In [1]:
from transformers import pipeline, set_seed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Probe 1: treat "what else belongs in this order?" as free-form text
# generation, priming the model with two complete orders and the start of a
# third.
# NOTE(review): the recorded output of the next cell starts with
# 'Order 17053 -- Item 1: ...', which does not match this prompt's
# 'Items: ...' format -- the output looks stale; re-run to refresh it.
generator = pipeline(
    "text-generation",
    model="distilgpt2",
)
prompt = (
    "Order 17053 -- Items: Yellow Potato, garlic, Organic Italian Salad, Delmonicos Dress Pepper Rnch, Smoked Uncured Kielbasa. "
    "Order 58719 -- Items: Original Hummus, Sunny Days Strawberry Snack Bars, Chicken Salad, Banana, Organic Cherry Tomatoes. "
    "Order 89470 -- Items: Lime Seltzer, "
)

In [14]:
# Generation here samples from the model (the two returned sequences differ),
# so pin the RNG state first: otherwise Restart & Run All produces different
# continuations on every run and the recorded output cannot be reproduced.
GENERATION_SEED = 42
set_seed(GENERATION_SEED)

# Request two candidate continuations of the order-list prompt.
# NOTE(review): per the generation docs, `max_length` bounds prompt tokens
# plus generated tokens together; consider `max_new_tokens` if the intent is
# to bound only the continuation.
outputs = generator(prompt, max_length=200, num_return_sequences=2)
outputs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Order 17053 -- Item 1: Yellow Potato, Item 2: garlic, Item 3: Organic Italian Salad, Item 4: Delmonicos Dress Pepper Rnch, Item 5: Smoked Uncured Kielbasa, Item 6: Fresh Mushroom Cheese, Item 7: Old Sausage Salad, Item 8: Cheese Wrapping, Item 9: Beef Soup, Item 10: Hot Fried Chicken, Item 11: Cheddar Roasted Meat, Item 12: Green Potatoes, Item 13: Cheddar Cheese, Item 14: Apple Sauce, Item 15: Sweet Potato Soup, Item 16: Potato Soup, Item 17: Carrots, Item 18: Chives, Item 19: Yellow Potatoes, Item 20: Yellow Potatoes, Item 21: Yellow Potatoes, Item 22: Yellow Potatoes, Item 23: Yellow Potatoes, Item 24: Yellow Potatoes, Item 25: Yellow Potatoes, Item 26: Yellow Potatoes, Item 27: Yellow Potatoes'},
 {'generated_text': 'Order 17053 -- Item 1: Yellow Potato, Item 2: garlic, Item 3: Organic Italian Salad, Item 4: Delmonicos Dress Pepper Rnch, Item 5: Smoked Uncured Kielbasa, Item 6: Spicy Spicy Spicy Mushroom, Item 7: The Yellow Potato, Item 8: The Yellow Potato, It

In [16]:
# Probe 2: phrase "which item is missing from the order?" as masked-token
# prediction by hiding the second item of the order behind [MASK].
mask_filler = pipeline(
    "fill-mask",
    model="distilbert-base-uncased",
)
prompt = (
    "Order 17053 -- Item 1: Yellow Potato, Item 2: [MASK], "
    "Item 3: Organic Italian Salad, Item 4: Delmonicos Dress Pepper Rnch, "
    "Item 5: Smoked Uncured Kielbasa."
)
# Keep the ten highest-scoring candidate tokens for the masked slot.
outputs = mask_filler(prompt, top_k=10)

In [17]:
# Share of the returned candidates whose token text contains the expected
# item (substring match), followed by the raw candidates for inspection.
# NOTE(review): if the candidates are distinct tokens, normally at most one
# of them can match, so this ratio tops out around 1/len(outputs). It behaves
# like a scaled "hit within top-k" flag rather than an accuracy -- confirm
# this is the intended metric.
correct_answer = "garlic"
hits = [correct_answer in candidate["token_str"] for candidate in outputs]
accuracy = sum(hits) / len(outputs)
print(f"Accuracy: {accuracy}")
outputs

Accuracy: 0.1


[{'score': 0.21543923020362854,
  'token': 20856,
  'token_str': 'tomato',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : tomato, item 3 : organic italian salad, item 4 : delmonicos dress pepper rnch, item 5 : smoked uncured kielbasa.'},
 {'score': 0.10297118127346039,
  'token': 14557,
  'token_str': 'potato',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : potato, item 3 : organic italian salad, item 4 : delmonicos dress pepper rnch, item 5 : smoked uncured kielbasa.'},
 {'score': 0.040197696536779404,
  'token': 28540,
  'token_str': 'cabbage',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : cabbage, item 3 : organic italian salad, item 4 : delmonicos dress pepper rnch, item 5 : smoked uncured kielbasa.'},
 {'score': 0.030098844319581985,
  'token': 20548,
  'token_str': 'garlic',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : garlic, item 3 : organic italian salad, item 4 : delmonicos dress pepper rnch, item 5 : smok

In [6]:
# Probe 2b: reuse the fill-mask pipeline, this time asking the model to
# complete a "Purpose of Order" slot appended after the full item list.
prompt = (
    "Order 17053 -- Item 1: Yellow Potato, Item 2: garlic, "
    "Item 3: Organic Italian Salad, Item 4: Delmonicos Dress Pepper Rnch, "
    "Item 5: Smoked Uncured Kielbasa. "
    "Purpose of Order: [MASK]."
)
# Ten best candidate tokens for the purpose slot, displayed as the cell result.
outputs = mask_filler(prompt, top_k=10)
outputs

[{'score': 0.17831414937973022,
  'token': 20548,
  'token_str': 'garlic',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : garlic, item 3 : organic italian salad, item 4 : delmonicos dress pepper rnch, item 5 : smoked uncured kielbasa. purpose of order : garlic.'},
 {'score': 0.018526792526245117,
  'token': 20856,
  'token_str': 'tomato',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : garlic, item 3 : organic italian salad, item 4 : delmonicos dress pepper rnch, item 5 : smoked uncured kielbasa. purpose of order : tomato.'},
 {'score': 0.01784643717110157,
  'token': 14629,
  'token_str': 'potatoes',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : garlic, item 3 : organic italian salad, item 4 : delmonicos dress pepper rnch, item 5 : smoked uncured kielbasa. purpose of order : potatoes.'},
 {'score': 0.014760144054889679,
  'token': 24444,
  'token_str': 'onions',
  'sequence': 'order 17053 - - item 1 : yellow potato, item 2 : garlic, i

In [7]:
# Probe 3: zero-shot classification -- score the order text against a fixed
# set of candidate purposes without any task-specific training.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)
prompt = (
    "Order 17053 -- Item 1: Yellow Potato, Item 2: garlic, "
    "Item 3: Organic Italian Salad, Item 4: Delmonicos Dress Pepper Rnch, "
    "Item 5: Smoked Uncured Kielbasa"
)
# Candidate purposes the order is scored against.
cand_labels = ["meal", "baking", "camping", "utility"]
classifier(prompt, cand_labels)

{'sequence': 'Order 17053 -- Item 1: Yellow Potato, Item 2: garlic, Item 3: Organic Italian Salad, Item 4: Delmonicos Dress Pepper Rnch, Item 5: Smoked Uncured Kielbasa',
 'labels': ['meal', 'utility', 'baking', 'camping'],
 'scores': [0.9301444888114929,
  0.05034911260008812,
  0.011968020349740982,
  0.007538415491580963]}

In [18]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and the encoder from one shared checkpoint name so the
# two always come from the same pretrained model.
checkpoint = "sentence-transformers/msmarco-distilbert-base-tas-b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [19]:
# Scenario A: an apple-pie shopping list. Each candidate replaces only the
# first item ("apples") with a possible substitute.
query_pie = "I bought apples, flour, cinnamon, salt, butter, and sugar"
docs_pie = [
    "I bought apple curry, flour, cinnamon, salt, butter, and sugar",
    "I bought apple extract, flour, cinnamon, salt, butter, and sugar",
    "I bought bananas, flour, cinnamon, salt, butter, and sugar",
    "I bought apple juice, flour, cinnamon, salt, butter, and sugar",
]

# Scenario B: a fruit-basket shopping list, with the same idea -- only the
# first item differs between the query and each candidate.
query_basket = "I bought apples, pears, cherries, oranges, AA batteries, and grapes"
docs_basket = [
    "I bought apple candy, pears, cherries, oranges, AA batteries, and grapes",
    "I bought apple extract, pears, cherries, oranges, AA batteries, and grapes",
    "I bought bananas, pears, cherries, oranges, AA batteries, and grapes",
    "I bought apple juice, pears, cherries, oranges, AA batteries, and grapes",
]



In [20]:
# CLS pooling: represent each sequence by the hidden state at position 0
def cls_pooling(model_output):
    """Return the first-position slice of the model's last hidden state.

    Args:
        model_output: Any object exposing a `last_hidden_state` attribute
            that supports `[:, 0]` indexing.

    Returns:
        `model_output.last_hidden_state[:, 0]`, i.e. index 0 taken along the
        second axis for every entry along the first axis.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

# Text -> embedding helper
def encode(texts):
    """Embed one string or a list of strings with the notebook's encoder.

    Relies on the module-level `tokenizer` and `model`, and on `cls_pooling`
    for reducing per-token states to a single vector per input.

    Args:
        texts: Text (or list of texts) passed straight to `tokenizer`.

    Returns:
        The result of `cls_pooling` applied to the model output.
    """
    # Pad/truncate the batch and request PyTorch tensors.
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Forward pass only -- gradient tracking is switched off.
    with torch.no_grad():
        forward_out = model(**batch, return_dict=True)

    return cls_pooling(forward_out)

In [21]:
# Embed the apple-pie candidates and the apple-pie query with the same encoder.
docs_pie_emb = encode(docs_pie)
query_pie_emb = encode(query_pie)

In [22]:
# Dot-product similarity of every candidate embedding against the query
# embedding; `tolist()` yields one single-element list per candidate.
scores = torch.matmul(docs_pie_emb, query_pie_emb.t()).cpu().tolist()

# Pair each candidate sentence with its score and rank best-first.
doc_score_pairs = sorted(
    zip(docs_pie, scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the ranking, highest score first.
for doc, score in doc_score_pairs:
    print(score, doc)


[112.56742858886719] I bought apple juice, flour, cinnamon, salt, butter, and sugar
[112.05168151855469] I bought apple extract, flour, cinnamon, salt, butter, and sugar
[111.874755859375] I bought bananas, flour, cinnamon, salt, butter, and sugar
[110.29551696777344] I bought apple curry, flour, cinnamon, salt, butter, and sugar


This ranking is sensible. Since the shopper is buying the ingredients for an apple pie, they would likely prefer apple juice or apple extract over bananas, and bananas over apple curry, in the event that apples are sold out.

In [13]:
# Fruit-basket scenario: same embed -> dot-score -> rank -> print procedure
# as used for the apple-pie scenario.
# NOTE(review): this duplicates the apple-pie ranking cell; a shared helper
# taking (query, docs) would remove the copy-paste.
query_basket_emb = encode(query_basket)
docs_basket_emb = encode(docs_basket)

similarity = docs_basket_emb @ query_basket_emb.transpose(0, 1)
scores = similarity.cpu().tolist()

doc_score_pairs = list(zip(docs_basket, scores))
doc_score_pairs.sort(key=lambda item: item[1], reverse=True)

for doc, score in doc_score_pairs:
    print(score, doc)

[114.98710632324219] I bought apple juice, pears, cherries, oranges, AA batteries, and grapes
[114.76957702636719] I bought bananas, pears, cherries, oranges, AA batteries, and grapes
[114.55928039550781] I bought apple candy, pears, cherries, oranges, AA batteries, and grapes
[114.43769073486328] I bought apple extract, pears, cherries, oranges, AA batteries, and grapes


This ranking is, again, sensible (though the scores are separated by a smaller margin): somebody putting together a fruit basket would prefer apple juice or bananas over apple candy, and apple candy over apple extract.

Apple Pie Recipe From:
https://www.ricardocuisine.com/en/recipes/6435-apple-pie-the-best

In [1]:
from datasets import load_dataset
# Read the order-line CSV through the `datasets` CSV loader, then show the
# shape of each resulting split.
dataset = load_dataset(
    "csv",
    data_files="data/mma_mart.csv",
)
dataset.shape

  from .autonotebook import tqdm as notebook_tqdm
Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 3862.16it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 163.53it/s]
Generating train split: 987259 examples [00:01, 571513.01 examples/s]


{'train': (987259, 7)}

In [7]:
# Peek at the first record of the train split to see the available columns.
dataset["train"][0]

{'order_id': 1,
 'product_id': 49302,
 'product_name': 'Bulgarian Yogurt',
 'aisle_id': 120,
 'aisle': 'yogurt',
 'department_id': 16,
 'department': 'dairy eggs'}

In [8]:
# Scratch check: character count of one order-line record written out as text.
# NOTE(review): the literal is hand-typed and differs from the record shown
# above -- it has no opening '{' and is missing the "'," after 'yogurt' -- so
# the count is only approximate.
len("'order_id': 1, 'product_id': 49302, 'product_name': 'Bulgarian Yogurt', 'aisle_id': 120, 'aisle': 'yogurt 'department_id': 16, 'department': 'dairy eggs'}")


154