# Rotman Data Science Competition
### A straightforward DistilBERT baseline for predicting substitutes for sold-out goods in Instacart orders
The model is created by fine-tuning DistilBERT on the MMA competition dataset, using the methods described in the Hugging Face NLP tutorial.
### Imports

In [160]:
import collections
import itertools
import os

import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, pipeline, default_data_collator

## Data Preprocessing
### Turning our data into a format suitable for text masking

In [44]:
# Location of the raw competition CSV. As the preview below shows, it holds one
# row per (order_id, product) pair with product, aisle and department columns.
DATA_PATH = "./data/mma_mart.csv"
raw_data = pd.read_csv(DATA_PATH)
# Preview the first rows to check the column layout.
raw_data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,1,49302,Bulgarian Yogurt,120,yogurt,16,dairy eggs
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,other creams cheeses,16,dairy eggs
2,1,10246,Organic Celery Hearts,83,fresh vegetables,4,produce
3,1,49683,Cucumber Kirby,83,fresh vegetables,4,produce
4,1,43633,Lightly Smoked Sardines in Olive Oil,95,canned meat seafood,15,canned goods


In [174]:
# Product lookup table: one row per product_id, holding its product_name.
# Built as a single non-mutating chain: the previous version called
# `inplace=True` methods on a `.loc` slice of `raw_data`, which can trigger
# pandas' SettingWithCopyWarning and hides the data lineage.
products = (
    raw_data.loc[:, "product_id":"product_name"]
    .drop_duplicates("product_id")  # keep the first row seen for each product
    .set_index("product_id")
)

In [45]:
# Keep only the order_id .. product_name column range; the aisle and department
# columns shown in the earlier preview are dropped.
raw_data = raw_data.loc[:, 'order_id':'product_name']
raw_data.head()

Unnamed: 0,order_id,product_id,product_name
0,1,49302,Bulgarian Yogurt
1,1,11109,Organic 4% Milk Fat Whole Milk Cottage Cheese
2,1,10246,Organic Celery Hearts
3,1,49683,Cucumber Kirby
4,1,43633,Lightly Smoked Sardines in Olive Oil


In [46]:
# Put orders into string format
# NOTE(review): FORMAT is only a human-readable sketch of the target sentence.
# No visible cell reads it, and the sentences actually stored use the wording
# "On order <id>, the user purchased <items>" instead.
FORMAT = "On order {x} the user bought {item 1, ... , item y}."

# Disabled experiment: append "(id <product_id>)" to every product name.
# raw_data["str_p_id"] = [f"(id {str(i)})" for i in raw_data["product_id"]]
# raw_data["name_id"] = raw_data["product_name"] + raw_data["str_p_id"]
# raw_data.drop(columns=["product_id", "product_name", "str_p_id"], inplace=True)

In [None]:
# Index the rows by order_id so that all products of one order share an index label.
# NOTE(review): not re-runnable — set_index moves "order_id" out of the columns,
# so executing this cell a second time raises KeyError.
raw_data = raw_data.set_index("order_id")

In [80]:
# Cached CSV with a single "order" column: one sentence per order.
SAVE_PATH = "./data/mma_cart_preprocessed_naive"

# Build the cache only when it is missing, so the notebook also runs from a
# fresh checkout. (Previously the generating code was commented out and referred
# to SAVE_PATH before it was defined.)
if not os.path.exists(SAVE_PATH):
    # groupby accepts "order_id" whether it is still a column or already the
    # index level; sort=False keeps orders in their order of first appearance.
    # Joining per group also handles single-product orders, where
    # `raw_data.loc[idx]["product_name"]` would be a plain string and
    # ", ".join would split it into characters.
    items_per_order = raw_data.groupby("order_id", sort=False)["product_name"].agg(", ".join)
    formated_data = pd.DataFrame(
        {
            "order": [
                f"On order {order_id}, the user purchased {items}"
                for order_id, items in items_per_order.items()
            ]
        }
    )
    formated_data.to_csv(SAVE_PATH, index=False)

# Always read back from disk so downstream cells see exactly what is cached.
formated_data = pd.read_csv(SAVE_PATH)
formated_data.head()

Unnamed: 0,order
0,"On order 1, the user purchased Bulgarian Yogur..."
1,"On order 2, the user purchased Organic Egg Whi..."
2,"On order 3, the user purchased Total 2% with S..."
3,"On order 4, the user purchased Plain Pre-Slice..."
4,"On order 5, the user purchased Bag of Organic ..."


## Load data into a Hugging Face `Dataset` object

In [85]:
# Load the sentence CSV; with a bare `data_files` path every row lands in the
# "train" split of the returned DatasetDict.
data_set = load_dataset("csv", data_files=SAVE_PATH)
# Hold out 20% of the orders. The seed is fixed (same value as the later
# down-sampling split) so the train/test partition is identical on every run.
data_set = data_set['train'].train_test_split(test_size=0.2, seed=42)
data_set

DatasetDict({
    train: Dataset({
        features: ['order'],
        num_rows: 78266
    })
    test: Dataset({
        features: ['order'],
        num_rows: 19567
    })
})

In [84]:
# Pretrained checkpoint that is fine-tuned below; the model (with a masked-LM
# head) and the tokenizer are loaded from the same checkpoint name.
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [87]:
def tokenize_function(examples):
    """Tokenize a batch of order sentences.

    Args:
        examples: batch mapping whose "order" entry holds the sentences.

    Returns:
        The tokenizer output for the batch. When the tokenizer is a fast one,
        a "word_ids" entry is added with, per row, the word index of every
        token as reported by the tokenizer.
    """
    encoded = tokenizer(examples["order"])
    if not tokenizer.is_fast:
        # word_ids() is only requested for fast tokenizers.
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded


# Use batched=True to activate fast multithreading!
# The raw "order" text column is dropped, leaving only the tokenizer outputs.
# The "longer than the specified maximum sequence length" warning printed below
# comes from tokenizing without truncation; the token streams are re-cut into
# chunk_size-token pieces by the grouping cell before they reach the model.
tokenized_datasets = data_set.map(
    tokenize_function, batched=True, remove_columns=["order"]
)
tokenized_datasets

Map:   0%|          | 0/78266 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (648 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/19567 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 78266
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 19567
    })
})

In [89]:
# Number of tokens per training example after re-chunking.
chunk_size = 128

def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-cut it into fixed-size chunks.

    Args:
        examples: batch mapping of column name -> list of per-row lists
            (e.g. "input_ids", "attention_mask", "word_ids").

    Returns:
        dict with the same columns, each a list of `chunk_size`-long lists, plus
        a "labels" column holding an independent copy of every "input_ids"
        chunk. Tokens left over after the last full chunk are dropped.
    """
    # Flatten every column in linear time. `sum(lists, [])` rebuilds the
    # accumulated list on each addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(itertools.chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts (all columns share the same length)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column. Each chunk is copied individually: a shallow
    # `.copy()` of the outer list would leave every label chunk aliased to its
    # input chunk, so masking input_ids in place would also overwrite the labels.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

# Re-chunk both splits; the row counts change because rows are now fixed-size
# token chunks rather than one row per order.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/78266 [00:00<?, ? examples/s]

Map:   0%|          | 0/19567 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 46266
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 11500
    })
})

In [91]:
# Decode the second training chunk back to text. As the output shows, a chunk
# can start mid-order and span several orders separated by [SEP] [CLS].
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'silk pie, pasilla pepper, sharp cheddar cheese, natural sharp cheddar sliced cheese, cranberry juice, lemonade, classic soda, half and half [SEP] [CLS] on order 36247, the user purchased organic raspberries, ice cream, super premium, dutch chocolate, raspberry sorbet pops, stage 1 just peaches baby food, freshly squeezed orange juice [SEP] [CLS] on order 71365, the user purchased organic broccoli, organic hass avocado, organic honey sweet whole wheat bread, avocado roll, organic shredded mild cheddar, honey graham sticks, organic lemon'

In [94]:
# Collator that randomly masks tokens (mlm_probability=0.15) for masked-LM training.
# The Trainer cell passes `data_collator`; with this line commented out that
# cell fails with NameError on a fresh kernel, so the definition must stay live.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [161]:
# Probability with which each word is selected for masking.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Randomly mask whole words in every feature, then collate the batch.

    Each feature must carry "word_ids", "input_ids" and "labels". The feature
    dicts are modified in place: "word_ids" is removed, the tokens of every
    selected word are overwritten with the tokenizer's mask token id, and
    "labels" is replaced by a list that keeps the original label at masked
    positions and -100 everywhere else.

    Args:
        features: list of feature dicts as described above.

    Returns:
        The result of `default_data_collator` applied to the modified features.
    """
    for example in features:
        token_word_ids = example.pop("word_ids")

        # Group token positions by word. A new group starts whenever the word
        # id differs from the last non-None word id seen; tokens without a word
        # id (None) belong to no group.
        word_to_positions = collections.defaultdict(list)
        group_index = -1
        previous_word = None
        for position, wid in enumerate(token_word_ids):
            if wid is None:
                continue
            if wid != previous_word:
                previous_word = wid
                group_index += 1
            word_to_positions[group_index].append(position)

        # One Bernoulli draw per word group decides whether it gets masked.
        selection = np.random.binomial(1, wwm_probability, (len(word_to_positions),))
        token_ids = example["input_ids"]
        original_labels = example["labels"]
        masked_labels = [-100] * len(original_labels)
        chosen_groups = np.where(selection)[0]
        for chosen in chosen_groups:
            for position in word_to_positions[chosen.item()]:
                # Read the label before overwriting the input token.
                masked_labels[position] = original_labels[position]
                token_ids[position] = tokenizer.mask_token_id
        example["labels"] = masked_labels

    return default_data_collator(features)

def whole_product_masking_data_collator(features, products: pd.DataFrame):
    """Collate `features` with random whole-word masking.

    NOTE: product-level masking is not implemented. This function previously
    contained a line-for-line copy of `whole_word_masking_data_collator`, so it
    now delegates to it; the behaviour is unchanged.

    Args:
        features: list of feature dicts carrying "word_ids", "input_ids" and
            "labels"; they are modified in place by the word-level collator.
        products: accepted for interface compatibility but never read.
            TODO: use it to group tokens by product before masking.

    Returns:
        The collated batch produced by `whole_word_masking_data_collator`.
    """
    return whole_word_masking_data_collator(features)

In [162]:
# Sanity check: run the whole-word collator on the first two training chunks
# and decode the result to see which words were replaced by [MASK].
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] on [MASK] 67513, the user purchased [MASK] blueberries clamshell, [MASK] [MASK] cheese, mini seedless watermelon pack, organic strawberries, hass avocado variety [SEP] [CLS] on order 66048, the user purchased [MASK] [MASK] [MASK] whiskey, scoopable scented clumping cat litter [MASK] jalapeno pepper, pure leaf unsweetened [MASK] brewed [MASK] [MASK] beer [MASK] n bratwurst, lemon sparkling water [MASK] spicy brown mustard, [MASK] / [MASK] ground beef, [MASK] hamburger buns, double rolls bath [MASK], paper towels choose - a - [MASK], mega rolls, one - ply, chocolate'

'>>> silk pie, pasilla pepper, [MASK] cheddar cheese, natural sharp cheddar sliced [MASK] [MASK] cranberry [MASK] [MASK] lemonade [MASK] classic [MASK] [MASK] half and half [SEP] [CLS] on [MASK] [MASK] [MASK] [MASK], the user purchased organic raspberries [MASK] ice cream, super [MASK] [MASK] dutch chocolate, raspberry sorbet pops, stage 1 [MASK] peaches baby food [MASK] freshly squeezed orange juice [SEP] [CLS]

In [109]:
# Down-sample for a faster training run: 40,000 chunks for training and 10% of
# that (4,000) for evaluation.
train_size = 40_000
test_size = int(0.1 * train_size)

# Both subsets are drawn from lm_datasets["train"]; lm_datasets["test"] is not
# used here. The fixed seed makes the draw repeatable for a given input.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 4000
    })
})

## Model

In [110]:
# Per-device batch size used for both training and evaluation.
batch_size = 64
# Show the training loss with every epoch
# (logging_steps = number of batches of size `batch_size` in the training set;
# presumably one epoch of steps on a single device — TODO confirm for multi-GPU).
logging_steps = len(downsampled_dataset["train"]) // batch_size
# NOTE(review): model_name is not read by any visible cell.
model_name = model_checkpoint.split("/")[-1]

# Checkpoints and logs go to ./substitute_classifier (existing contents may be
# overwritten); evaluation runs once per epoch and nothing is pushed to the Hub.
training_args = TrainingArguments(
    output_dir="./substitute_classifier",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    logging_steps=logging_steps,
)

In [111]:
# Wire the model, the down-sampled train/eval subsets and the masking collator
# into a Trainer.
# NOTE(review): `data_collator` must already exist in the kernel namespace; it
# is the DataCollatorForLanguageModeling object from the collator cell earlier
# in the notebook, not one of the whole-word collator functions.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

## Training

In [112]:
# Directory the fine-tuned model is saved to (and resumed from when present).
CHECKPOINT_PATH = "./substitute_classifier/naive_distillbert_classifier_checkpoint_40000"
# The first positional argument of Trainer.train is `resume_from_checkpoint`.
# Passing the path unconditionally cannot work on a first run, when the
# directory does not exist yet, so resume only if it is already there and
# otherwise train from the pretrained weights.
resume_from = CHECKPOINT_PATH if os.path.isdir(CHECKPOINT_PATH) else None
trainer.train(resume_from_checkpoint=resume_from)
trainer.save_model(CHECKPOINT_PATH)

Epoch,Training Loss,Validation Loss


## Evaluation

In [113]:
# Evaluate on the Trainer's eval_dataset (downsampled_dataset["test"]) and show the metrics.
trainer.evaluate()

{'eval_loss': 0.1854913979768753,
 'eval_runtime': 52.2443,
 'eval_samples_per_second': 76.563,
 'eval_steps_per_second': 1.206,
 'epoch': 3.0}

In [115]:
# Reload the fine-tuned model from the directory written by trainer.save_model.
my_model = AutoModelForMaskedLM.from_pretrained(CHECKPOINT_PATH)

In [152]:
# Fill-mask pipeline over the fine-tuned model. `targets` restricts the scored
# candidates for [MASK] to these three words instead of the whole vocabulary.
my_pipeline = pipeline("fill-mask", model=my_model, tokenizer=tokenizer, targets=["banana", "jerk", "energy"])

In [153]:
# Example order sentence in which one product has been replaced by [MASK].
sample_input = "On order 50113, the user purchased Organic Peeled Whole Baby Carrots, Zen Tea, Sugar Free Energy Drink, Pretzel Crisps Original Deli Style Pretzel Crackers, Madagascar Vanilla Almond Nuts & Spices Bars, [MASK], Popped Corn Chips Kettle, 100 Calorie  Per Bag Popcorn, Sweet Chipotle Beef Jerky, Chili Lime Beef Jerky Pack, Roasted & Salted Shelled Pistachios, Sugar Free Gum Winter Frost, Brownie Brittle Chocolate Chip, Sunburst Tangerine, Double Chocolate Chip Cookies"

# Presumably the product that [MASK] stands in for — kept for manual comparison;
# no visible cell reads it.
correct_answer = "Crunchy Peanut Butter Energy Bar"

In [154]:
# Show the two highest-scoring target words for the masked position.
my_pipeline(sample_input, top_k = 2)

[{'score': 0.33547356724739075,
  'token': 15212,
  'token_str': 'banana',
  'sequence': 'on order 50113, the user purchased organic peeled whole baby carrots, zen tea, sugar free energy drink, pretzel crisps original deli style pretzel crackers, madagascar vanilla almond nuts & spices bars, banana, popped corn chips kettle, 100 calorie per bag popcorn, sweet chipotle beef jerky, chili lime beef jerky pack, roasted & salted shelled pistachios, sugar free gum winter frost, brownie brittle chocolate chip, sunburst tangerine, double chocolate chip cookies'},
 {'score': 0.00013964167737867683,
  'token': 2943,
  'token_str': 'energy',
  'sequence': 'on order 50113, the user purchased organic peeled whole baby carrots, zen tea, sugar free energy drink, pretzel crisps original deli style pretzel crackers, madagascar vanilla almond nuts & spices bars, energy, popped corn chips kettle, 100 calorie per bag popcorn, sweet chipotle beef jerky, chili lime beef jerky pack, roasted & salted shelled 