# Rotman Data Science Competition
### A simple implementation that uses text infilling to calculate similarity scores between substitute products
## Imports


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
from datasets import Dataset, DatasetDict

In [3]:
import torch

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [5]:
# Timestamp taken once when this cell runs; it gives each run its own
# checkpoint directory.
today_date = datetime.today()

# CSV holding the formatted (source_text, target_text) sentence pairs.
DATA_SAVE_PATH = "./data/formated_data_for_seq2seq.csv"
# Format the timestamp explicitly: the default str(datetime) contains a space
# and colons, which are invalid in Windows path names and awkward in shells.
CHECKPOINT_SAVE_PATH = f"./substitute_classifier/T5/checkpoint_{today_date:%Y-%m-%d_%H-%M-%S}"

## Data Preprocessing
### Turning our data into a format for text infilling

In [6]:
# Switch for the preprocessing section: when True, the raw order export is
# read and re-formatted; when False, every guarded preprocessing cell is skipped.
REPROCESS_DATA = False

if REPROCESS_DATA:
    DATA_PATH = "./data/mma_mart.csv"
    raw_data = pd.read_csv(filepath_or_buffer=DATA_PATH)
    raw_data.head(n=5)

In [7]:
if REPROCESS_DATA:
    # Product lookup table: one row per product_id, indexed by that id.
    # Built as one method chain so nothing is mutated in place on a slice of
    # raw_data (in-place edits on a slice raise SettingWithCopyWarning).
    products = (
        raw_data.loc[:, 'product_id':'product_name']
        .drop_duplicates("product_id")
        .set_index("product_id")
    )
    # Keep only the columns from order_id through product_name. The explicit
    # copy makes raw_data an independent frame, so columns can be added to it
    # later without pandas warning about writing to a slice.
    raw_data = raw_data.loc[:, 'order_id':'product_name'].copy()
    raw_data.head()

In [8]:
if REPROCESS_DATA:
    # Sketch of the sentence each order is eventually rendered as; this cell
    # only builds the per-item text and never calls .format on the template.
    FORMAT = "On order {x} a customer bought {item 1, ... , item y}."

    # Render every product id as "(id <n>)" ...
    raw_data["str_p_id"] = raw_data["product_id"].map("(id {})".format)
    # ... and append it directly to the product name (no separator) to form
    # the text for one order line.
    raw_data["target_text"] = raw_data["product_name"] + raw_data["str_p_id"]
    # product_id, product_name and str_p_id are left on the frame (not dropped).
    raw_data.head()

In [9]:
if REPROCESS_DATA:
    # Fraction of order lines (rows of raw_data) whose text will be hidden.
    MASK_RATE = 0.15
    RANDOM_SEED = 42
    random_generator = np.random.default_rng(RANDOM_SEED)

    # Sample row labels without replacement so no row is picked twice.
    n_masked = int(len(raw_data) * MASK_RATE)
    mask_idx = random_generator.choice(raw_data.index, size=n_masked, replace=False)
    mask_idx[:10]

In [10]:
if REPROCESS_DATA:
    # Copy of raw_data in which the sampled rows have their item text replaced
    # by the "[MASK]" placeholder; raw_data itself is left untouched.
    is_masked = raw_data.index.isin(mask_idx)
    raw_data_masked = raw_data.assign(
        target_text=raw_data["target_text"].mask(is_masked, "[MASK]")
    )
    raw_data_masked.head()

In [11]:
if REPROCESS_DATA:
    # Empty frame with one row per distinct order id, in first-seen order;
    # the two text columns start out unfilled.
    order_ids = raw_data["order_id"].unique()
    formated_data = pd.DataFrame(index=order_ids, columns=["source_text", "target_text"])
    formated_data.head(n=10)

In [12]:
if REPROCESS_DATA:
    FORMAT = "On order {} a customer bought {}."

    def _order_sentences(frame):
        """Return one formatted sentence per order, indexed by order_id.

        `frame` must have an `order_id` column and a string `target_text`
        column. Item texts are joined with ", " per order. Grouping always
        joins whole item strings, so an order with a single item yields that
        item unchanged (a per-order `.loc` lookup returns a bare string in
        that case, and joining a bare string splits it into characters).
        `sort=False` keeps orders in first-appearance order.
        """
        items = frame.groupby("order_id", sort=False)["target_text"].agg(", ".join)
        return pd.Series(
            [FORMAT.format(order_id, text) for order_id, text in items.items()],
            index=items.index,
        )

    # raw_data / raw_data_masked are only read here, never re-indexed, so the
    # cell can be re-run without a KeyError on "order_id".
    formated_data = pd.DataFrame({
        "source_text": _order_sentences(raw_data_masked),
        "target_text": _order_sentences(raw_data),
    })

    # The order id is already embedded in each sentence, so the index is not written.
    formated_data.to_csv(DATA_SAVE_PATH, index=False)

## Training

In [13]:
# Load the formatted sentence pairs from the CSV at DATA_SAVE_PATH and preview them.
my_data = pd.read_csv(filepath_or_buffer=DATA_SAVE_PATH)
my_data.head(n=5)

Unnamed: 0,source_text,target_text
0,"On order 1 a customer bought [MASK], Organic 4...",On order 1 a customer bought Bulgarian Yogurt(...
1,On order 2 a customer bought Organic Egg White...,On order 2 a customer bought Organic Egg White...
2,On order 3 a customer bought Total 2% with Str...,On order 3 a customer bought Total 2% with Str...
3,"On order 4 a customer bought [MASK], Honey/Lem...",On order 4 a customer bought Plain Pre-Sliced ...
4,On order 5 a customer bought Bag of Organic Ba...,On order 5 a customer bought Bag of Organic Ba...


### 3. Set model, tokenizer, and data_collator variables

In [14]:
# Tokenizer and model are loaded from the same pretrained checkpoint name.
PRETRAINED_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_NAME)
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_NAME)

# The collator is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### 4. Get data and divide into train, eval, and test sets
We use 80% of the data for training, 10% for evaluation, and 10% for testing.

In [15]:
TRAIN_SPLIT_SEED = 42

# 80% of the rows go to training ...
train_df = my_data.sample(frac=0.8, random_state=TRAIN_SPLIT_SEED)
# ... and the remaining rows are split in half between evaluation and test.
holdout_df = my_data.drop(train_df.index)
eval_df = holdout_df.sample(frac=0.5, random_state=TRAIN_SPLIT_SEED)
test_df = holdout_df.drop(eval_df.index)
train_df.head()

Unnamed: 0,source_text,target_text
22652,On order 23139 a customer bought Organic Avoca...,On order 23139 a customer bought Organic Avoca...
42695,On order 43680 a customer bought First Prunes(...,On order 43680 a customer bought First Prunes(...
38279,"On order 39159 a customer bought [MASK], Koshe...",On order 39159 a customer bought Organic Diced...
78622,On order 80380 a customer bought Organic Baby ...,On order 80380 a customer bought Organic Baby ...
15252,On order 15570 a customer bought Sharp Cheddar...,On order 15570 a customer bought Sharp Cheddar...


### 5. Create a dataset dict from the dataframes

In [16]:
# Wrap each pandas split in a Dataset, keyed by split name.
split_frames = {"train": train_df, "eval": eval_df, "test": test_df}
my_dataset = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)
my_dataset.shape

{'train': (78266, 3), 'eval': (9784, 3), 'test': (9783, 3)}

Downsample to 10,000 training, 1,000 evaluation, and 1,000 test examples to keep training fast. Note that all three subsets are drawn from the training split created above.

In [17]:
train_size = 10_000
# Evaluation and test subsets are each a tenth of the training subset.
eval_size = test_size = train_size // 10

# First cut: training subset vs. a combined eval+test pool, both taken from
# my_dataset["train"].
down_sampled_ds = my_dataset["train"].train_test_split(
    train_size=train_size,
    test_size=eval_size + test_size,
    seed=TRAIN_SPLIT_SEED,
)
# Second cut: divide the pool into the evaluation and test subsets.
test_valid = down_sampled_ds["test"].train_test_split(
    train_size=eval_size,
    test_size=test_size,
    seed=TRAIN_SPLIT_SEED,
)
down_sampled_ds["eval"], down_sampled_ds["test"] = test_valid["train"], test_valid["test"]
down_sampled_ds

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', '__index_level_0__'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['source_text', 'target_text', '__index_level_0__'],
        num_rows: 1000
    })
    eval: Dataset({
        features: ['source_text', 'target_text', '__index_level_0__'],
        num_rows: 1000
    })
})

### 6. Tokenize the dataset

In [18]:
# Upper bound on token count for both inputs and labels; longer texts are truncated.
MAX_LENGTH = 512


def tokenize(source_texts, target_texts):
    """Tokenize source/target text into model inputs with a `labels` entry.

    Args:
        source_texts: a string or list of strings with the masked sentences.
        target_texts: a string or list of strings with the unmasked sentences.

    Returns:
        The tokenizer output for `source_texts`, with `labels` set to the
        `input_ids` obtained by tokenizing `target_texts` as targets.
    """
    model_inputs = tokenizer(text=source_texts, max_length=MAX_LENGTH, truncation=True)
    labels = tokenizer(text_target=target_texts, max_length=MAX_LENGTH, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# batched=True hands the tokenizer whole batches of strings rather than one
# example per call; the resulting token ids are the same, with far fewer calls.
tokenized_dataset = down_sampled_ds.map(
    tokenize,
    batched=True,
    input_columns=["source_text", "target_text"],
    remove_columns=["source_text", "target_text"],
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### 7. Set training arguments
Change "output_directory" to your desired output directory. You can also change the batch_size, learning_rate, num_train_epochs and other parameters here. See the documentation for more details: [https://huggingface.co/docs/transformers/v4.21.3/en/main_classes/trainer#transformers.TrainingArguments](https://huggingface.co/docs/transformers/v4.21.3/en/main_classes/trainer#transformers.TrainingArguments)

In [19]:
# Set to True when training on a CUDA GPU; it only controls whether fp16
# mixed precision is requested.
ON_CUDA_GPU = False

# One argument set for both cases: fp16 is the only setting that differs, so
# it follows the flag instead of keeping two copies of the call that can drift.
training_arguments = Seq2SeqTrainingArguments(
    "output_directory",
    learning_rate=0.0001,
    weight_decay=0.01,
    fp16=ON_CUDA_GPU,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    report_to="all"
)

### 8. Create a trainer

In [20]:
# Wire the model, training arguments, tokenized train/eval subsets, collator
# and tokenizer into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    tokenizer=tokenizer,
)

Enabling MPS

In [21]:
# Display the installed PyTorch version.
torch.__version__

'2.2.0.dev20231001'

In [22]:
# Configure the MPS backend when PyTorch reports it as available; otherwise
# just say so.
if not torch.backends.mps.is_available():
    print("MPS is not available")
else:
    mps_device = torch.device("mps")
    # NOTE(review): a fraction of 0.0 is documented by PyTorch as removing the
    # per-process allocation limit rather than allowing zero memory — confirm
    # this is the intent.
    torch.mps.set_per_process_memory_fraction(0.0)

In [23]:
# Display whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

False

### 9. Train the model

In [None]:
# Start training with the trainer built earlier in the notebook.
trainer.train()

### 10. Save the tokenizer and model

In [None]:
# Write the tokenizer first, then the model, into the same checkpoint directory.
for artifact in (tokenizer, model):
    artifact.save_pretrained(CHECKPOINT_SAVE_PATH)
