# Rotman Data Science Competition
### Section 4.3: Using sentence similarity to calculate similarity scores between substitute products
## 0. Installs and Imports


In [None]:
%%capture
DO_INSTALLS = False
if DO_INSTALLS:
  !pip install sentence-transformers
  !pip install datasets

In [None]:
import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer, models, InputExample, losses
import torch
from torch.utils.data import DataLoader

## 1. Data Preprocessing

In [None]:
# Read the augmented order data, discard two columns, and preview the first
# rows of what is left.
data = pd.read_csv("data/mma_mart_augmented.csv").drop(
    columns=['portion_of_order', 'b_score (full dataset)']
)
data.head()

## Hyperparameters

In [None]:
# Occurrence threshold for the product filter below: a product is kept only
# when it appears in more than n_min rows.
n_min = 100

### Drop Uncommon Products


In [None]:
# (rows, columns) of the data before uncommon products are dropped.
data.shape

In [None]:
# Keep only products that occur in strictly more than n_min rows; products at
# or below that threshold are dropped. Occurrences are counted once with
# value_counts and mapped back onto the rows, which avoids calling a Python
# lambda for every product group. Rows with a missing product_name receive no
# count, compare as False, and are dropped as well.
product_counts = data['product_name'].value_counts()
cleaned_data = data[data['product_name'].map(product_counts) > n_min]
cleaned_data.shape

### Make pairs of adjacent products within the same order

In [None]:
# Pair every row with the row directly after it: the two added columns hold
# the next row's order id and product name.
processed_data = cleaned_data[['order_id', 'product_name']]
processed_data = processed_data.assign(
    order_2_id=processed_data['order_id'].shift(-1),
    product_2_name=processed_data['product_name'].shift(-1),
)
# Keep a pair only when both rows belong to the same order. The final row has
# no successor, so its shifted values are missing and it never matches.
same_order_mask = processed_data['order_id'].eq(processed_data['order_2_id'])
processed_data = processed_data[same_order_mask]

In [None]:
# The order ids were only needed to match rows; keep just the product names.
processed_data = processed_data.drop(columns=['order_id', 'order_2_id'])
processed_data.head()

In [None]:
# Wrap each product name in a one-element list so that the two columns can be
# joined into two-item lists in the next step.
list_of_product_1 = processed_data['product_name'].map(lambda name: [name])
list_of_product_2 = processed_data['product_2_name'].map(lambda name: [name])

In [None]:
# Adding two Series of lists concatenates the lists row by row, so each
# element becomes a two-item [product_name, product_2_name] pair.
product_pairs = list_of_product_1 + list_of_product_2
product_pairs.head()

In [None]:
# Store all pairs under the single key "train"; this dict is passed to
# Dataset.from_dict below and the pairs are read back via my_dataset['train'].
dataset_dict = {"train": product_pairs.to_list()}

## 2. Sentence Transformer

In [None]:
# Load the 'sentence-transformers/stsb-roberta-base' checkpoint as the starting
# point; it is fine-tuned on the product pairs by model.fit further down.
model = SentenceTransformer('sentence-transformers/stsb-roberta-base')

In [None]:
# Build a datasets.Dataset from the dict holding the product pairs.
my_dataset = Dataset.from_dict(dataset_dict)

In [None]:
# Display the dataset summary.
my_dataset

In [None]:
# Inspect the first product pair.
my_dataset['train'][0]

In [None]:
# Turn every product pair into an InputExample whose texts are the two
# product names of that pair.
train_data = my_dataset['train']
n_examples = len(train_data)
train_examples = [
    InputExample(texts=[pair[0], pair[1]])
    for pair in train_data
]

In [None]:
# Training setup: shuffled batches of 64 examples, a multiple-negatives
# ranking loss bound to the model, and 10 epochs.
num_epochs = 10
train_dataloader = DataLoader(train_examples, batch_size=64, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
# Warm up over the first 10% of all training steps (batches per epoch * epochs).
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_training_steps * 0.1)

In [None]:
# that's the sentence transformer
print(model.max_seq_length)
# that's the underlying transformer
print(model[0].auto_model.config.max_position_embeddings)

In [None]:
# Fine-tune the model on the product pairs with the loss, epoch count and
# warmup steps configured above.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps)

In [None]:
# Directory the fine-tuned model is written to. An empty string is not a
# usable target (the output directory cannot be created from it, so the save
# fails after training has finished), hence a concrete default is used here.
# Change it to wherever the model should live.
SAVE_PATH = "models/stsb-roberta-base-product-pairs"
model.save(SAVE_PATH)
