# Rotman Data Science Competition
### Section 4.3: Using sentence similarity to calculate similarity score between different substitute products
## 0. Installs and Imports


In [None]:
%%capture
# Dependency installation, controlled by the two flags below.
# The branches are mutually exclusive (if/elif): when DO_INSTALLS_FOR_TRAINING
# is True, only the training packages are installed and the evaluation-only
# packages (hdbscan, umap-learn) are skipped.
DO_INSTALLS_FOR_TRAINING = False
DO_INSTALL_FOR_EVALUATION = True

if DO_INSTALLS_FOR_TRAINING:
  # Packages needed for the fine-tuning section.
  !pip install sentence-transformers
  !pip install datasets
elif DO_INSTALL_FOR_EVALUATION:
  # Packages needed for the clustering section.
  !pip install sentence-transformers
  # Disabled: optional GPU packages from the pypi.nvidia.com index.
  # !pip install \
  #   --extra-index-url=https://pypi.nvidia.com \
  #   cudf-cu11 dask-cudf-cu11 cuml-cu11 cugraph-cu11 cuspatial-cu11 cuproj-cu11 cuxfilter-cu11 cucim
  !pip install hdbscan
  !pip install umap-learn

In [None]:
import pandas as pd
# from datasets import Dataset
from sentence_transformers import SentenceTransformer, models, InputExample, losses, util
# import torch
# from torch.utils.data import DataLoader

## 1. Data Preprocessing

### Data import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the augmented order data from Drive and drop the
# `portion_of_order` and `b_score (full dataset)` columns.
data = pd.read_csv(
  "/content/drive/My Drive/rotman_datascience_competition/mma_mart_augmented.csv"
).drop(columns=['portion_of_order', 'b_score (full dataset)'])
data.head()

### Hyperparameters

In [None]:
n_min = 100

### Drop Uncommon Products


In [None]:
data.shape

In [None]:
# Keep only the rows whose product occurs in more than n_min rows
# (products with n_min or fewer occurrences are dropped).
cleaned_data = data[
  data['product_name'].map(data['product_name'].value_counts()) > n_min
]
cleaned_data.shape

### Make pairs of products that appear in the same order

In [None]:
# Pair every row with the row directly below it, then keep a pair only when
# both rows carry the same order_id (so only adjacent rows are paired).
# NOTE(review): assumes the rows of one order are contiguous in cleaned_data - confirm.
processed_data = (
  cleaned_data[['order_id', 'product_name']]
  .assign(
    order_2_id=lambda frame: frame['order_id'].shift(-1),
    product_2_name=lambda frame: frame['product_name'].shift(-1),
  )
  .loc[lambda frame: frame['order_id'] == frame['order_2_id']]
)

In [None]:
# Discard the two order-id columns, leaving only the product-name columns.
processed_data = processed_data.drop(columns=['order_id', 'order_2_id'])
processed_data.head()

In [None]:
# Wrap each product name in its own single-element list.
list_of_product_1 = processed_data['product_name'].map(lambda name: [name])
list_of_product_2 = processed_data['product_2_name'].map(lambda name: [name])

In [None]:
# Adding the two Series concatenates the single-element lists row by row,
# giving one [product_name, product_2_name] list per pair.
product_pairs = list_of_product_1 + list_of_product_2
product_pairs.head()

In [None]:
dataset_dict = {"train": product_pairs.to_list()}

## 2. Sentence Transformer Fine-tuning

In [None]:
model = SentenceTransformer('sentence-transformers/stsb-roberta-base')

In [None]:
# NOTE(review): `Dataset` is not imported in the visible notebook - the
# `from datasets import Dataset` line in the imports cell is commented out and
# must be re-enabled (with `datasets` installed) before this cell can run.
my_dataset = Dataset.from_dict(dataset_dict)

In [None]:
my_dataset

In [None]:
my_dataset['train'][0]

In [None]:
# Turn every stored product pair into an InputExample holding the two names.
train_data = my_dataset['train']
n_examples = len(train_data)
train_examples = [
  InputExample(texts=[pair[0], pair[1]])
  for pair in (train_data[idx] for idx in range(n_examples))
]

In [None]:
# `DataLoader` is used below, but its import in the imports cell is commented
# out; import it here so this cell does not fail with a NameError.
from torch.utils.data import DataLoader

# Shuffled mini-batches of 64 product pairs.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
# Ranking loss over the paired texts of each batch.
train_loss = losses.MultipleNegativesRankingLoss(model=model)
num_epochs = 10
# Warm-up length: 10% of the total number of training steps
# (batches per epoch * number of epochs).
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

In [None]:
# that's the sentence transformer
print(model.max_seq_length)
# that's the underlying transformer
print(model[0].auto_model.config.max_position_embeddings)

In [None]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps)

In [None]:
SAVE_PATH = "/content/drive/My Drive/rotman_datascience_competition/st_checkpoint_final"
model.save(SAVE_PATH)

## 3. Clustering For Substitute Identification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
import umap
import sklearn.cluster as cluster

### Load & Test Model

In [None]:
SAVE_PATH = "/content/drive/My Drive/rotman_datascience_competition/st_checkpoint_final"
my_model = SentenceTransformer(SAVE_PATH)

In [None]:
# Testing out model
item = "bananas"
substitute_candidate_1 = "Bag of Bananas"
substitute_candidate_2 = "Cucumber Kirby"
substitute_candidate_3 = "Organic Bananas"
candidates = [substitute_candidate_1, substitute_candidate_2, substitute_candidate_3]

item_embedding = my_model.encode(item)
candidates_embeddings = my_model.encode(candidates)
print("Similarity:", util.pytorch_cos_sim(item_embedding, candidates_embeddings))

### Process Data For Clustering

In [None]:
# Collect every distinct product name.
# NOTE(review): this reads from `data`, not `cleaned_data`, so products removed
# by the n_min filter earlier are still included here - confirm this is intended.
products = data['product_name'].unique()
print(len(products))
products[:10]

In [None]:
%%time
prod_st_embeddings = my_model.encode(products)
prod_st_embeddings.shape

In [None]:
%%time
# Dimension Reduction
umap_embedder = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine')
prod_umap_embeddings = umap_embedder.fit_transform(prod_st_embeddings)
prod_umap_embeddings.shape

In [None]:
# Scale each row of the UMAP embedding to unit L2 norm.
# NOTE(review): presumably done so that the Euclidean distances used by the
# clusterers below rank product pairs like cosine distance would - confirm.
normalized_prod_embeddings = prod_umap_embeddings / np.linalg.norm(prod_umap_embeddings, axis=1, keepdims=True)

In [None]:
%%time
# Project the normalized 5-component UMAP embeddings down to 2 components.
# The result is only used as x/y coordinates when plotting the clusters;
# the clustering itself runs on normalized_prod_embeddings.
umap_products_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(normalized_prod_embeddings)

### Helper Functions

In [None]:
def get_clustered_products(cluster_labels, products):
  """Group products by the cluster label assigned to each position.

  Args:
    cluster_labels: Iterable of cluster ids; the i-th id belongs to products[i].
    products: Indexable collection of products, at least as long as
      cluster_labels.

  Returns:
    A dict mapping each cluster id to the list of its products, with both the
    keys and the list entries in order of first appearance.
  """
  grouped = {}
  for position, label in enumerate(cluster_labels):
    # setdefault creates the list the first time a label is seen.
    grouped.setdefault(label, []).append(products[position])
  return grouped

In [None]:
def visualize_clusters(cluster_labels, product_data):
  """Scatter-plot 2-D product coordinates coloured by cluster label.

  Points labelled -1 are treated as outliers and drawn in grey; every other
  point is coloured by its label using the 'hsv_r' colormap, with a colorbar.

  Args:
    cluster_labels: One cluster id per row of product_data.
    product_data: Two-column data used as the x and y coordinates.

  Returns:
    The (fig, ax) pair created for the plot.
  """
  points = pd.DataFrame(product_data, columns=['x', 'y'])
  points['labels'] = cluster_labels

  # Split once on the outlier label instead of filtering twice.
  is_outlier = points['labels'] == -1
  outlier_points = points[is_outlier]
  cluster_points = points[~is_outlier]

  fig, ax = plt.subplots(figsize=(20, 10))
  plt.scatter(outlier_points['x'], outlier_points['y'], color='#BDBDBD', s=0.05)
  plt.scatter(cluster_points['x'], cluster_points['y'],
              c=cluster_points['labels'], s=0.05, cmap='hsv_r')
  plt.colorbar()
  return fig, ax

### Cluster Using HDBSCAN

In [None]:
hdbscan_cluster_maker = hdbscan.HDBSCAN(min_cluster_size=2,
                                        metric='euclidean',
                                        cluster_selection_method='eom')

In [None]:
%%time
hdbscan_clusters = hdbscan_cluster_maker.fit(normalized_prod_embeddings)

In [None]:
fig, ax = visualize_clusters(hdbscan_clusters.labels_, umap_products_data)
plt.show()

In [None]:
clustered_products_hdbscan = get_clustered_products(hdbscan_clusters.labels_, products)
clustered_products_hdbscan[3]

### Cluster Using Agglomerative Clustering

In [None]:
ac_cluster_maker = cluster.AgglomerativeClustering(n_clusters=None,
                                                   distance_threshold=0.001,
                                                   linkage='average')

In [None]:
ac_clusters = ac_cluster_maker.fit(normalized_prod_embeddings)

In [None]:
fig, ax = visualize_clusters(ac_clusters.labels_, umap_products_data)
plt.show()

In [None]:
clustered_products_ac = get_clustered_products(ac_clusters.labels_, products)
len(clustered_products_ac[3])

In [None]:
# Find the cluster id with the longest member list, then show its products.
largest_cluster = max(clustered_products_ac.items(), key=lambda entry: len(entry[1]))[0]
clustered_products_ac[largest_cluster]

### Output Cluster Data

In [None]:
# Export the agglomerative-clustering result as a table.
# from_dict(orient='index') builds one row per cluster id; .T flips that so each
# cluster becomes a column. Clusters shorter than the longest one are padded
# with missing values.
clustered_products = clustered_products_ac
clusters_df = pd.DataFrame.from_dict(clustered_products, orient='index').T
clusters_df

In [None]:
# Write the cluster table to Drive. header=False omits the column labels
# (the cluster ids) from the file; the row index is still written.
clusters_df.to_csv('/content/drive/My Drive/rotman_datascience_competition/clusters_df.csv', header=False)

4.