# Create sentence embeddings for (query, product) pairs and save them

In [2]:
# Mount Google Drive so the project folder is reachable from the Colab filesystem.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
% cd drive/MyDrive/sna/

/content/drive/MyDrive/sna


In [4]:
! pip install -U -q sentence-transformers
! pip install -q transformers

[K     |████████████████████████████████| 79 kB 5.1 MB/s 
[K     |████████████████████████████████| 4.0 MB 53.6 MB/s 
[K     |████████████████████████████████| 1.2 MB 50.0 MB/s 
[K     |████████████████████████████████| 77 kB 6.5 MB/s 
[K     |████████████████████████████████| 895 kB 53.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 47.2 MB/s 
[K     |████████████████████████████████| 596 kB 45.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


## Check out GPU

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Apr 26 04:52:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Check out RAM Size

In [5]:
from psutil import virtual_memory

# Total memory of the runtime in decimal gigabytes (bytes / 1e9).
# Note: the message below says "available", but the figure is the total.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are reported as high-RAM.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


## Import all packages

In [23]:
import csv
import gzip
import logging
import math
import os
import pickle
import sys
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sentence_transformers import InputExample
from sentence_transformers import LoggingHandler, util
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

## Read in the three datasets: product catalogue, training, and test

In [7]:
# Load the product catalogue, the training set and the public test set.
# Every column is read with the pandas "string" dtype.
product, train, test = (
    pd.read_csv(csv_path, dtype="string")
    for csv_path in (
        'data/product_catalogue-v0.2.csv',
        "data/train-v0.2.csv",
        "data/test_public-v0.2.csv",
    )
)

## Join product dataset with training dataset on product id

In [8]:
# Attach the catalogue attributes to every training row that has a matching
# product_id, then replace missing values with empty strings so the later
# string concatenation never yields <NA>.
# NOTE(review): validate="many_to_many" performs no uniqueness check; if a
# product_id can occur more than once in the catalogue (TODO confirm), this
# join multiplies the training rows.
alled = pd.merge(
    train,
    product,
    on="product_id",
    how="inner",
    validate="many_to_many",
).fillna("")

## Concatenate each query and its matched product attributes into one column

In [14]:
# Build one text per row: "<query>: <title>. <brand>. <color>. <bullets>. <description>."
attribute_columns = [
    'product_title',
    'product_brand',
    'product_color_name',
    "product_bullet_point",
    'product_description',
]

# Join the product attributes with ". " in the order listed above.
product_text = alled[attribute_columns[0]]
for attribute in attribute_columns[1:]:
    product_text = product_text + ". " + alled[attribute]

alled['query_pro'] = alled['query'] + ": " + product_text + "."

# Show the distinct relevance labels present in the merged data.
alled['esci_label'].unique()

<StringArray>
['irrelevant', 'exact', 'substitute', 'complement']
Length: 4, dtype: string

In [11]:
# Preview the concatenated query/product text column.
alled.loc[:, 'query_pro']

0         !awnmower tires without rims: American Lawn Mo...
1         landmowers: American Lawn Mower Company 1204-1...
2         lawn mower without motor: American Lawn Mower ...
3         reel mower with grass catcher: American Lawn M...
4         !awnmower tires without rims: Oregon 72-107 Un...
                                ...                        
793361    ﾒｽﾃｨﾝ: メスティン アルミ飯盒 固形燃料ストーブ ポットホルダー 風除板 9枚付き キ...
793362    ﾒｽﾃｨﾝ: メスティン キャンプ用メスティン アルミ飯盒 ポータブル飯盒 ハンゴウ クッカ...
793363    ﾒｽﾃｨﾝ: トランギア(trangia) メスティン(TR-210) [並行輸入品]. t...
793364    ﾒｽﾃｨﾝ: MiliCamp MR-250 メスティン4点セットキャンプ 飯ごう バリ取り...
793365    ﾒｽﾃｨﾝ: Platinum Loops メスティン ラージメスティン アルミ飯盒 キャン...
Name: query_pro, Length: 793366, dtype: string

## Create sentence embedding for `query_pro` column

In [17]:
# Load the multilingual sentence encoder and smoke-test it on three greetings.
model_name = 'sentence-transformers/use-cmlm-multilingual'

# device=None lets SentenceTransformer use the GPU when one is available and
# fall back to the CPU otherwise, instead of failing on a CPU-only runtime.
embedder = SentenceTransformer(model_name, device=None)

embeddings = embedder.encode(['Hello World', 'Hallo Welt', 'Hola mundo'], show_progress_bar=True)
print(embeddings)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/sentence-transformers_use-cmlm-multilingual/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[ 0.03651673 -0.00734439  0.02748546 ...  0.04279241  0.01741163
  -0.00943234]
 [ 0.03533889  0.00157126  0.02592023 ...  0.03731735  0.01532319
  -0.01807112]
 [ 0.0349112  -0.00218738  0.02584786 ...  0.04132452  0.02516746
  -0.00748832]]


## Subset the matched dataset and only run embedding on it

In [19]:
# Fixed seed so the splits and the subsamples are the same on every run.
SEED = 42

# Number of rows to keep from each split.
train_count = 1000
train_flag = True
dev_count = 200
dev_flag = True
test_count = 200
test_flag = True
# Share of all rows that goes to train+dev; the remainder is the test split.
frac_train = 0.9
# Despite its name, this is the share of train+dev kept for training;
# the remainder becomes the dev split.
frac_test = 0.8

# NOTE: `train` and `test` are rebound here and no longer refer to the frames
# read from the CSV files, so the merge cell must not be re-run after this one
# without reloading the data.
train_all, test = train_test_split(alled, train_size=frac_train, shuffle=True, random_state=SEED)
train, dev = train_test_split(train_all, train_size=frac_test, shuffle=True, random_state=SEED)
print('Before training shape:',train.shape)
print('Before dev shape:',dev.shape)
print('Before test shape:',test.shape)

# Draw the small working subsets and give each a fresh 0..n-1 index.
train = train.sample(n=train_count, random_state=SEED).reset_index(drop=True)
dev = dev.sample(n=dev_count, random_state=SEED).reset_index(drop=True)
test = test.sample(n=test_count, random_state=SEED).reset_index(drop=True)

print('After training shape:',train.shape)
print('After dev shape:',dev.shape)
print('After test shape:',test.shape)

Before training shape: (571223, 12)
Before dev shape: (142806, 12)
Before test shape: (79337, 12)
After training shape: (1000, 12)
After dev shape: (200, 12)
After test shape: (200, 12)


In [21]:
# List the contents of the current working directory.
!ls

01-read-data.ipynb			    05-Train-CrossEncoder-scores.ipynb
02-sentence-embedding.ipynb		    data
02-TrainingBert.ipynb			    output
03-Word2Vec.ipynb			    README.md
04-Train-CrossEncoder-classification.ipynb


In [24]:
# Encode the three sampled splits and store each one, with its labels, on disk.

# open(..., "wb") does not create missing folders, so make sure the output
# directory exists before writing.
EMBEDDING_DIR = 'embedding'
os.makedirs(EMBEDDING_DIR, exist_ok=True)

start = time.time()

train_embeddings = embedder.encode(train['query_pro'].tolist(), show_progress_bar=True)
dev_embeddings = embedder.encode(dev['query_pro'].tolist(), show_progress_bar=True)
test_embeddings = embedder.encode(test['query_pro'].tolist(), show_progress_bar=True)

end = time.time()
# Encoding time in minutes.
print((end - start)/60)

# Store labels & embeddings on disc, one pickle file per split.
# NOTE: the 'sentences' key holds the esci_label values, not the encoded text;
# the key name is left unchanged so existing readers of these files keep working.
for split_name, split_frame, split_embeddings in (
    ('train', train, train_embeddings),
    ('dev', dev, dev_embeddings),
    ('test', test, test_embeddings),
):
    out_path = os.path.join(EMBEDDING_DIR, f'{split_name}_embeddings.pkl')
    with open(out_path, "wb") as fOut:
        pickle.dump(
            {'sentences': split_frame['esci_label'].tolist(), 'embeddings': split_embeddings},
            fOut,
            protocol=pickle.HIGHEST_PROTOCOL,
        )

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

0.1652729312578837
