In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from annoy import AnnoyIndex
from tqdm import tqdm
import pandas as pd
import numpy as np

import torch
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, BertConfig

import sys
sys.path.append('../src')

In [92]:
products = pd.read_csv('../data/pareto_training.csv')
matching_products = pd.read_csv('../data/pareto_training.csv').dropna(subset=['master_product'])
matching_products = matching_products.reset_index(drop=True)
master_products = matching_products.master_product.unique()
matching_products['master_product_id'] = matching_products.apply(lambda x: np.where(x.master_product == master_products)[0][0], axis=1)
products['master_product_id'] = products.apply(lambda x: np.where(x.master_product == master_products)[0][0] if not pd.isna(x.master_product) else None, axis=1)


In [4]:
from preprocessing.feature_extraction import make_features_index
index = make_features_index('indobenchmark/indobert-base-p1', matching_products)

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
0it [00:00, ?it/s]

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


1it [00:08,  8.80s/it]
845it [00:00, 30449.65it/s]


In [7]:
matches = []
for i, prod in tqdm(matching_products.iterrows()):
    nn = index.get_nns_by_item(i, 10)
    nn = [n for n in nn if matching_products[matching_products.index == n].iloc[0].master_product != prod.master_product]
    for idx in nn:
        matches.append({
            "sent1": prod['name'],
            "sent2": matching_products.iloc[idx].master_product,
            "match": 0
        })
    matches.append({
        "sent1": prod['name'],
        "sent2": prod.master_product,
        "match": 1
    })
matches = pd.DataFrame(matches)
matches.to_csv('../data/oneshot_matches.csv')

In [3]:
matches = pd.read_csv('../data/oneshot_matches.csv')

In [22]:
train, test = train_test_split(matches, test_size=0.2)
test, val = train_test_split(test, test_size=0.5)

In [30]:
lm = 'indobenchmark/indobert-base-p1'
config = BertConfig.from_pretrained(lm, num_labels=2)  
model = TFAutoModelForSequenceClassification.from_pretrained('out/checkpoint-17500', config=config, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(lm)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [45]:
from utils.utils import make_dataset
train_dataset = make_dataset(train, tokenizer)
val_dataset = make_dataset(val, tokenizer)
test_dataset = make_dataset(test, tokenizer)

In [42]:
model.fit(
    train_dataset.shuffle(len(train_dataset)).batch(32), epochs=4, 
    batch_size=32, validation_data=val_dataset.shuffle(len(val_dataset)).batch(32),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f31405a4350>

In [47]:
y_pred = model.predict(train_dataset.batch(32), batch_size=32, verbose=1).logits
print(classification_report(train.match.values, np.argmax(y_pred, axis=1)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      2232
           1       0.98      0.85      0.91       653

    accuracy                           0.96      2885
   macro avg       0.97      0.92      0.94      2885
weighted avg       0.96      0.96      0.96      2885



In [50]:
model.save_pretrained('../models/pareto_oneshot/')

In [4]:
from blocking.train_blocker import train_blocker, create_sbert_model
lm = 'indobenchmark/indobert-base-p1'
sbert = train_blocker(lm, matches=matches)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

In [7]:
torch.save(sbert.state_dict(), '../models/pareto_oneshot_sbert.pt')

In [None]:
sbert = create_sbert_model(lm)
sbert.load_state_dict(torch.load('../models/pareto_oneshot_sbert.pt'))

In [19]:
sbert_index = AnnoyIndex(768)
embedding = sbert.encode(products['name'].values)
for i, vec in enumerate(embedding):
    sbert_index.add_item(i, vec)
sbert_index.build(10)
sbert_index.save('index/oneshot_sbert.ann')

True

In [20]:
sbert_index = AnnoyIndex(768)
sbert_index.load('index/oneshot_sbert.ann')

  """Entry point for launching an IPython kernel.


True

In [55]:
np.where(master_products == "Zee Platinum Kidz Powdered Vanilla Milk")

(array([55]),)

In [58]:
only_masters = products[products.master_product.notnull()]
candid_matches = []
for i, prod in tqdm(products.iterrows()):
    nn = sbert_index.get_nns_by_item(i, 100)
    nn = [n for n in nn if n in only_masters.index][:10]
    for idx in nn:
        mp = products.iloc[idx].master_product
        candid_matches.append({
            "id1": prod.id,
            "id2": np.where(master_products == mp)[0][0],
            "sent1": prod['name'],
            "sent2": mp,
        })
candid_matches = pd.DataFrame(candid_matches)
candid_matches = candid_matches.dropna(subset=['sent1', 'sent2'])
candid_matches.to_csv('../data/oneshot_candid.csv')

5542it [00:11, 491.46it/s]


In [44]:
candid_matches = pd.read_csv('../data/oneshot_candid.csv')

In [25]:
lm = 'indobenchmark/indobert-base-p1'
config = BertConfig.from_pretrained(lm, num_labels=2)  
model = TFAutoModelForSequenceClassification.from_pretrained('../models/pareto_oneshot/', config=config)
tokenizer = AutoTokenizer.from_pretrained(lm)

Some layers from the model checkpoint at ../models/pareto_oneshot/ were not used when initializing TFBertForSequenceClassification: ['dropout_74']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/pareto_oneshot/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [60]:
from utils.utils import make_dataset
match_dataset = make_dataset(candid_matches, tokenizer)

In [141]:
from tensorflow.nn import softmax
# match_logits = softmax(model.predict(match_dataset.batch(32), verbose=1).logits).numpy()
# match_probs = match_logits[:, 1]
match_results = [p > 0.7 for p in match_probs]

In [145]:
match_preds = pd.DataFrame({'id': candid_matches.id1, 'master_product': candid_matches.id2, 'match': match_results, 'prob': match_probs})
master_product_results = []
for i, prod in tqdm(products.iterrows()):
    predictions = match_preds[match_preds.id == prod.id]
    predictions = predictions[predictions.match]
    if len(predictions) == 0:
        continue

    use_prediction = predictions.iloc[np.argmax(predictions.prob.values)]
    master_product_results.append({
        "product_id": prod.id,
        "master_product_pred_id": use_prediction.master_product,
        "master_product_pred": master_products[use_prediction.master_product],
        "master_product_actual": prod.master_product,
        "master_product_actual_id": prod.master_product_id,
        "prob": use_prediction.prob
    })
master_product_results = pd.DataFrame(master_product_results)

5542it [00:06, 857.19it/s]


In [146]:
master_product_results

Unnamed: 0,product_id,master_product_pred_id,master_product_pred,master_product_actual,master_product_actual_id,prob
0,5.228522e+08,11,Anlene Gold Chocolate Milk Powder 51 Years,,,0.709275
1,8.662367e+08,1,Bear Brand Gold White Malt Sterilized Milk,Bear Brand Gold White Malt Sterilized Milk,1.0,0.951578
2,8.662300e+08,2,Bear Brand Gold White Tea Sterilized Milk,Bear Brand Gold White Tea Sterilized Milk,2.0,0.800559
3,1.524497e+09,3,Bear Brand Sterilized Milk,Bear Brand Sterilized Milk,3.0,0.996810
4,1.543150e+09,3,Bear Brand Sterilized Milk,,,0.990962
...,...,...,...,...,...,...
1599,2.647240e+08,50,Ultra Milk Chocolate UHT Milk,Ultra Milk Chocolate UHT Milk,50.0,0.924581
1600,7.672856e+08,48,Ovaltine Milk Chocolate Drink,,,0.784786
1601,3.066684e+08,22,Diamond Full Cream UHT Milk,,,0.895300
1602,8.481935e+08,55,Zee Platinum Kidz Powdered Vanilla Milk,,,0.815013


In [133]:
only_masters = master_product_results.dropna(subset=['master_product_actual_id'])
is_match = [(m.master_product_pred == m.master_product_actual) for (i, m) in only_masters.iterrows()]
acc = len(only_masters[is_match]) / len(only_masters)
acc

0.9886524822695035

In [149]:
master_product_merged = master_product_results.merge(products.rename(columns={'id': 'product_id'})[['product_id','name', 'price']], how='left')
master_product_merged['product_id'] = master_product_merged['product_id'].astype(int)
master_product_merged['price'] = master_product_merged['price'].astype(int)
master_product_merged.to_csv('../data/oneshot_results.csv')

In [150]:
master_product_merged

Unnamed: 0,product_id,master_product_pred_id,master_product_pred,master_product_actual,master_product_actual_id,prob,name,price
0,522852231,11,Anlene Gold Chocolate Milk Powder 51 Years,,,0.709275,ANLENE ACTIFIT COKLAT 600 GR,64500
1,866236725,1,Bear Brand Gold White Malt Sterilized Milk,Bear Brand Gold White Malt Sterilized Milk,1.0,0.951578,(1 dus = 24 pcs) Susu Beruang Bear Brand Gold ...,209000
2,866229955,2,Bear Brand Gold White Tea Sterilized Milk,Bear Brand Gold White Tea Sterilized Milk,2.0,0.800559,(1 dus = 24 pcs) Susu Beruang Bear Brand White...,209000
3,1524496976,3,Bear Brand Sterilized Milk,Bear Brand Sterilized Milk,3.0,0.996810,1 Karton / Dus (30 pcs) Susu Beruang Nestle Be...,254500
4,1543150207,3,Bear Brand Sterilized Milk,,,0.990962,12 kaleng Bear Brand thailand susu beruang mur...,114900
...,...,...,...,...,...,...,...,...
1599,264724032,50,Ultra Milk Chocolate UHT Milk,Ultra Milk Chocolate UHT Milk,50.0,0.924581,ultramilk 125ml perdus isi 40pcs.gojek only - ...,98000
1600,767285606,48,Ovaltine Milk Chocolate Drink,,,0.784786,whey protein susu rasa coklat/plain 1kg,129000
1601,306668366,22,Diamond Full Cream UHT Milk,,,0.895300,Whipping Cream Anchor 1 Ltr,83000
1602,848193546,55,Zee Platinum Kidz Powdered Vanilla Milk,,,0.815013,Zee renceng cokelat10 sachet,21999
