In [57]:
import pandas as pd
import numpy as np
from annoy import AnnoyIndex
from itertools import product, combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, BertConfig, TFBertModel
import tensorflow as tf
import torch
import sys
from tqdm import tqdm
import math

sys.path.append('../src')

In [58]:
products = pd.read_csv('../data/pareto_training.csv')
matching_products = pd.read_csv('../data/pareto_training.csv').dropna(subset=['master_product'])
matching_products = matching_products.reset_index(drop=True)
master_products = matching_products.master_product_fill.unique()

In [59]:
matching_products['master_product_fill'] = matching_products.apply(lambda x: np.where(x.master_product_fill == master_products)[0][0], axis=1)

In [61]:
id_combinations = np.array(list(combinations(matching_products.id.values, 2)))

In [62]:
match_df = []
for id1, id2 in tqdm(id_combinations, position=0, leave=True):
    match = 1 if matching_products[matching_products.id == id1].master_product_fill.iloc[0] == matching_products[matching_products.id == id2].master_product_fill.iloc[0] else 0
    match_df.append({
        "id1":id1,
        "id2":id2,
        "match":match
    })
match_df = pd.DataFrame(match_df)

 47%|████▋     | 166730/356590 [02:58<03:23, 932.27it/s] 


KeyboardInterrupt: 

In [63]:
from preprocessing.feature_extraction import make_features_index
index = make_features_index('indobenchmark/indobert-base-p1', matching_products)

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
0it [00:00, ?it/s]



1it [00:04,  4.66s/it]
845it [00:00, 32617.82it/s]


In [64]:
from preprocessing.batch_selection import batch_selection
match_df = batch_selection(matching_products, index)

100%|██████████| 356590/356590 [06:20<00:00, 937.32it/s] 


In [70]:
from preprocessing.serialize import serialize
serialized_match_df = serialize(match_df, matching_products, keep_columns=['name', 'price'])
serialized_match_df.to_csv('../data/serialized_matches.csv')

In [67]:
serialized_match_df = pd.read_csv('../data/serialized_matches.csv')

## Training

In [7]:
train, test = train_test_split(serialized_match_df, test_size=0.2)
test, val = train_test_split(test, test_size=0.5)

In [71]:
lm = 'indobenchmark/indobert-base-p1'
config = BertConfig.from_pretrained(lm, num_labels=2)  
model = TFAutoModelForSequenceClassification.from_pretrained('out/checkpoint-17500', config=config, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(lm)

In [73]:
model = TFAutoModelForSequenceClassification.from_pretrained('../models/pareto_full2_bert', config=config)

Some layers from the model checkpoint at ../models/pareto_full2_bert were not used when initializing TFBertForSequenceClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/pareto_full2_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [9]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[f1_m, precision_m, recall_m, 'accuracy']) # can also use any keras loss fn


In [75]:
from utils.utils import make_dataset
# train_dataset = make_dataset(train, tokenizer)
val_dataset = make_dataset(val, tokenizer)
# test_dataset = make_dataset(test, tokenizer)

In [13]:
next(iter(val_dataset))

({'input_ids': <tf.Tensor: shape=(150,), dtype=int32, numpy=
  array([    2,  4360, 12706,  3659,   223, 14027, 30357, 25100,  4615,
         30358,  5724,   563,  9108,   462,  4360, 12669,  3659, 10616,
          8138, 30470,   502,     3,  4360, 12706,  3659, 17833,  9841,
         30356,  3091, 11099, 30358, 20449, 30472,  5724,  4534, 30468,
          5747, 30468, 21588, 30468,  4654,  5866, 30469,  5724,   563,
          4360, 12669,  3659, 27797,  4246, 30470,   502,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
   

In [None]:
model.fit(
    train_dataset.shuffle(len(train_dataset)).batch(32), epochs=2, 
    batch_size=32, validation_data=val_dataset.shuffle(len(val_dataset)).batch(32),
)

In [49]:
model.evaluate(val_dataset.batch(32), return_dict=True, batch_size=32)



{'loss': 0.08076349645853043,
 'f1_m': 0.06565715372562408,
 'precision_m': 0.034840650856494904,
 'recall_m': 0.6790779829025269,
 'accuracy': 0.978537917137146}

In [79]:
test_pred = pd.concat([serialized_match_df[serialized_match_df.match == 1].sample(frac=1), serialized_match_df.sample(24649)])

In [80]:
y_pred = model.predict(make_dataset(test_pred, tokenizer).batch(32), batch_size=32, verbose=1).logits
print(classification_report(test_pred.match.values, np.argmax(y_pred, axis=1)))

              precision    recall  f1-score   support

           0       0.86      0.93      0.89     15508
           1       0.97      0.93      0.95     33087

    accuracy                           0.93     48595
   macro avg       0.91      0.93      0.92     48595
weighted avg       0.93      0.93      0.93     48595



In [17]:
model.save_pretrained('../models/pareto_full_bert')

## Matching

In [54]:
from blocking.train_blocker import train_blocker, create_sbert_model
lm = 'indobenchmark/indobert-base-p1'

In [7]:
# train blocker
sbert = train_blocker(lm, matches=serialized_match_df)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3821 [00:00<?, ?it/s]

In [25]:
torch.save(sbert.state_dict(), '../models/pareto_full_sbert.pt')

In [55]:
sbert = create_sbert_model(lm)
sbert.load_state_dict(torch.load('../models/pareto_full_sbert.pt'))

<All keys matched successfully>

In [56]:
sbert.save('../models/pareto_full_sbert')

In [11]:
clusters = pd.read_csv('../data/clusters.csv')
products = products[~products.id.isin(clusters.id.values)].reset_index(drop=True)

In [45]:
# run blocker
from blocking.blocker import blocker
tokenizer = AutoTokenizer.from_pretrained(lm)
match_df = blocker(sbert, products, 'index.ann', 100, threshold=0.5)

5542it [00:00, 13797.89it/s]
5542it [00:00, 34459.05it/s]
  nn = nn[([nn[:, 1] < (2-(threshold*2))])]
100%|██████████| 5542/5542 [00:18<00:00, 304.64it/s]


In [46]:
match_df

Unnamed: 0,id1,id2
0,2.148352e+08,5.228522e+08
1,5.228522e+08,6.896538e+08
2,6.221519e+08,6.221528e+08
3,9.783610e+08,9.783760e+08
4,6.221519e+08,6.221528e+08
...,...,...
15576,2.183304e+08,2.183304e+08
15577,2.183304e+08,2.183304e+08
15578,2.183303e+08,2.183304e+08
15579,1.917154e+08,3.222484e+08


In [47]:
len(np.unique(np.append(match_df.id1, match_df.id2)))

2704

In [14]:
index = AnnoyIndex(768)
index.load('index.ann')

  """Entry point for launching an IPython kernel.


True

In [15]:
def get_nn(prod_index):
    indices, distances = index.get_nns_by_item(prod_index, 100, include_distances=True)
    for i in range(len(indices)):
        print(i, products.loc[indices[i]]['name'], distances[i])

In [48]:
from preprocessing.serialize import serialize
serialized_match_df = serialize(match_df, products, keep_columns=['name', 'price'])

100%|██████████| 15581/15581 [00:31<00:00, 494.47it/s]


In [49]:
from susubert.matcher import matcher
match_results = matcher(lm, '../models/pareto_full2_bert', serialized_match_df, match_df, 128, threshold=0.5)

Some layers from the model checkpoint at ../models/pareto_full2_bert were not used when initializing TFBertForSequenceClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/pareto_full2_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.
  0%|          | 0/122 [00:00<?, ?it/s]Asking to trunca



  1%|          | 1/122 [00:02<05:58,  2.96s/it]



  2%|▏         | 3/122 [00:06<03:46,  1.90s/it]



100%|██████████| 122/122 [01:20<00:00,  1.51it/s]


In [51]:
match_results.merge(products.rename(columns={'id':'id1', 'name': 'name1'})[['name1', 'id1']], on='id1', how='left').merge(products.rename(columns={'id':'id2', 'name': 'name2'})[['name2', 'id2']], on='id2', how='left').to_csv('../data/matches.csv')

In [52]:
match_results[match_results.match == 1]

Unnamed: 0,id1,id2,match,prob
0,2.148352e+08,5.228522e+08,1,0.687599
2,6.221519e+08,6.221528e+08,1,0.973209
4,6.221519e+08,6.221528e+08,1,0.973209
13,5.228384e+08,7.605644e+08,1,0.885138
14,5.228384e+08,5.228926e+08,1,0.549251
...,...,...,...,...
15487,1.132397e+09,1.339084e+09,1,0.959689
15488,1.276634e+09,1.339084e+09,1,0.879727
15526,2.183304e+08,4.837768e+08,1,0.983698
15536,2.183304e+08,4.837768e+08,1,0.983698


In [19]:
from utils.fin import fin
fins = fin(products, match_results)

In [18]:
products.fin.value_counts().index

Float64Index([ 481.0,  821.0,   44.0,   46.0,   36.0,  479.0, 1930.0,  136.0,
              1477.0, 2279.0,
              ...
              4068.0, 1495.0,  597.0, 3566.0, 3909.0, 4021.0, 1308.0, 2951.0,
              3074.0, 4532.0],
             dtype='float64', length=4605)

In [20]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(products.iloc[:1000])

               id                                               name  \
0    5.413560e+08  ( gojek/ grab) susu kental manis tiga sapi ( k...   
1    2.307242e+08     ( grosir) Susu NEPHRISOL-D rasa vanila 185gram   
2    4.035532e+08                     ANLENE ACTIFIT 600 GR - COKLAT   
3    6.864235e+08  Anlene Actifit 600g rasa Coklat / Original / V...   
4    6.221500e+08                       Anlene Actifit Cokelat 250gr   
5    1.244153e+09                      anlene actifit cokelat 600 gr   
6    1.097741e+09  Anlene Actifit Cokelat 600gr Susu Untuk Tulang...   
7    2.829851e+08      ANLENE Actifit Cokelat Susu Kalsium Box 600 g   
8    5.228522e+08                       ANLENE ACTIFIT COKLAT 600 GR   
9    6.221519e+08                      Anlene Actifit Original 250gr   
10   9.783610e+08              Anlene Actifit Original 250gr - 2 Pcs   
11   5.228314e+08                     ANLENE ACTIFIT ORIGINAL 600 GR   
12   6.221528e+08                      Anlene Actifit Original 6