# SIMILARITY BERT

In [None]:
!pip install -qq transformers

[K     |████████████████████████████████| 3.8 MB 23.3 MB/s 
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
[K     |████████████████████████████████| 6.5 MB 42.4 MB/s 
[K     |████████████████████████████████| 596 kB 45.1 MB/s 
[K     |████████████████████████████████| 895 kB 63.6 MB/s 
[?25h

In [None]:
import re
import string

from sklearn.metrics.pairwise import cosine_similarity
from torch import clamp, no_grad
from transformers import AutoTokenizer, AutoModel

# Default checkpoint handed to the tokenizer/model loaders.
MODEL_NAME = 'cahya/bert-base-indonesian-522M'


class TokenSimilarity:
    """Score how similar two texts are using a pretrained transformer encoder.

    Both texts are encoded in one batch, each text's token embeddings are
    mean-pooled over its non-padding positions, and the two pooled vectors
    are compared with cosine similarity.
    """

    def __init__(self, from_pretrained: str = MODEL_NAME):
        """Load the tokenizer and the encoder.

        Args:
            from_pretrained: Identifier passed unchanged to both
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)

    def __process(self, first_token: str, second_token: str, *,
                  max_length: int, truncation: bool, padding: str):
        """Return one mean-pooled embedding per input text.

        The tokenizer settings are explicit arguments rather than attributes
        read from ``self``, so this method does not depend on ``predict``
        having run first.

        Args:
            first_token: First text to embed.
            second_token: Second text to embed.
            max_length: Forwarded to the tokenizer as ``max_length``.
            truncation: Forwarded to the tokenizer as ``truncation``.
            padding: Forwarded to the tokenizer as ``padding``.

        Returns:
            A NumPy array with two rows: row 0 is the pooled embedding of
            ``first_token`` and row 1 that of ``second_token``.
        """
        inputs = self.tokenizer([first_token, second_token],
                                max_length=max_length,
                                truncation=truncation,
                                padding=padding,
                                return_tensors='pt')

        # Inference only: skip building an autograd graph for the forward
        # pass and the pooling arithmetic.
        with no_grad():
            outputs = self.model(**inputs)
            # First element of the model output holds the per-token embeddings.
            embeddings = outputs[0]

            # Broadcast the attention mask across the embedding dimension so
            # padded positions contribute nothing to the sum.
            mask = inputs.attention_mask.unsqueeze(-1).expand(embeddings.shape).float()
            summed = (embeddings * mask).sum(1)

            # Clamp the divisor so an all-padding row cannot divide by zero.
            counts = clamp(mask.sum(1), min=1e-9)
            mean_pooled = summed / counts

        return mean_pooled.numpy()

    def predict(self, first_token: str, second_token: str, max_length: int = 40,
                truncation: bool = True, padding: str = "max_length"):
        """Compute the cosine similarity between two texts.

        Args:
            first_token: First text.
            second_token: Second text.
            max_length: Maximum tokenized length passed to the tokenizer.
            truncation: Whether the tokenizer truncates longer inputs.
            padding: Padding strategy passed to the tokenizer.

        Returns:
            The array returned by ``cosine_similarity`` for the two pooled
            embeddings; a single row containing a single score.
        """
        # The computation no longer reads these; they are still recorded so
        # that code inspecting the last-used settings keeps working.
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding

        mean_pooled_arr = self.__process(first_token, second_token,
                                         max_length=max_length,
                                         truncation=truncation,
                                         padding=padding)
        similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])

        return similarity

In [None]:
# Build the scorer; this loads the tokenizer and encoder for MODEL_NAME.
model = TokenSimilarity(from_pretrained=MODEL_NAME)

In [None]:
# Compare a short phrase with a full sentence that contains it.
token1, token2 = 'menjuarai lomba', 'Anak kutu buku itu menjuarai lomba matematika.'
similarity_score1 = model.predict(first_token=token1, second_token=token2)
similarity_score1

array([[0.59802735]], dtype=float32)

In [None]:
import torch

# Serialize the whole TokenSimilarity object (not just tensors) to disk.
PATH = '/content/similarity.bin'
torch.save(obj=model, f=PATH)

In [None]:
# PATH holds a pickled TokenSimilarity object rather than plain tensors, so
# the restricted weights-only loader (the default in PyTorch >= 2.6) refuses
# it; request full unpickling explicitly.
# SECURITY: full unpickling can execute arbitrary code. Only load files you
# created yourself or otherwise trust.
model1 = torch.load(PATH, weights_only=False)

In [None]:
# Score two single words with the object reloaded from disk.
token1, token2 = 'anak', 'bawang'
similarity_score1 = model1.predict(first_token=token1, second_token=token2)
similarity_score1

array([[0.8052218]], dtype=float32)

In [None]:
# Score another word pair with the original in-memory object.
token3, token4 = 'menjadi', 'anak'
similarity_score2 = model.predict(first_token=token3, second_token=token4)
similarity_score2

array([[0.7665013]], dtype=float32)