In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer,AutoModel,AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from sklearn.model_selection import train_test_split
import torch
import re

In [2]:
device = torch.device('cuda:0')

In [3]:
matches = pd.read_csv('../data/batch_selection.csv').drop_duplicates(subset=['name1', 'name2']).reset_index()
train, test = train_test_split(matches, test_size=0.2)
test, val = train_test_split(test, test_size=0.5)

In [4]:
model_name='indobenchmark/indobert-base-p1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, num_labels=2)
model = model.to(device)

In [7]:
from torch import nn
dropout = nn.Dropout(0.1).to(device)
fully_connected = nn.Linear(768, 2).to(device)

In [12]:
inputs = tokenizer('[CLS] susu beruang [SEP] susu very nice', return_tensors='pt').to(device)
labels = torch.tensor([1]).unsqueeze(0).to(device)
outputs = model(**inputs)

In [13]:
pooled_output = outputs[0][:, 0, :].to(device)

In [14]:
dropout_output = dropout(pooled_output)

In [16]:
logits = fully_connected(dropout_output)

In [19]:
labels

tensor([[1]], device='cuda:0')

In [21]:
logits

tensor([[ 1.4678, -1.9619]], device='cuda:0', grad_fn=<AddmmBackward>)

In [18]:
loss = criterion(logits, labels)

RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15

In [8]:
class ProductCorpus(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.dataset['name1'] = self.dataset['name1'].map(self.clean_text)
        self.dataset['name2'] = self.dataset['name2'].map(self.clean_text)
        self.tokenizer = tokenizer

    def clean_text(self, text):
        if text == None:
            return 'None'
        text = re.sub(r'\\.', '', text)  # Remove all \n \t etc..
        text = re.sub(r'[^\w\s]*', '', text)  # Remove anything not a digit, letter, or space
        return text.strip().lower()

    def __len__(self):
        return len(self.dataset)

    def getColumn(self, col_name):
        return self.dataset[col_name]

    def __getitem__(self, idx):

        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self.dataset.iloc[idx, :]

        encoded_dict = self.tokenizer.encode_plus(
            row['name1'], row['name2'],  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=165,  # Pad and Truncate sentences.
            pad_to_max_length=True,
            return_attention_mask=True,  # Construct attention masks.
            return_token_type_ids=True,  # token type ids
            return_tensors='pt',  # Return tensors.
            truncation=True
        )

        return {'input': encoded_dict['input_ids'],
                'attn': encoded_dict['attention_mask'],
                'token': encoded_dict['token_type_ids'],
                'label': row['match']}

In [9]:
pc = ProductCorpus(train, tokenizer) # Load dataset
dataloader = DataLoader(pc, sampler=RandomSampler(pc),  batch_size=32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
epochs = 2
optimizer = AdamW(model.parameters(),lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0, num_training_steps=len(dataloader) * epochs)
criterion = nn.CrossEntropyLoss()

In [11]:
from tqdm import tqdm
import time
import datetime

model.train()
fn = torch.nn.Linear(768, 2)
dropout = torch.nn.Dropout(0.1)

for epoch in range(epochs):
    total_train_loss = 0  # Reset loss for the epoch
    t0 = time.time()  # set timer
    for step, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        b_input_ids = torch.squeeze(batch['input'],1).to(device)
        b_input_attn = torch.squeeze(batch['attn'],1).to(device)
        b_token_type_ids = torch.squeeze(batch['token'],1).to(device)
        b_labels = batch['label'].to(device)

        # forward pass
        output = model(b_input_ids,
                             token_type_ids=b_token_type_ids,
                             attention_mask=b_input_attn)

        # get training loss
        pooled_output = output[0][:, 0, :]
        pooled_output = dropout(pooled_output)
        logits = fully_connected(pooled_output)
        loss = criterion(logits, b_labels)

        total_train_loss += loss

        loss = loss

        loss.backward()     # Perform a backward pass to calculate the gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # Clip the norm of the gradients to 1.0.
        optimizer.step()    # Update parameters and take a step using the computed gradient.
        scheduler.step()    # Update the learning rate.
        model.zero_grad()   # Clear gradients before performing a backward
    
    avg_train_loss = total_train_loss / len(dataloader)
    training_time = str(datetime.timedelta(seconds=time.time() - t0))
    print("Average training loss: {0:.2f}".format(avg_train_loss))
    print("Training epcoh took: {:}".format(training_time))

100%|██████████| 64/64 [00:56<00:00,  1.13it/s]
  0%|          | 0/64 [00:00<?, ?it/s]

Average training loss: 0.32
Training epcoh took: 0:00:56.883958


100%|██████████| 64/64 [00:55<00:00,  1.15it/s]


Average training loss: 0.12
Training epcoh took: 0:00:55.760976


In [57]:
np.append(np.empty(0), [1,0,0,1,1,0])

array([1., 0., 0., 1., 1., 0.])

In [61]:
test_cp = TestCorpus(val, tokenizer, cap=1000)  # Load dataset
pair_ids = test_cp.getIndices().to_list()  # get pair ids

y_pred, y_true = eval(model, test_cp)    # predict label



tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0], device='cuda:0')


 12%|█▎        | 1/8 [00:00<00:02,  2.53it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]
tensor([0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0], device='cuda:0')


 25%|██▌       | 2/8 [00:00<00:02,  2.80it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
tensor([1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')


 38%|███▊      | 3/8 [00:01<00:01,  2.90it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')


 50%|█████     | 4/8 [00:01<00:01,  2.86it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0], device='cuda:0')


 62%|██████▎   | 5/8 [00:01<00:01,  2.91it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
tensor([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 

 75%|███████▌  | 6/8 [00:02<00:00,  2.92it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

 88%|████████▊ | 7/8 [00:02<00:00,  2.89it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

100%|██████████| 8/8 [00:02<00:00,  2.92it/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,




In [65]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_true, y_pred)

(array([0.95566502, 0.94      ]),
 array([0.98477157, 0.83928571]),
 array([0.97      , 0.88679245]),
 array([197,  56]))

In [59]:
from sklearn.metrics import classification_report
import time
def eval(model, dataset):

    model.eval()
    tester = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)

    # collect labels
    predict_label = []
    gold_label = []
    t0 = time.time()  # set timer

    # evaluate data
    for idx, batch in tqdm(enumerate(tester), total=len(tester)):
        b_input_ids = torch.squeeze(batch['input'], 1).to(device)
        b_input_attn = torch.squeeze(batch['attn'], 1).to(device)
        b_token_type_ids = torch.squeeze(batch['token'], 1).to(device)
        b_labels = batch['label'].to(device)
        print(b_labels)

        with torch.no_grad():
            output = model(b_input_ids,
                            token_type_ids=b_token_type_ids,
                            attention_mask=b_input_attn)
        
        pooled_output = output[0][:, 0, :]
        logits = fully_connected(pooled_output)

        predict_label += logits.argmax(-1).tolist()
        gold_label += b_labels.tolist()

    print(classification_report(gold_label, predict_label))
    return predict_label, gold_label

In [22]:
from torch.utils.data import Dataset
import re

class TestCorpus(Dataset):
    def __init__(self, dataset, tokenizer, cap=None, random=False):
        self.dataset = dataset
        if cap:
            self.dataset = self.dataset.head(cap)
        if random:
            self.dataset = self.dataset.sample(frac=1)
        self.dataset['name1'] = self.dataset.loc[:,'name1'].map(self.clean_text)
        self.dataset['name2'] = self.dataset.loc[:,'name2'].map(self.clean_text)
        self.tokenizer = tokenizer

    def clean_text(self, text):
        if text == None:
            return 'None'
        text = re.sub(r'\\.', '', text)  # Remove all \n \t etc..
        text = re.sub(r'[^\w\s]*', '', text)  # Remove anything not a digit, letter, or space
        return text.strip().lower()

    def __len__(self):
        return len(self.dataset)
    
    def getIndices(self):
        return self.dataset.index

    def getColumn(self, col_name):
        return self.dataset[col_name]

    def __getitem__(self, idx):

        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self.dataset.iloc[idx, :]

        encoded_dict = self.tokenizer.encode_plus(
            row['name1'], row['name2'],  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=165,  # Pad and Truncate sentences.
            pad_to_max_length=True,
            return_attention_mask=True,  # Construct attention masks.
            return_token_type_ids=True,  # token type ids
            return_tensors='pt',  # Return tensors.
            truncation=True
        )

        return {'input': encoded_dict['input_ids'],
                'attn': encoded_dict['attention_mask'],
                'token': encoded_dict['token_type_ids'],
                'label': row['match']}

In [55]:
test_row.name2

'stock terbatas keju mozarella perfetto 1kg  mozzarella cheese murah'

In [28]:
test[test.match == 1].iloc[1].name2

'Dijual Anchor Keju cheddar 2kg Easy to Grate Murah'

In [15]:
def is_match(name1, name2):
    encoded_dict = tokenizer.encode_plus(
        name1, name2,  # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=165,  # Pad and Truncate sentences.
        pad_to_max_length=True,
        return_attention_mask=True,  # Construct attention masks.
        return_token_type_ids=True,  # token type ids
        return_tensors='pt',  # Return tensors.
        truncation=True
    )
    b_input_ids = torch.squeeze(encoded_dict['input_ids'], 1).to(device)
    b_input_attn = torch.squeeze(encoded_dict['attention_mask'], 1).to(device)
    b_token_type_ids = torch.squeeze(encoded_dict['token_type_ids'], 1).to(device)
    


    with torch.no_grad():
        output = model(b_input_ids,
                        token_type_ids=b_token_type_ids,
                        attention_mask=b_input_attn)
    
    print(output[0])
    print(torch.argmax(output[0]))

In [1]:
is_match("Keju cheddar 2kg", "Dijual Anchor Keju cheddar 2kg Easy to Grate Murah")

NameError: name 'is_match' is not defined