# SIBUR entities pairs classification


## XLM_ROBERTA


This XLM-ROBERTA model was trained on Kaggle GPU Kernel and output submission scores **~ 0.49 score** on public leaderboard

The idea was to fine-tune xlm-robberta-multilingual model for SequenceClassification Task already known for Pytorch Pretrained models (using <code>transformers</code> library, which is slightly different in use from <code>pytorch-pretrained-bert</code> library).

### Required installations

In [1]:
!pip3 install transformers

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


### Imports

In [2]:
import os
import csv
import logging
import random
import sys

import pandas as pd
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from torch.nn import CrossEntropyLoss, MSELoss
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.model_selection import StratifiedShuffleSplit

from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig, XLMRobertaModel, AdamW, get_linear_schedule_with_warmup



In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [5]:
train_data = pd.read_csv('/kaggle/input/sibur-2020-nlp-classification/train_xlmroberta.csv')
train_data = train_data.reset_index()
train_data.drop('index', axis=1, inplace=True)
train_data['is_duplicate'] = train_data['is_duplicate'].astype(str)

train_data.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,0,xavier,xavier uiversity,1
1,1,c,intermational,0
2,2,poder logistico,logisticos,0
3,3,xinshili,ranyal,0
4,4,rihmeek williams robert,ihmeek williams robert,1


In [6]:
# cv_split = StratifiedShuffleSplit(1, train_size=0.8, random_state=42)
# tridx, cvidx = list(cv_split.split(train_data[['name_1', 'name_2']], train_data["is_duplicate"]))[0]

In [7]:
# train = train_data.iloc[tridx]
# valid = train_data.iloc[cvidx]

In [8]:
# valid.is_duplicate.value_counts()

### Input Feature Representation

Classes to represent each Example and InputFeature (example tokenization, segment ids etc..)

In [9]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

        
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

### Data Processor

For reading the dataset and converting each sample in dataset into examples.

In [10]:
class DataPreProcessor():
    
    def get_labels(self):
        """Returns all the types of labels present in the data."""
        
        return ["0", "1"]
    
    def get_train_examples(self, data):
        """Creates the examples for the given data."""
        examples = []
        
        for (i, line) in enumerate(data.values.tolist()):
            
            guid = line[0]
            text_a = line[1]
            text_b = line[2]
            label = line[-1]
            
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        
        return examples

    def get_dev_examples(self, data):
        """Creates the examples for the given data."""
        examples = []
        
        for (i, line) in enumerate(data.values.tolist()):
            
            guid = line[0]
            text_a = line[1]
            text_b = line[2]
            label = line[-1]
            
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        
        return examples

### Converting examples into input features

In [11]:
# def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, output_mode):
    
#     features = []

#     for (ex_index, example) in enumerate(examples):
#         if ex_index % 100000 == 0:
#             print("writing example %d of %d" % (ex_index, len(examples)))
        
#         tokens_a = tokenizer.tokenize(example.text_a)
#         tokens_b = tokenizer.tokenize(example.text_b)

#         # Modify the tokens_a and tokens_b in place, so that the total length is less than the specified length.
#         # account for [CLS], [SEP], [SEP] tokens
#         _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

#         # convert the tokens into BERT format
#         # add the CLS and SEP tokens to the first sequence a
#         tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
#         # for the first sequence a, segment ids are 0
#         segment_ids = [0] * len(tokens)
        
#         # add the SEP token at the end of sequence b
#         tokens += tokens_b + ["[SEP]"]
#         # for the second sequence b, segmenet ids are 1
#         segment_ids += [1] * (len(tokens_b) + 1)
        
#         input_ids = tokenizer.convert_tokens_to_ids(tokens)
        
#         # The mask has 1 for real tokens and 0 for padding tokens.
#         # Only real tokens are attended to.
#         input_mask = [1] * len(input_ids)
        
#         # Zero-padding upto max_seq_length
#         padding = [0] * (max_seq_length - len(input_ids))
#         input_ids += padding
#         input_mask += padding
#         segment_ids += padding
        
#         assert len(input_ids) == max_seq_length
#         assert len(input_mask) == max_seq_length
#         assert len(segment_ids) == max_seq_length
        
#         label_id = example.label
       
#         features.append(
#             InputFeatures(
#                 input_ids=input_ids,
#                 input_mask=input_mask,
#                 segment_ids=segment_ids,
#                 label_id=label_id))

#     return features

In [12]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

### Metrics

In [13]:
def simple_accuracy(preds, labels):
    return (preds == labels).mean()

def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    
    return {"acc": acc, "f1": f1}

### Model Parameters

In [14]:
output_mode = 'classification'
roberta_model = 'xlm-roberta-base'

train_batch_size = 32
gradient_accumulation_steps = 1
max_grad_norm = 1.0
num_train_epochs = 3.0

learning_rate = 2e-5
warmup_proportion = 0.1
num_training_steps = 1000
num_warmup_steps = 100
max_seq_length = 128

eval_batch_size = 16

cache_dir = os.path.join("", "pytorch_pretrained_models")
output_dir = os.path.join("", "out_dir") 

### Initial Setup 

Defining the device 

Setting the random seeds

In [15]:
# random.seed(42)
# np.random.seed(42)
# torch.manual_seed(42)
# torch.cuda.manual_seed_all(42)

Creating the output dir, if not present

In [16]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

**Creating the Processor and getting the BERT tokenizer**

In [17]:
processor = DataPreProcessor()

label_list = processor.get_labels()
num_labels = len(label_list)

# bert tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained(roberta_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




## Training

Read the train data and convert the training data into examples

In [18]:
train_examples = processor.get_train_examples(train_data)

In [19]:
num_train_optimization_steps = int(len(train_examples) / train_batch_size / gradient_accumulation_steps) * num_train_epochs
print(num_train_optimization_steps)

56160.0


### Load the pretrained BERT model

In [20]:
model = XLMRobertaForSequenceClassification.from_pretrained(roberta_model, cache_dir=cache_dir, num_labels=num_labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Move the model to appropriate device

In [21]:
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

### Training Parameters

Create the Optimizer for training the model

In [22]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_optimization_steps)

In [23]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0

### Training Input

Convert the training examples into training features

In [24]:
from transformers import glue_convert_examples_to_features as convert_examples_to_features

train_features = convert_examples_to_features(
    train_examples,
    tokenizer,
    max_length=max_seq_length,
    label_list=label_list,
    output_mode='classification'
)

In [25]:
print("***** Running training *****")
print("  Num examples = %d", len(train_examples))
print("  Batch size = %d", train_batch_size)
print("  Num steps = %d", num_train_optimization_steps)

***** Running training *****
  Num examples = %d 599059
  Batch size = %d 32
  Num steps = %d 56160.0


### Data Loader for Training data

- Wrap the all the training data as a TensorDataset
- Create a RandomSampler for sampling the data
- Create a DataLoader for loading a batch of data


In [26]:
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
all_labels = torch.tensor([f.label for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_attention_mask, all_labels)

train_sampler = RandomSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

### Train the model

-  Load a batch of data
- Get the logits for the batch of data
- Calculate the loss 
- Update the model parameters using Optimizer w.r.t loss


In [27]:
model.train()

weights = torch.tensor([1, 3], dtype=torch.float, device=device)

for _ in trange(int(num_train_epochs), desc='Epoch'):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        
#         input_ids, input_mask, segment_ids, label_ids = batch
#         print(model(input_ids, segment_ids, input_mask, labels=None))
#         logits = model(input_ids, segment_ids, input_mask, labels=None)


        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        
        input_ids = batch[0]
        label_ids = batch[2]
        
        outputs = model(**inputs)
        
        logits = outputs[1]
        
        loss_fct = CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
        
        optimizer.step()
        scheduler.step()
        global_step += 1
        
    print("Training Loss: %s" % (str(tr_loss)))

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Iteration:   0%|          | 0/18721 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/18721 [00:01<6:23:29,  1.23s/it][A
Iteration:   0%|          | 2/18721 [00:01<5:07:33,  1.01it/s][A
Iteration:   0%|          | 3/18721 [00:02<4:14:30,  1.23it/s][A
Iteration:   0%|          | 4/18721 [00:02<3:37:28,  1.43it/s][A
Iteration:   0%|          | 5/18721 [00:02<3:11:18,  1.63it/s][A
Iteration:   0%|          | 6/18721 [00:03<2:53:04,  1.80it/s][A
Iteration:   0%|          | 7/18721 [00:03<2:40:13,  1.95it/s][A
Iteration:   0%|          | 8/18721 [00:04<2:31:13,  2.06it/s][A
Iteration:   0%|          | 9/18721 [00:04<2:25:08,  2.15it/s][A
Iteration:   0%|          | 10/18721 [00:04<2:20:46,  2.22it/s][A
Iteration:   0%|          | 11/18721 [00:05<2:17:39,  2.27it/s][A
Iteration:   0%|          | 12/18721 [00:05<2:15:35,  2.30it/s][A
Iteration:   0%|          | 13/18721 [00:06<2:15:59,  2.29it/s][A
Iteration:   0%|          | 14/18721 

Training Loss: 2037.4400487455423



Iteration:   0%|          | 1/18721 [00:00<2:19:59,  2.23it/s][A
Iteration:   0%|          | 2/18721 [00:00<2:16:34,  2.28it/s][A
Iteration:   0%|          | 3/18721 [00:01<2:14:06,  2.33it/s][A
Iteration:   0%|          | 4/18721 [00:01<2:12:26,  2.36it/s][A
Iteration:   0%|          | 5/18721 [00:02<2:11:46,  2.37it/s][A
Iteration:   0%|          | 6/18721 [00:02<2:10:46,  2.39it/s][A
Iteration:   0%|          | 7/18721 [00:02<2:10:03,  2.40it/s][A
Iteration:   0%|          | 8/18721 [00:03<2:10:08,  2.40it/s][A
Iteration:   0%|          | 9/18721 [00:03<2:09:52,  2.40it/s][A
Iteration:   0%|          | 10/18721 [00:04<2:09:46,  2.40it/s][A
Iteration:   0%|          | 11/18721 [00:04<2:09:29,  2.41it/s][A
Iteration:   0%|          | 12/18721 [00:04<2:09:08,  2.41it/s][A
Iteration:   0%|          | 13/18721 [00:05<2:09:30,  2.41it/s][A
Iteration:   0%|          | 14/18721 [00:05<2:09:40,  2.40it/s][A
Iteration:   0%|          | 15/18721 [00:06<2:09:17,  2.41it/s][A
Ite

Training Loss: 852.3208468740631



Iteration:   0%|          | 1/18721 [00:00<2:26:01,  2.14it/s][A
Iteration:   0%|          | 2/18721 [00:00<2:21:07,  2.21it/s][A
Iteration:   0%|          | 3/18721 [00:01<2:18:05,  2.26it/s][A
Iteration:   0%|          | 4/18721 [00:01<2:16:04,  2.29it/s][A
Iteration:   0%|          | 5/18721 [00:02<2:14:06,  2.33it/s][A
Iteration:   0%|          | 6/18721 [00:02<2:13:13,  2.34it/s][A
Iteration:   0%|          | 7/18721 [00:02<2:12:07,  2.36it/s][A
Iteration:   0%|          | 8/18721 [00:03<2:11:46,  2.37it/s][A
Iteration:   0%|          | 9/18721 [00:03<2:11:06,  2.38it/s][A
Iteration:   0%|          | 10/18721 [00:04<2:14:28,  2.32it/s][A
Iteration:   0%|          | 11/18721 [00:04<2:15:22,  2.30it/s][A
Iteration:   0%|          | 12/18721 [00:05<2:16:14,  2.29it/s][A
Iteration:   0%|          | 13/18721 [00:05<2:14:51,  2.31it/s][A
Iteration:   0%|          | 14/18721 [00:05<2:13:15,  2.34it/s][A
Iteration:   0%|          | 15/18721 [00:06<2:12:04,  2.36it/s][A
Ite

Training Loss: 480.11032148247614





### Save and Load the trained model

- save the model weights
- save the model config
- Load the model for evaluation


In [28]:
# model.save_pretrained('./my_saved_model_directory/')
# tokenizer.save_pretrained('./my_saved_model_directory/')

# model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
# tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('out_dir/tokenizer_config.json',
 'out_dir/special_tokens_map.json',
 'out_dir/sentencepiece.bpe.model',
 'out_dir/added_tokens.json')

In [29]:
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, 'roberta_model')
torch.save(model_to_save.state_dict(), output_model_file)

output_config_file = os.path.join(output_dir, 'roberta_config')
with open(output_config_file, 'w') as f:
    f.write(model_to_save.config.to_json_string())

# config = XLMRobertaConfig(output_config_file)
# model = XLMRobertaForSequenceClassification(config, num_labels=num_labels)
# model.load_state_dict(torch.load(output_model_file))

In [30]:
model = XLMRobertaForSequenceClassification.from_pretrained(output_dir)
tokenizer = XLMRobertaTokenizer.from_pretrained(output_dir)

In [31]:
model

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

Move the model to appropriate device

In [32]:
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Evaluation

Read the evaluation data and convert the evaluation data into examples

Convert the examples into features

In [34]:
# eval_examples = processor.get_dev_examples(valid)
# eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer, output_mode)

In [35]:
# print("***** Running evaluation *****")
# print("  Num examples = %d", len(eval_examples))
# print("  Batch size = %d", eval_batch_size)

### Data Loader for Evaluation data

- Wrap the all the evaluation data as a TensorDataset
- Create a SequentialSampler for sampling the data
- Create a DataLoader for loading a batch of data

In [36]:
# all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
# all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
# all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
# all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

In [37]:
# eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# eval_sampler = SequentialSampler(eval_data)
# eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

In [38]:
# eval_loss = 0
# nb_eval_steps = 0
# preds = []

### Evaluation and Metrics on Predictions

In [39]:
# model.eval()
# for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
#     input_ids = input_ids.to(device)
#     input_mask = input_mask.to(device)
#     segment_ids = segment_ids.to(device)
#     label_ids = label_ids.to(device)

#     with torch.no_grad():
#         logits = model(input_ids, segment_ids, input_mask, labels=None)

#     # create eval loss and other metric required by the task
#     loss_fct = CrossEntropyLoss()
#     tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

#     eval_loss += tmp_eval_loss.mean().item()
#     nb_eval_steps += 1
#     if len(preds) == 0:
#         preds.append(logits.detach().cpu().numpy())
#     else:
#         preds[0] = np.append(
#             preds[0], logits.detach().cpu().numpy(), axis=0)

# eval_loss = eval_loss / nb_eval_steps
# preds = preds[0]
# preds = np.argmax(preds, axis=1)

# result = acc_and_f1(preds, all_label_ids.numpy())
# print(result)

In [40]:
# !ls

In [41]:
test = pd.read_csv('/kaggle/input/sibur-2020-nlp-classification/test_xlmroberta.csv')

test.name_1 = test.name_1.str.strip()
test.name_2 = test.name_2.str.strip()

test['is_duplicate'] = 0
test.fillna('', inplace=True)

test.insert(0, 'pair_id', test.index)
test['is_duplicate'] = test['is_duplicate'].astype(str)

test.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,0,blinds decoration,indl cuautitlan,0
1,1,eih,plastic product,0
2,2,jsh,shipbuilding repair yard c,0
3,3,better,farmacap,0
4,4,equipos inoxidables,bel,0


In [42]:
test.shape

(213249, 4)

In [43]:
test_examples = processor.get_dev_examples(test)

test_features = convert_examples_to_features(
    test_examples,
    tokenizer,
    max_length=max_seq_length,
    label_list=label_list,
    output_mode='classification'
)

In [44]:
all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in test_features], dtype=torch.long)
all_labels = torch.tensor([f.label for f in test_features], dtype=torch.long)

In [45]:
test_batch_size = 32

test_data = TensorDataset(all_input_ids, all_attention_mask, all_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=test_batch_size)

In [46]:
test_loss = 0
nb_test_steps = 0
preds = []

In [47]:
model.eval()

for batch in tqdm(test_dataloader, desc="Predicting"):
    batch = tuple(t.to(device) for t in batch)

    inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        
    input_ids = batch[0]
    label_ids = batch[2]

    with torch.no_grad():
        outputs = model(**inputs)
        
    logits = outputs[1]

    # create eval loss and other metric required by the task
    loss_fct = CrossEntropyLoss()
    
    tmp_test_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
    test_loss += tmp_test_loss.mean().item()
    
    nb_test_steps += 1
    
    if len(preds) == 0:
        preds.append(logits.detach().cpu().numpy())
    else:
        preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

test_loss = test_loss / nb_test_steps

print('test loss', test_loss)
print(preds)

preds = preds[0]
preds = np.argmax(preds, axis=1)

print(preds.shape)

print(preds)

Predicting: 100%|██████████| 6665/6665 [13:54<00:00,  7.98it/s]

test loss 0.07157496331623657
[array([[ 1.5040808, -1.1931344],
       [ 3.9255476, -4.435331 ],
       [ 3.925737 , -4.426691 ],
       ...,
       [ 3.9251678, -4.435127 ],
       [ 3.9260662, -4.4165974],
       [ 3.9252508, -4.4347453]], dtype=float32)]
(213249,)
[0 0 0 ... 0 0 0]





In [48]:
preds.shape

(213249,)

In [49]:
type(preds[0])

numpy.int64

In [50]:
test['is_duplicate'] = preds

In [52]:
pd.RangeIndex(start=1, stop=213250, step=1)

RangeIndex(start=1, stop=213250, step=1)

In [53]:
test['pair_id'] = pd.RangeIndex(start=1, stop=213250, step=1)

In [54]:
test

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,1,blinds decoration,indl cuautitlan,0
1,2,eih,plastic product,0
2,3,jsh,shipbuilding repair yard c,0
3,4,better,farmacap,0
4,5,equipos inoxidables,bel,0
...,...,...,...,...
213244,213245,dyna,easero,0
213245,213246,dsi,kavalani sons wll,0
213246,213247,saint gobain abrasives,zarabi,0
213247,213248,covestro,terabytes,0


In [55]:
test[['pair_id', 'is_duplicate']].to_csv('submission.csv', index=None)

In [56]:
x = pd.read_csv('/kaggle/working/submission.csv')
x

Unnamed: 0,pair_id,is_duplicate
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
213244,213245,0
213245,213246,0
213246,213247,0
213247,213248,0
