In [1]:
import os
import csv
import torch
import transformers
import numpy as np
import pandas as pd
import torch.utils.data as Data

from transformers import AutoTokenizer,WEIGHTS_NAME,CONFIG_NAME,XLNetForSequenceClassification,AdamW
from sklearn.model_selection import train_test_split

In [2]:
class DataProcessor:
    def __init__(self, filename):
        with open(filename) as f:
            rows = [row for row in csv.reader(f)]
            rows = np.array(rows[1:]) # all data, 2D
            self.label_list = [label for _, label in rows] # label list
            self.classes_list = list(set(self.label_list)) # non-repeated label list
            self.num_classes = len(self.classes_list) # num of classes
            for i in range(len(self.label_list)):
                self.label_list[i] = self.classes_list.index(self.label_list[i]) # index of label

            self.name_list, self.sentence_list = [], []
            for sentence, _ in rows:
                begin = sentence.find('<e1>')
                end = sentence.find('</e1>')
                e1 = sentence[begin:end + 5]

                begin = sentence.find('<e2>')
                end = sentence.find('</e2>')
                e2 = sentence[begin:end + 5]

                self.name_list.append(e1 + " " + e2)
                self.sentence_list.append(sentence)

In [3]:
class DataConverter:
    def __init__(self, names, sentences, target):
        self.input_ids, self.token_type_ids, self.attention_mask = [], [], []
        for i in range(len(sentences)):
            encoded_dict = tokenizer.encode_plus(
                sentences[i],        # input text
                add_special_tokens = True,      # add '[CLS]' and '[SEP]'
                max_length = 96,           # padding & truncation length
                pad_to_max_length = True,
                return_tensors = 'pt',         # return data in pytorch tensors format
            )
            self.input_ids.append(encoded_dict['input_ids'])
            self.token_type_ids.append(encoded_dict['token_type_ids'])
            self.attention_mask.append(encoded_dict['attention_mask'])

        self.input_ids = torch.cat(self.input_ids, dim=0)
        self.token_type_ids = torch.cat(self.token_type_ids, dim=0)
        self.attention_mask = torch.cat(self.attention_mask, dim=0)

        self.input_ids = torch.LongTensor(self.input_ids)
        self.token_type_ids = torch.LongTensor(self.token_type_ids)
        self.attention_mask = torch.LongTensor(self.attention_mask)
        self.target = torch.LongTensor(target)

In [4]:
class ModelEvaluator:
    def __init__(self, device):
        self.device = device
        self.best_score = 0

    def flat_accuracy(self, preds, labels):
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    def save(self, model):
        torch.save(model.state_dict(), output_model_file)
        model.config.to_json_file(output_config_file)

    def eval(self, model, validation_dataloader):
        model.eval()
        eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
        for batch in validation_dataloader:
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                logits = model(batch[0], token_type_ids=batch[1], attention_mask=batch[2])[0]
                logits = logits.detach().cpu().numpy()
                label_ids = batch[3].cpu().numpy()
                tmp_eval_accuracy = self.flat_accuracy(logits, label_ids)
                eval_accuracy += tmp_eval_accuracy
                nb_eval_steps += 1
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
        if self.best_score < eval_accuracy / nb_eval_steps:
            self.best_score = eval_accuracy / nb_eval_steps
            self.save(model)

In [5]:
transformers.logging.set_verbosity_error()
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

best_score = 0
batch_size = 32
classes_list = list()

output_dir = './models/'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

filename = 'train.csv'
data_processor = DataProcessor(filename)
name_list = data_processor.name_list
sentence_list = data_processor.sentence_list
label_list = data_processor.label_list
classes_list = data_processor.classes_list
num_classes = data_processor.num_classes

In [6]:
data_converter = DataConverter(name_list, sentence_list, label_list)
input_ids = data_converter.input_ids
token_type_ids = data_converter.token_type_ids
attention_mask = data_converter.attention_mask
labels = data_converter.target



In [7]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, random_state=1, test_size=0.1)
train_token, val_token, _, _ = train_test_split(token_type_ids, labels, random_state=1, test_size=0.1)
train_mask, val_mask, _, _ = train_test_split(attention_mask, labels, random_state=1, test_size=0.1)

train_data = Data.TensorDataset(train_inputs, train_token, train_mask, train_labels)
train_dataloader = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

validation_data = Data.TensorDataset(val_inputs, val_token, val_mask, val_labels)
validation_dataloader = Data.DataLoader(validation_data, batch_size=batch_size, shuffle=True)

model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=num_classes).to(device)

In [8]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0}]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)



In [10]:
epoch = 2
for _ in range(epoch):
    for i, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        loss = model(batch[0], token_type_ids=batch[1], attention_mask=batch[2], labels=batch[3])[0]
        print(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        evaluator = ModelEvaluator(device)
        if i % 1 == 0:
            evaluator.eval(model, validation_dataloader)

2.5129449367523193
Validation Accuracy: 0.17503561253561253
2.623152494430542
Validation Accuracy: 0.16862535612535612
2.7546916007995605
Validation Accuracy: 0.17414529914529914
2.5247201919555664
Validation Accuracy: 0.17441239316239315
2.4481756687164307
Validation Accuracy: 0.17895299145299146
2.6424524784088135
Validation Accuracy: 0.18215811965811965
2.889400005340576
Validation Accuracy: 0.17352207977207978
2.6788556575775146
Validation Accuracy: 0.17031695156695156
2.710475206375122
Validation Accuracy: 0.18011039886039887
2.564345121383667
Validation Accuracy: 0.18108974358974358
2.64027738571167
Validation Accuracy: 0.18242521367521367
2.3970718383789062
Validation Accuracy: 0.1891025641025641
2.4989655017852783
Validation Accuracy: 0.19230769230769232
2.6359939575195312
Validation Accuracy: 0.19257478632478633
2.504774808883667
Validation Accuracy: 0.18963675213675213
2.849952459335327
Validation Accuracy: 0.19293091168091167
2.5422801971435547
Validation Accuracy: 0.1934650