In [1]:
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import DataLoader
import pandas as pd
from transformers import BertTokenizer,BertModel
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import classification_report

In [2]:
class CpsBertModel(nn.Module):
    def __init__(self):
        super(CpsBertModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.linear = nn.Linear(768, 8)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.linear.weight.data.uniform_(-init_range, init_range)
        self.linear.weight.data.uniform_(-init_range, init_range)
        self.linear.bias.data.zero_()

    def forward(self, _input):
        pooler_output = self.bert(**_input)[1]
        y = self.decoder(pooler_output)
        return y

In [3]:
class MyDataset(data.Dataset):
    def __init__(self, df: pd.DataFrame):
        df.reset_index(inplace=True)
        self.df = df

    def __getitem__(self, index):
        sequence = self.df['Text'][index]
        label = self.df['Label'][index]
        return label, sequence

    def __len__(self):
        return self.df.__len__()

In [4]:
special_tokens = {'additional_special_tokens': ['[R_zero]', '[R_one]', '[R_two]', '[R_three]',
                                                    '[voltage]', '[current]', '[number]']}
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_special_tokens(special_tokens)
class Data:
    def __init__(self, dataset):
        self.label_pipeline = lambda x:[int(item) for item in x]
        self.tokenizer = tokenizer
        self.dataloader = DataLoader(dataset, batch_size=64, shuffle=False, collate_fn=self.collate_batch)

    def collate_batch(self, batch):
        labels,texts = [], []
        for (label, text) in batch:
            labels.append(label)
            texts.append(text)
        labels = self.label_pipeline(labels)
        inputs = self.tokenizer(texts,return_tensors='pt')
        return torch.tensor(labels, dtype=torch.int64), inputs

In [5]:
def train(train_data: Data, epoch: int):
    total_loss = 0
    correct = 0.0
    total = 0.0
    start_time = time.time()
    model.train()
    for idx, (label, text) in enumerate(train_data.dataloader):
        optimizer.zero_grad()
        output = model(text)

        total += label.size(0)
        for i in output.argmax(1).eq(label):
            if i:
                correct += 1

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        log_interval = parameters.log_interval
        if idx % log_interval == 0 and idx > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                  'loss {:5.2f} | accuracy {:8.2f}%'.format(epoch + 1, idx, train_data.dataloader.__len__(),
                                                            optimizer.param_groups[0]['lr'],
                                                            elapsed * 1000 / log_interval, cur_loss,
                                                            correct / total * 100))
            total_loss = 0
            start_time = time.time()

In [6]:
def evaluate(data: Data, epoch: int):
    total_loss = 0
    correct = 0.0
    total = 0.0
    y_pred, y_true = [], []
    start_time = time.time()
    model.eval()
    for idx, (label, text) in enumerate(data.dataloader):
        optimizer.zero_grad()
        output = model(text)

        total += label.size(0)

        loss = criterion(output, label)
        total_loss += loss.item()

        predict = output.argmax(1)
        for i in predict.eq(label):
            if i:
                correct += 1

#         y_pred.extend(predict.to('cpu'))
#         y_true.extend(label.to('cpu'))

    batches = data.dataloader.__len__()
    cur_loss = total_loss / batches
    elapsed = time.time() - start_time
    kappa = cohen_kappa_score(y_pred, y_true)
    print(
        '| epoch {:3d} | {:5d} batches | ms/batch {:5.5f} | loss {:5.2f} | '
        'accuracy {:8.2f}% | Kappa {:8.4f}'.format(
            epoch + 1, batches,
            elapsed * 1000 / batches, cur_loss,
            correct / total * 100,
            kappa))
    print(classification_report(y_true, y_pred, target_names=parameters.all_names))
    return correct / total

In [7]:
df_train = pd.read_csv('../data/single-sentence-prediction/train_data.csv')
df_test = pd.read_csv('../data/single-sentence-prediction/test_data.csv')

# 初始化数据
dataset_train = MyDataset(df_train)
data_train = Data(dataset_train)

dataset_test = MyDataset(df_test)
data_test = Data(dataset_test)
                       
# 准备模型
model = CpsBertModel()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
total = 0
total2 = 0
for param in model.parameters():
    total += param.nelement()
    if param.requires_grad:
        total2 += param.nelement()
print("Number of parameter: %.2fM" % (total / 1e6))
print("Number of training parameter: %.2fM" % (total2 / 1e6))

Number of parameter: 109.49M
Number of training parameter: 109.49M


In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

In [10]:
import time
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
for epoch in range(10):
    train(data_train, epoch)
    accu_val = evaluate(data_test, epoch)
    if total_accu is not None and total_accu > accu_val:
        print('scheduler runs')
        scheduler.step()
    else:
        total_accu = accu_val
return model

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.