In [16]:
import numpy as np
import pandas as pd
pos = []
neg = []
train=pd.read_csv('../senti/training_set.csv',encoding='gbk')
train.iloc[0]
for i in range(len(train)):
    if train.iloc[i]['labels'] == 'positive':
        sentence = train.iloc[i]['text'].replace(' ', '').replace('\n', '')
        pos.append(sentence)
    else:
        sentence = train.iloc[i]['text'].replace(' ', '').replace('\n', '')
        neg.append(sentence)
np.savetxt('nnneg.txt', neg, encoding='utf-8', delimiter=',', fmt = '%s')
np.savetxt('pppos.txt', pos, encoding='utf-8', delimiter=',', fmt = '%s')

In [20]:
import numpy as np
import random
import torch
import matplotlib.pyplot as plt
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import os
import time
import datetime
from sklearn.model_selection import train_test_split

In [26]:
def readfile(filename):
    with open(filename, encoding="utf-8") as f:
        content = f.readlines()
        return content

#将每一句转成数字（大于126做截断，小于126做PADDING，加上首尾两个标识，长度总共等于128）
def convert_text_to_token(tokenizer, sentence, limit_size=126):

    tokens = tokenizer.encode(sentence[:limit_size])  #直接截断
    if len(tokens) < limit_size + 2:                  #补齐（pad的索引号就是0）
        tokens.extend([0] * (limit_size + 2 - len(tokens)))
    return tokens

#建立mask
def attention_masks(input_ids):
    atten_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        atten_masks.append(seq_mask)
    return atten_masks

def binary_acc(preds, labels):
    correct = torch.eq(torch.max(preds, dim=1)[1], labels.flatten()).float()
    acc = correct.sum().item() / len(correct)
    return acc


def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))
def train(model, optimizer):
    t0 = time.time()
    avg_loss, avg_acc = [],[]

    model.train()
    for step, batch in enumerate(train_dataloader):

        # 每隔40个batch 输出一下所用时间.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)

        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = output[0], output[1]

        avg_loss.append(loss.item())

        acc = binary_acc(logits, b_labels)
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)      #大于1的梯度将其设为1.0, 以防梯度爆炸
        optimizer.step()              #更新模型参数
        scheduler.step()              #更新learning rate

    avg_acc = np.array(avg_acc).mean()
    avg_loss = np.array(avg_loss).mean()
    return avg_loss, avg_acc

def evaluate(model):
    avg_acc = []
    model.eval()         #表示进入测试模式

    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)

            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            acc = binary_acc(output[0], b_labels)
            avg_acc.append(acc)
    avg_acc = np.array(avg_acc).mean()
    return avg_acc
def predict(sen):
    input_id = convert_text_to_token(tokenizer, sen)
    input_token = torch.tensor(input_id).long().to(device)            #torch.Size([128])
    atten_mask = [float(i>0) for i in input_id]
    attention_token = torch.tensor(atten_mask).long().to(device)       #torch.Size([128])
    output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_token.view(1, -1))     #torch.Size([128])->torch.Size([1, 128])否则会报错
    print(output[0])
    return torch.max(output[0], dim=1)[1]
if __name__ == '__main__':
    # data process
    pos_text, neg_text = readfile('pppos.txt'), readfile('nnneg.txt')
    sentences = pos_text + neg_text

    # 设定标签
    pos_targets = np.ones((len(pos_text)))
    neg_targets = np.zeros((len(neg_text)))
    targets = np.concatenate((pos_targets, neg_targets), axis=0).reshape(-1, 1)  # (10000, 1)
    total_targets = torch.tensor(targets)
    # BERTTokenizer进行编码，将每一句中的单字转换成索引值，BERT中句子开头是‘CLS’，句子结尾是‘SEP’
    tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese/')


    # 固定句子长度，太长的截断，不够长的用pad填充
    input_ids = [convert_text_to_token(tokenizer, sen) for sen in sentences]

    input_tokens = torch.tensor(input_ids)

    # 一个句子中，如果对应位置为pad，则mask值为0， 否则为1
    atten_masks = attention_masks(input_ids)
    attention_tokens = torch.tensor(atten_masks)
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_tokens, total_targets, random_state=666, test_size=0.2)
    train_masks, test_masks, _, _ = train_test_split(attention_tokens, input_tokens, random_state=666, test_size=0.2)

    SEED = 123
    BATCH_SIZE = 16
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 1e-2
    EPSILON = 1e-8

    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    # 创建train、test dataloader，
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

    # 创建模型
    model = BertForSequenceClassification.from_pretrained("./bert-base-chinese/", num_labels=2)  # 调用模型
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 使用GPU加速
    model.to(device)
    # 创建优化器
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': WEIGHT_DECAY},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=EPSILON)  # 优化器

    epochs = 2  # 迭代次数
    # training steps 的数量: [number of batches] x [number of epochs].
    total_steps = len(train_dataloader) * epochs

    # 设计 learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # 开始训练
    for epoch in range(epochs):
        train_loss, train_acc = train(model, optimizer)
        print('epoch={},训练准确率={}，损失={}'.format(epoch, train_acc, train_loss))
        test_acc = evaluate(model)
        print("epoch={},测试准确率={}".format(epoch, test_acc))


Some weights of the model checkpoint at ./bert-base-chinese/ were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint

  Batch    40  of  1,231.    Elapsed: 0:00:32.
  Batch    80  of  1,231.    Elapsed: 0:01:05.
  Batch   120  of  1,231.    Elapsed: 0:01:39.
  Batch   160  of  1,231.    Elapsed: 0:02:12.
  Batch   200  of  1,231.    Elapsed: 0:02:47.
  Batch   240  of  1,231.    Elapsed: 0:03:22.
  Batch   280  of  1,231.    Elapsed: 0:03:57.
  Batch   320  of  1,231.    Elapsed: 0:04:31.
  Batch   360  of  1,231.    Elapsed: 0:05:06.
  Batch   400  of  1,231.    Elapsed: 0:05:40.
  Batch   440  of  1,231.    Elapsed: 0:06:15.
  Batch   480  of  1,231.    Elapsed: 0:06:49.
  Batch   520  of  1,231.    Elapsed: 0:07:24.
  Batch   560  of  1,231.    Elapsed: 0:07:59.
  Batch   600  of  1,231.    Elapsed: 0:08:34.
  Batch   640  of  1,231.    Elapsed: 0:09:09.
  Batch   680  of  1,231.    Elapsed: 0:09:44.
  Batch   720  of  1,231.    Elapsed: 0:10:19.
  Batch   760  of  1,231.    Elapsed: 0:10:54.
  Batch   800  of  1,231.    Elapsed: 0:11:29.
  Batch   840  of  1,231.    Elapsed: 0:12:03.
  Batch   880

In [52]:
def valid():
    test=pd.read_csv('../senti/test_set.csv',encoding='gbk')
    list2=[]
    for i in range(len(test)):
        sentence = test.iloc[i]['text'].replace(' ', '').replace('\n', '')
        label = predict(sentence).item()
        list2.append([test.iloc[i]['id'],label])
    pd.DataFrame(list2).to_csv("./key2.csv",header=None)
    print("finish")
valid()

tensor([[-4.4773,  2.9022]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.9008,  3.9230]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.9064,  3.9466]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.8837,  3.5535]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[ 3.7643, -3.8743]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.5882,  4.2643]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[ 3.5710, -3.7603]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.6358,  3.3209]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.5196,  4.2929]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[ 3.5813, -3.8328]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[ 4.0945, -4.1084]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.8222,  4.1506]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[ 4.0908, -4.0333]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-4.8891,  4.1233]], device='cuda:0', grad_fn=<AddmmBack

In [66]:
test=pd.read_csv('./key.csv',encoding='gbk')
test['1.1']=['positive' if i==1 else 'negative' for i in test['1.1']]
test.to_csv("./key2.csv",header=None)

Unnamed: 0,1,1.1
0,2,1
1,3,1
2,4,1
3,5,0
4,6,1
...,...,...
4994,4996,1
4995,4997,1
4996,4998,1
4997,4999,0
