In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from transformers import get_linear_schedule_with_warmup

In [2]:
np.random.seed(2020)
torch.manual_seed(2020)
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    torch.cuda.manual_seed(2020)
    print("Use GPU cuda")
else:
    print("Use CPU")
    

Use GPU cuda


In [3]:
# data=pd.read_csv('./online.csv',encoding='utf-8')
# data=pd.read_csv('./chnsenticorp/new.csv',encoding = 'utf-8')
data=pd.read_csv('./tencent.csv',encoding='utf-8')
# data = data[2000:]
# data=pd.read_csv('./data/ChnSentiCorp_htl_all.csv',encoding='utf-8')
from transformers import BertTokenizer,BertModel

tokenizer = BertTokenizer.from_pretrained("./chinese-bert-wwm/")
data
# result_comments=list(data['comments'].values)

Unnamed: 0,label,negative_prob,positive_prob,confidence,comments
0,0,0.990446,0.009554,0.978769,这张三丰一点清新脱俗的世外高人的感觉都没有
1,0,0.600003,0.399997,0.111118,张五侠的铁画银钩被导演吃了
2,0,0.996953,0.003047,0.993228,我都没生，芷若都这么老了
3,0,0.994541,0.005459,0.987868,老爸是最苦的啥都知道但啥都不能说可怜
4,1,0.001733,0.998267,0.996148,特别喜欢建国不知道为啥
...,...,...,...,...,...
21527,0,0.994942,0.005058,0.988759,那么多年，有没避孕措施，就生一个，奇怪...
21528,1,0.000103,0.999897,0.999771,有史以来最帅俞岱岩，怎么下的去手，心疼
21529,0,0.995962,0.004038,0.991027,就看不惯你俩！现在才多久就五哥素素的
21530,0,0.923040,0.076960,0.828978,周芷若:丁师姐比芷若大不了几岁。这都还没芷若呢


In [4]:
# 剔除标点符号,\xa0 空格
def pretreatment(comments):
    result_comments=[]
    punctuation='。，？！：%&~（）、；“”&|,.?!:%&~();""'
    for comment in comments:
        comment= ''.join([c for c in comment if c not in punctuation])
        comment= ''.join(comment.split())   #\xa0
        result_comments.append(comment)
    
    return result_comments

In [5]:
# result_comments=pretreatment(list(data['comments'].values))
result_comments=list(data['comments'].values)
result = tokenizer.encode_plus('这视频有点东西')
result
# token = tokenizer.convert_ids_to_tokens(result)
# token
# result_comments

{'input_ids': [101, 6821, 6228, 7574, 3300, 4157, 691, 6205, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
result_comments_id=tokenizer(result_comments,padding=True,truncation=True,max_length=100,return_tensors='pt')
result_comments_id
# for i in range(len(result_comments)):
#     result_comments_id[i] = tokenizer(result_comments[i],padding=True,truncation=True,max_length=200,return_tensors='pt')
# result_comments_id

{'input_ids': tensor([[ 101, 6821, 2476,  ...,    0,    0,    0],
        [ 101, 2476,  758,  ...,    0,    0,    0],
        [ 101, 2769, 6963,  ...,    0,    0,    0],
        ...,
        [ 101, 2218, 4692,  ...,    0,    0,    0],
        [ 101, 1453, 5711,  ...,    0,    0,    0],
        [ 101, 5307, 1073,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [7]:
result_comments_id['input_ids'].shape

torch.Size([21532, 62])

In [8]:
from sklearn.model_selection import train_test_split
X=result_comments_id['input_ids']
y=torch.from_numpy(data['label'].values).float()
predict_data = pd.read_csv('./prediction.csv',encoding='utf-8')
predict_comments=list(predict_data['comments'].values)
predict_comments_id=tokenizer(predict_comments,padding=True,truncation=True,max_length=100,return_tensors='pt')
X_pred = predict_comments_id['input_ids']
y_pred = torch.from_numpy(predict_data['label'].values).float()
X_train,X_test, y_train, y_test =train_test_split(X,y,test_size=0.3,shuffle=True,stratify=y,random_state=2020)

In [9]:
print(len(X_train))
print(len(X_test))
print(X_train.shape)
print(y_train.shape)

15072
6460
torch.Size([15072, 62])
torch.Size([15072])


In [10]:
X_valid,X_test,y_valid,y_test=train_test_split(X_test,y_test,test_size=0.5,shuffle=True,stratify=y_test,random_state=2020)
len(X_valid),len(X_test)

(3230, 3230)

In [11]:
# create Tensor datasets
train_data = TensorDataset(X_train, y_train)
valid_data = TensorDataset(X_valid, y_valid)
test_data = TensorDataset(X_test,y_test)
pred_data = TensorDataset(X_pred,y_pred)
# print(X_test.shape)
# dataloaders
batch_size = 32

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,drop_last=True)
pred_loader = DataLoader(pred_data, shuffle=False, batch_size=10,drop_last=False)

In [12]:
if(USE_CUDA):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [13]:
class bert_lstm(nn.Module):
    def __init__(self, hidden_dim,output_size,n_layers,bidirectional=True, drop_prob=0.5):
        super(bert_lstm, self).__init__()
 
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        
        #Bert ----------------重点，bert模型需要嵌入到自定义模型里面
        self.bert=BertModel.from_pretrained("./chinese-bert-wwm/")
        for param in self.bert.parameters():
            param.requires_grad = True
        
        # LSTM layers
        self.lstm = nn.LSTM(768, hidden_dim, n_layers, batch_first=True,bidirectional=bidirectional)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # linear and sigmoid layers
        if bidirectional:
            self.fc = nn.Linear(hidden_dim*2, output_size)
        else:
            self.fc = nn.Linear(hidden_dim, output_size)
          
        #self.sig = nn.Sigmoid()
 
    def forward(self, x, hidden):
        batch_size = x.size(0)
        #生成bert字向量
        x=self.bert(x)[0]     #bert 字向量
        
        # lstm_out
        #x = x.float()
        lstm_out, (hidden_last,cn_last) = self.lstm(x, hidden)
        #print(lstm_out.shape)   #[32,100,768]
        #print(hidden_last.shape)   #[4, 32, 384]
        #print(cn_last.shape)    #[4, 32, 384]
        
        #修改 双向的需要单独处理
        if self.bidirectional:
            #正向最后一层，最后一个时刻
            hidden_last_L=hidden_last[-2]
            #print(hidden_last_L.shape)  #[32, 384]
            #反向最后一层，最后一个时刻
            hidden_last_R=hidden_last[-1]
            #print(hidden_last_R.shape)   #[32, 384]
            #进行拼接
            hidden_last_out=torch.cat([hidden_last_L,hidden_last_R],dim=-1)
            #print(hidden_last_out.shape,'hidden_last_out')   #[32, 768]
        else:
            hidden_last_out=hidden_last[-1]   #[32, 384]
            
            
        # dropout and fully-connected layer
        out = self.dropout(hidden_last_out)
        #print(out.shape)    #[32,768]
        out = self.fc(out)
        
        return out
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        number = 1
        if self.bidirectional:
            number = 2
        
        if (USE_CUDA):
            hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda(),
                      weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float().cuda()
                     )
        else:
            hidden = (weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float(),
                      weight.new(self.n_layers*number, batch_size, self.hidden_dim).zero_().float()
                     )
        
        return hidden

In [14]:
output_size = 2
hidden_dim = 384   #768/2
n_layers = 6
bidirectional = True  #这里为True，为双向LSTM

net = bert_lstm(hidden_dim, output_size,n_layers, bidirectional)

#print(net)

Some weights of the model checkpoint at ./chinese-bert-wwm/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# loss and optimization functions
lr=2e-5
epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# training params

# batch_size=50
print_every = 10
clip=5 # gradient clipping
 
# move model to GPU, if available
if(USE_CUDA):
    net.cuda()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
print(len(train_loader.dataset))
print(len(valid_loader.dataset))

15072
3230


In [None]:

# train for some number of epochs
for e in range(epochs):
    net.train()
    # initialize hidden state
    h = net.init_hidden(batch_size)
    counter = 0
    train_acc = 0
    valid_acc = 0
    num_correct = 0
    train_loss = []
    valid_loss = []
    # batch loop
    for inputs, labels in train_loader:
        counter += 1
        
        if(USE_CUDA):
            inputs, labels = inputs.cuda(), labels.cuda()
            # labels = labels.view(64, 1)
            # print(labels.shape)
        h = tuple([each.data for each in h])
        net.zero_grad()
        
        output= net(inputs, h)
        output.cuda()
        output = output.squeeze()
        # print(output.shape)
        # print(labels)
        # print(output)
        loss = criterion(output, labels.long())
        train_loss.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        output=torch.nn.Softmax(dim=1)(output)
        pred=torch.max(output, 1)[1]

        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.numpy()) if not USE_CUDA else np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)
        # print(num_correct)
    train_acc = num_correct/len(train_loader.dataset)
        # loss stats
    num_correct = 0
    
    net.eval()
    with torch.no_grad():
        val_h = net.init_hidden(batch_size)
        # val_losses = []
        for inputs, labels in valid_loader:
            val_h = tuple([each.data for each in val_h])

            if(USE_CUDA):
                inputs, labels = inputs.cuda(), labels.cuda()

            output = net(inputs, val_h)
            val_loss = criterion(output.squeeze(), labels.long())
            valid_loss.append(val_loss.item())
            output=torch.nn.Softmax(dim=1)(output)
            pred=torch.max(output, 1)[1]

                # compare predictions to true label
            correct_tensor = pred.eq(labels.float().view_as(pred))
            correct = np.squeeze(correct_tensor.numpy()) if not USE_CUDA else np.squeeze(correct_tensor.cpu().numpy())
            num_correct += np.sum(correct)
            # print(num_correct)
        valid_acc = num_correct/len(valid_loader.dataset)
    
    #test
    test_losses = [] # track loss
    num_correct = 0
    TN = 0
    TP = 0
    FN = 0
    FP = 0 
    # init hidden state
    h = net.init_hidden(batch_size)
    
    net.eval()
    # iterate over test data
    for inputs, labels in test_loader:
        h = tuple([each.data for each in h])
        if(USE_CUDA):
            inputs, labels = inputs.cuda(), labels.cuda()
        output = net(inputs, h)
        test_loss = criterion(output.squeeze(), labels.long())
        test_losses.append(test_loss.item())
        output=torch.nn.Softmax(dim=1)(output)
        pred=torch.max(output, 1)[1]
        pred_numpy = pred.detach().cpu().numpy()
        labels_numpy = labels.detach().cpu().numpy()
        # total_f1 += flat_f1(pred_numpy, labels_numpy)
        # total_recall += flat_recall(pred_numpy, labels_numpy)
        # total_precision += flat_precision(pred_numpy, labels_numpy)
        # total_acc += flat_accuracy(pred_numpy, labels_numpy)
        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.numpy()) if not USE_CUDA else np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)
        TP += ((pred_numpy == 1) & (labels_numpy == 1)).sum()
        TN += ((pred_numpy == 0) & (labels_numpy == 0)).sum()
        FN += ((pred_numpy == 0) & (labels_numpy == 1)).sum()
        FP += ((pred_numpy == 1) & (labels_numpy == 0)).sum()

    print(TP)
    print(TN)
    print(FP)
    print(FN)
    p = TP / (TP + FP)
    r = TP / (TP + FN)
    F1 = 2 * r * p / (r + p)
    acc = (TP + TN) / (TP + TN + FP + FN)
    print("Test loss: {:.3f}".format(np.mean(test_losses)))
    
    # accuracy over all test data
    test_acc = num_correct/len(test_loader.dataset)
    
    # net.train()
    print("Epoch: {}/{}...".format(e+1, epochs),
          "Loss: {:.6f}...".format(np.mean(train_loss)),
          "train accracy: {:.4f}...".format(train_acc),
           "Val Loss: {:.6f}...".format(np.mean(valid_loss)),
           "valid accracy: {:.4f}.".format(valid_acc))
    print()
    print("Test accuracy: {:.4f}".format(test_acc),
    "Sk test accuracy: {:.4f}".format(acc),
    "F1: {:.4f}".format(F1),
    "Precision: {:.4f}".format(p),
    "Recall: {:.4f}".format(r))

1028
1737
113
322
Test loss: 0.326
Epoch: 1/10... Loss: 0.481312... train accracy: 0.7767... Val Loss: 0.350450... valid accracy: 0.8477.

Test accuracy: 0.8560 Sk test accuracy: 0.8641 F1: 0.8254 Precision: 0.9010 Recall: 0.7615
1186
1697
153
164
Test loss: 0.259
Epoch: 2/10... Loss: 0.282328... train accracy: 0.8875... Val Loss: 0.277742... valid accracy: 0.8728.

Test accuracy: 0.8926 Sk test accuracy: 0.9009 F1: 0.8821 Precision: 0.8857 Recall: 0.8785
1155
1724
125
196
Test loss: 0.278
Epoch: 3/10... Loss: 0.207530... train accracy: 0.9275... Val Loss: 0.306935... valid accracy: 0.8765.

Test accuracy: 0.8913 Sk test accuracy: 0.8997 F1: 0.8780 Precision: 0.9023 Recall: 0.8549
1132
1740
112
216
Test loss: 0.342
Epoch: 4/10... Loss: 0.149167... train accracy: 0.9532... Val Loss: 0.375534... valid accracy: 0.8786.

Test accuracy: 0.8892 Sk test accuracy: 0.8975 F1: 0.8735 Precision: 0.9100 Recall: 0.8398
1168
1736
112
184
Test loss: 0.338
Epoch: 5/10... Loss: 0.121610... train accrac

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

def flat_accuracy(preds, labels):
    
    """A function for calculating accuracy scores"""
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)
def flat_f1(preds, labels):
    
    """A function for calculating accuracy scores"""
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat)
def flat_recall(preds, labels):
    
    """A function for calculating accuracy scores"""
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, pred_flat)
def flat_precision(preds, labels):
    
    """A function for calculating accuracy scores"""
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, pred_flat)

In [None]:
test_losses = [] # track loss
num_correct = 0
TN = 0
TP = 0
FN = 0
FP = 0 
# init hidden state
h = net.init_hidden(batch_size)
 
net.eval()
# iterate over test data
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    if(USE_CUDA):
        inputs, labels = inputs.cuda(), labels.cuda()
    output = net(inputs, h)
    
    test_loss = criterion(output.squeeze(), labels.long())
    test_losses.append(test_loss.item())
    output=torch.nn.Softmax(dim=1)(output)
    # print(output.shape)
    pred=torch.max(output, 1)[1]
    pred_numpy = pred.detach().cpu().numpy()
    labels_numpy = labels.detach().cpu().numpy()
    # total_f1 += flat_f1(pred_numpy, labels_numpy)
    # total_recall += flat_recall(pred_numpy, labels_numpy)
    # total_precision += flat_precision(pred_numpy, labels_numpy)
    # total_acc += flat_accuracy(pred_numpy, labels_numpy)
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not USE_CUDA else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
    TP += ((pred_numpy == 1) & (labels_numpy == 1)).sum()
    TN += ((pred_numpy == 0) & (labels_numpy == 0)).sum()
    FN += ((pred_numpy == 0) & (labels_numpy == 1)).sum()
    FP += ((pred_numpy == 1) & (labels_numpy == 0)).sum()

print(TP)
print(TN)
print(FP)
print(FN)
p = TP / (TP + FP)
r = TP / (TP + FN)
F1 = 2 * r * p / (r + p)
acc = (TP + TN) / (TP + TN + FP + FN)
print("Test loss: {:.3f}".format(np.mean(test_losses)))
 
# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.4f}".format(test_acc))
print("Sk test accuracy: {:.4f}".format(acc))
print("F1: {:.4f}".format(F1))
print("Precision: {:.4f}".format(p))
print("Recall: {:.4f}".format(r))

1168
1738
118
176
Test loss: 0.468
Test accuracy: 0.8997
Sk test accuracy: 0.9081
F1: 0.8882
Precision: 0.9082
Recall: 0.8690


In [None]:
torch.save(net, 'bert_lstm_tencent.pt')

In [None]:
# test_losses = [] # track loss
# num_correct = 0
# TN = 0
# TP = 0
# FN = 0
# FP = 0 
# # init hidden state
net = torch.load('bert_lstm_tencent.pt')
h = net.init_hidden(10)
 
net.eval()
# iterate over test data
for inputs, labels in pred_loader:
    h = tuple([each.data for each in h])
    if(USE_CUDA):
        inputs, labels = inputs.cuda(), labels.cuda()
    output = net(inputs, h)
    print(output)
    # test_loss = criterion(output.squeeze(), labels.long())
    # test_losses.append(test_loss.item())
    output=torch.nn.Softmax(dim=1)(output)
    # _,output = torch.max(output, 1)
    # print(output)
    pred=torch.max(output, 1)[1]
    pred_numpy = pred.detach().cpu().numpy()
    labels_numpy = labels.detach().cpu().numpy()
    
print(output)
print(pred_numpy)

tensor([[-2.9395,  3.0506],
        [ 2.7989, -2.7241],
        [-2.9547,  3.0668],
        [-2.9531,  3.0652],
        [ 2.9301, -2.8449],
        [-1.3944,  1.4334],
        [ 3.2558, -3.1441],
        [ 2.5662, -2.5126],
        [ 3.2521, -3.1407],
        [ 3.1321, -3.0314]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[0.0025, 0.9975],
        [0.9960, 0.0040],
        [0.0024, 0.9976],
        [0.0024, 0.9976],
        [0.9969, 0.0031],
        [0.0558, 0.9442],
        [0.9983, 0.0017],
        [0.9938, 0.0062],
        [0.9983, 0.0017],
        [0.9979, 0.0021]], device='cuda:0', grad_fn=<SoftmaxBackward>)
[1 0 1 1 0 1 0 0 0 0]
