## 查看分配的GPU

In [4]:
!nvidia-smi

Thu Mar 18 06:24:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 挂载谷歌云盘

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 导入包

In [6]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/d8/5144b0712f7f82229a8da5983a8fbb8d30cec5fbd5f8d12ffe1854dcea67/transformers-4.4.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 17.2MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 49.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 53.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=ac1b

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer,BertModel
from torchtext.legacy import data,datasets
import numpy as np
import time
import random

## 参数

In [8]:
SEED = 1234
TRAIN = False
BATCH_SIZE = 128
N_EPOCHS = 5
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

## 固定模型种子，便于重复实验

In [39]:
TEXT = 'I hate you!'
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 应用transformers中Tokenizer

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
init_token_id = tokenizer.cls_token_id
eos_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id
unk_token_id = tokenizer.unk_token_id

max_input_len = tokenizer.max_model_input_sizes['bert-base-uncased']

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




## 将句子长度分割成510长，为了加上开头和最后一个token

In [11]:
def tokenizer_and_crop(sentence):
  tokens = tokenizer.tokenize(sentence)
  tokens = tokens[:max_input_len-2]
  return tokens

## 加载pytorch提供的的IMDB数据

In [12]:
def load_data():
  text = data.Field(
      batch_first = True,
      use_vocab = False,
      tokenize = tokenizer_and_crop,
      preprocessing = tokenizer.convert_tokens_to_ids,
      init_token = init_token_id,
      pad_token = pad_token_id,
      unk_token = unk_token_id
  )
  label = data.LabelField(dtype = torch.float)

  train_data,test_data = datasets.IMDB.splits(text,label)
  train_data,valid_data = train_data.split(random_state = random.seed(SEED))

  print(f"training example count:{len(train_data)}")
  print(f"test example count:{len(test_data)}")
  print(f"validation example count:{len(valid_data)}")

  label.build_vocab(train_data)

  train_iter,valid_iter,test_iter = data.BucketIterator.splits(
      (train_data,valid_data,test_data),
      batch_size = BATCH_SIZE,
      device = device
  )
  return train_iter,valid_iter,test_iter

## 查看是否有GPU

In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cup'
print(device)

cuda


## 通过transformer包建立bert模型

In [14]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




## 此处用bert作为基础模型完成情感分析任务
    在Bert之上加俩层GRU
    在最后一层线性层用于完成分类任务

In [15]:
class SentimentModel(nn.Module):
  def __init__(self,bert,hidden_dim,output_dim,n_layers,bidirectional,dropout):
    super(SentimentModel,self).__init__()

    self.bert = bert
    embedding_dim = bert.config.to_dict()['hidden_size']
    self.rnn = nn.GRU(
        embedding_dim,
        hidden_dim,
        num_layers = n_layers,
        bidirectional = bidirectional,
        batch_first = True,
        dropout = 0 if n_layers < 2 else dropout,
    )
    self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim,output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self,text):
    with torch.no_grad():
      embedded = self.bert(text)[0]
    
    _,hidden = self.rnn(embedded)

    if self.rnn.bidirectional:
      hidden= self.dropout(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim = 1))
    else:
      hidden = self.dropout(hidden[-1,:,:])
    output = self.out(hidden)

    return output

## 完整模型

In [16]:
model = SentimentModel(
    bert_model,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT
)
# print(model)

## 一个epoch 需要多长时间

In [17]:
def epoch_time(start_time,end_time):
  elasped_time = end_time - start_time
  elasped_mins = int(elasped_time/60)
  elasped_secs = int(elasped_time - (elasped_mins * 60))
  return elasped_mins,elasped_secs

## 二分类问题的accuracy

In [18]:
def binary_accuracy(preds,y):
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds == y).float()
  acc = correct.sum()/len(correct)
  return acc

## 一个训练步

In [19]:
def train(model,iterator,optimizer,criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)
    loss = criterion(predictions,batch.label)
    acc = binary_accuracy(predictions,batch.label)

    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss/len(iterator),epoch_acc/len(iterator)

## 验证模型

In [20]:
def evalute(model,iterator,criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.eval()

  with torch.no_grad():
    for batch in iterator:
      predictions = model(batch.text).squeeze(1)
      loss = criterion(predictions,batch.label)
      acc = binary_accuracy(predictions,batch.label)
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss/len(iterator),epoch_acc/len(iterator)

## 预测模型

In [37]:
def predict_sentiment(model,tokenizer,sentence):
  model.eval()
  tokens = tokenizer.tokenize(sentence)
  tokens = tokens[:max_input_len-2]
  indexed = [init_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_id]
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(0)
  prediction = torch.sigmoid(model(tensor))
  return prediction.item()

## 训练代码

In [33]:
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks'

In [35]:
train_iter,valid_iter,test_iter = load_data()

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time =time.time()
  #训练一个epoch
  train_loss,train_acc = train(model,train_iter,optimizer,criterion)
  valid_loss,valid_acc = evalute(model,valid_iter,criterion)

  end_time = time.time()
  epoch_mins,epoch_secs = epoch_time(start_time,end_time)

  if valid_loss < best_val_loss:
    best_val_loss = valid_loss
    torch.save(model.state_dict(),'{}/model.pt'.format(BASE_DIR))

  print(f'Epoch:{epoch+1:02} | Epoch Time:{epoch_mins}m{epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100 :.2f}%')
  print(f'\t Val Loss: {valid_loss:.3f} | Val Acc: {valid_acc * 100:.2f}%')

training example count:17500
test example count:25000
validation example count:7500
Epoch:01 | Epoch Time:16m37s
	Train Loss: 0.592 | Train Acc: 67.21%
	 Val Loss: 0.410 | Val Acc: 81.99%
Epoch:02 | Epoch Time:16m46s
	Train Loss: 0.442 | Train Acc: 79.67%
	 Val Loss: 0.278 | Val Acc: 88.80%
Epoch:03 | Epoch Time:16m44s
	Train Loss: 0.388 | Train Acc: 82.50%
	 Val Loss: 0.260 | Val Acc: 89.59%
Epoch:04 | Epoch Time:16m46s
	Train Loss: 0.340 | Train Acc: 85.14%
	 Val Loss: 0.247 | Val Acc: 90.47%
Epoch:05 | Epoch Time:16m47s
	Train Loss: 0.310 | Train Acc: 86.97%
	 Val Loss: 0.243 | Val Acc: 90.33%


## 预测测试

In [40]:
model.load_state_dict(torch.load('{}/model.pt'.format(BASE_DIR), map_location=device))
sentiment = predict_sentiment(model, tokenizer, TEXT)
print(sentiment)

0.10581449419260025
