<a href="https://colab.research.google.com/github/yaoziove/TextClassification/blob/master/TextCls_6_DPCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!git clone https://github.com/yaoziove/TextClassification.git

fatal: destination path 'TextClassification' already exists and is not an empty directory.


In [0]:
ls

[0m[01;34msample_data[0m/  [01;34mTextClassification[0m/


In [0]:
cd TextClassification/

/content/TextClassification


In [0]:
ls

[0m[01;34mChinese-Text-Classification-Pytorch[0m/


In [0]:
cd Chinese-Text-Classification-Pytorch/

/content/TextClassification/Chinese-Text-Classification-Pytorch


In [0]:
ls

LICENSE  links.txt  [0m[01;34mmodels[0m/  README.md  run.py  [01;34mTHUCNews[0m/


#### 1. Utils
Data preprocessing helpers: vocabulary building, dataset loading and batching.

In [0]:
import os
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time 
from datetime import timedelta

MAX_VOCAB_SIZE = 10000   # upper bound on the vocabulary size
UNK,PAD = '<UNK>','<PAD>' # tokens for unknown characters and for padding

def build_vocab(file_path,tokenizer,max_size,min_freq):
  """Build a token -> index vocabulary from a tab-separated text file.

  Every non-empty line is split on tabs and only its first field is
  tokenized. Tokens occurring fewer than `min_freq` times are dropped, the
  rest are ranked by descending frequency and at most `max_size` are kept.
  UNK and PAD are appended afterwards and receive the two highest indices.

  Args:
    file_path: path of the UTF-8 text file to scan.
    tokenizer: callable turning a string into an iterable of tokens.
    max_size: maximum number of tokens kept (UNK and PAD not included).
    min_freq: minimum occurrence count required to keep a token.

  Returns:
    dict mapping each kept token (plus UNK and PAD) to an integer index.
  """
  counts = {}
  with open(file_path,"r",encoding="utf-8") as f:
    for line in tqdm(f):
      line = line.strip()
      if not line:
        continue
      content = line.split('\t')[0]
      for word in tokenizer(content):
        counts[word] = counts.get(word,0)+1

  # The file is no longer needed once the counts are collected.
  frequent = [item for item in counts.items() if item[1]>=min_freq]
  vocab_list = sorted(frequent,key=lambda x:x[1],reverse=True)[:max_size]
  vocab_dic = {word:idx for idx,(word,_) in enumerate(vocab_list)}
  # Fixed: the original called the non-existent method `undate`, which raised
  # AttributeError. Both lengths are evaluated before the update happens, so
  # UNK gets index n and PAD gets index n+1.
  vocab_dic.update({UNK:len(vocab_dic),PAD:len(vocab_dic)+1})
  return vocab_dic

def build_dataset(config,ues_word):
  """Load (or build) the vocabulary and encode the train/dev/test files.

  Args:
    config: object providing `vocab_path`, `train_path`, `dev_path`,
      `test_path` and `pad_size`.
    ues_word: if truthy, tokenize on single spaces (word level); otherwise
      every character is a token (char level).

  Returns:
    (vocab, train, dev, test) where `vocab` maps token -> index and each
    dataset is a list of (token_ids, label, seq_len) tuples.
  """
  if ues_word:
    tokenizer = lambda x:x.split(' ')  # word level: tokens separated by spaces
  else:
    tokenizer = list                   # char level: one token per character

  if os.path.exists(config.vocab_path):
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    with open(config.vocab_path,'rb') as vocab_file:
      vocab = pkl.load(vocab_file)
  else:
    vocab = build_vocab(config.train_path,tokenizer=tokenizer,max_size=MAX_VOCAB_SIZE,min_freq=1)
    # `with` guarantees the pickle is flushed and the handle is closed.
    with open(config.vocab_path,'wb') as vocab_file:
      pkl.dump(vocab,vocab_file)
  print(f"Vocab size: {len(vocab)}")

  def load_dataset(path, pad_size=32):
    """Encode one `text<TAB>label` file into (token_ids, label, seq_len) tuples."""
    unk_id = vocab.get(UNK)  # fallback id for out-of-vocabulary tokens
    contents = []
    with open(path, 'r', encoding='UTF-8') as f:
      for line in tqdm(f):
        lin = line.strip()
        if not lin:
          continue
        content, label = lin.split('\t')
        token = tokenizer(content)
        seq_len = len(token)
        if pad_size:
          if len(token) < pad_size:
            # short sample: pad on the right, seq_len keeps the real length
            token.extend([PAD]*(pad_size-len(token)))
          else:
            # long sample: truncate, seq_len is capped at pad_size
            token = token[:pad_size]
            seq_len = pad_size
        # token -> id
        words_line = [vocab.get(word, unk_id) for word in token]
        contents.append((words_line, int(label), seq_len))
    return contents

  train = load_dataset(config.train_path, config.pad_size)
  dev = load_dataset(config.dev_path, config.pad_size)
  test = load_dataset(config.test_path, config.pad_size)
  return vocab, train, dev, test

class DatasetIterater(object):
  """Re-usable mini-batch iterator over a list of samples.

  Each sample is indexed as (token_ids, label, seq_len). Iteration yields
  `((x, seq_len), y)` tuples of LongTensors placed on `device`. After the last
  batch the internal cursor is reset, so the same object can be iterated again.

  Args:
    batches: list of samples (the full dataset, despite the name).
    batch_size: number of samples per batch.
    device: torch device the tensors are moved to.
  """
  def __init__(self, batches, batch_size, device):
    self.batch_size = batch_size
    self.batches = batches
    self.n_batches = len(batches) // batch_size  # number of full batches
    # True when a final, smaller batch is needed. Fixed: the original tested
    # `len(batches) % self.n_batches`, which dropped trailing samples in many
    # cases and divided by zero when len(batches) < batch_size.
    self.residue = len(batches) % batch_size != 0
    self.index = 0
    self.device = device

  def _to_tensor(self, datas):
    """Stack a list of samples into ((x, seq_len), y) LongTensors on self.device."""
    x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
    y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

    # length before padding (capped at pad_size by the loader)
    seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
    return (x,seq_len),y

  def __next__(self):
    if self.residue and self.index == self.n_batches:
      # trailing partial batch
      batches = self.batches[self.index*self.batch_size:len(self.batches)]
      self.index += 1
      batches = self._to_tensor(batches)
      return batches

    elif self.index >= self.n_batches:
      # exhausted: rewind so the iterator can be reused next epoch
      self.index = 0
      raise StopIteration
    else:
      batches = self.batches[self.index*self.batch_size:(self.index+1)*self.batch_size]
      self.index += 1
      batches = self._to_tensor(batches)
      return batches

  def __iter__(self):
    return self

  def __len__(self):
    """Number of batches per pass, counting the partial one if present."""
    if self.residue:
      return self.n_batches + 1
    else:
      return self.n_batches

def build_iterator(dataset, config):
  """Return a DatasetIterater over `dataset` using config.batch_size and config.device."""
  return DatasetIterater(dataset, config.batch_size, config.device)

def get_time_dif(start_time):
  """Return the time elapsed since `start_time` as a timedelta rounded to whole seconds."""
  elapsed_seconds = time.time() - start_time
  whole_seconds = int(round(elapsed_seconds))
  return timedelta(seconds=whole_seconds)

#### 2. Model definition

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class Config(object):
  """Hyper-parameters and file locations for the DPCNN experiment.

  Args:
    dataset: root directory of the dataset; files are read from
      `<dataset>/data/`.
    embedding: file name (inside `<dataset>/data/`) of an .npz archive holding
      an "embeddings" array, or the string 'random' to train embeddings from
      scratch.
  """
  def __init__(self,dataset,embedding):
    self.model_name = 'DPCNN'
    self.train_path = dataset + '/data/train.txt'    # training set
    self.dev_path = dataset + '/data/dev.txt'        # validation set
    self.test_path = dataset + '/data/test.txt'      # test set
    # class names, one per line; `with` closes the handle the original leaked
    with open(dataset+'/data/class.txt',encoding='utf-8') as class_file:
      self.class_list = [x.strip() for x in class_file.readlines()]
    self.vocab_path = dataset + '/data/vocab.pkl'    # pickled vocabulary
    self.save_path = dataset + '/' + self.model_name + '.ckpt'  # trained model checkpoint
    self.log_path = dataset + '/log/' + self.model_name
    if embedding != 'random':
      # np.load on an .npz keeps the archive open until closed, so read the
      # array inside a context manager.
      with np.load(dataset + '/data/'+ embedding) as npz_file:
        self.embedding_pretrained = torch.tensor(npz_file["embeddings"].astype('float32'))
    else:
      self.embedding_pretrained = None
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    self.dropout = 0.5                              # dropout probability
    self.require_improvement = 1000                 # stop early after this many batches without improvement
    self.num_classes = len(self.class_list)         # number of classes
    self.n_vocab = 0                                # vocabulary size, assigned at run time
    self.num_epochs = 20                            # number of epochs
    self.batch_size = 128
    self.pad_size = 32                              # every sentence is padded/truncated to this length
    self.learning_rate = 1e-3
    # embedding dimension: taken from the pretrained matrix when available
    self.embed = self.embedding_pretrained.size(1) if self.embedding_pretrained is not None else 300
    self.num_filters = 250                          # number of convolution filters


"""
    DPCNN
"""
class Model(nn.Module):
  """DPCNN text classifier.

  A region convolution spanning the full embedding width is followed by two
  (3,1) convolutions, then by repeated down-sampling blocks with a shortcut
  connection until the sequence axis has length 1, and finally by a linear
  classifier.

  Args:
    config: object providing `embedding_pretrained`, `n_vocab`, `embed`,
      `num_filters`, `num_classes` and `dropout`.
  """
  def __init__(self,config):
    super(Model,self).__init__()
    if config.embedding_pretrained is not None:
      # start from the pretrained vectors but keep fine-tuning them
      self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained,freeze=False)
    else:
      # the last vocabulary index is used as the padding index
      self.embedding = nn.Embedding(config.n_vocab,config.embed,padding_idx=config.n_vocab - 1)

    self.conv_region = nn.Conv2d(1,config.num_filters,(3,config.embed),stride=1)
    self.conv = nn.Conv2d(config.num_filters,config.num_filters,(3,1),stride=1)
    self.max_pool = nn.MaxPool2d(kernel_size=(3,1),stride=2)
    self.padding1 = nn.ZeroPad2d((0,0,1,1))  # one row on top and one at the bottom
    self.padding2 = nn.ZeroPad2d((0,0,0,1))  # one row at the bottom
    self.relu = nn.ReLU()
    self.fc = nn.Linear(config.num_filters,config.num_classes)
    self.dropout = nn.Dropout(config.dropout)

  def forward(self,x):
    """Compute class logits.

    Args:
      x: tuple whose first element is the LongTensor of token ids; the other
        elements are ignored.

    Returns:
      Tensor of shape [batch_size, num_classes].
    """
    x = x[0]
    x = self.dropout(self.embedding(x))
    x = x.unsqueeze(1)       # [batch_size, 1, seq_len, embed_dim]
    x = self.conv_region(x)  # [batch_size, num_filters, seq_len-3+1, 1]

    x = self.padding1(x)     # [batch_size, num_filters, seq_len, 1]
    x = self.relu(x)
    x = self.conv(x)         # [batch_size, num_filters, seq_len-3+1, 1]
    x = self.padding1(x)     # [batch_size, num_filters, seq_len, 1]
    x = self.relu(x)
    x = self.conv(x)         # [batch_size, num_filters, seq_len-3+1, 1]
    # Each block halves the sequence axis (floor division). Looping until the
    # length is 1 (the original stopped at <= 2) also covers sequence lengths
    # that reach 2, which previously failed in `fc` with a shape mismatch.
    while x.size(2) > 1:
      x = self._block(x)
    # Drop only the two trailing singleton axes. A bare squeeze() would also
    # remove the batch axis when batch_size == 1.
    x = x.squeeze(-1).squeeze(-1)  # [batch_size, num_filters]
    x = self.fc(x)
    return x

  def _block(self, x):
    """Down-sample by max-pooling, apply two pre-activation convs, add the shortcut."""
    x = self.padding2(x)
    px = self.max_pool(x)

    x = self.padding1(px)
    x = self.relu(x)
    x = self.conv(x)

    x = self.padding1(x)
    x = self.relu(x)
    x = self.conv(x)

    # shortcut connection
    x = x + px
    return x

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

        
# Stand-alone copies of the DPCNN layers, used only to trace how the tensor
# shape changes from one step to the next.
conv_region = nn.Conv2d(1,256,(3,300),stride=1)
conv = nn.Conv2d(256,256,(3,1),stride=1)
max_pool = nn.MaxPool2d(kernel_size=(3,1),stride=2)
padding1 = nn.ZeroPad2d((0,0,1,1))  # one row on top and one at the bottom
padding2 = nn.ZeroPad2d((0,0,0,1))  # one row at the bottom
relu = nn.ReLU()
fc = nn.Linear(256,10)

import numpy as np


def _show(step, label, tensor):
  """Print '<step> orig data after <label>: <shape>' and hand the tensor back."""
  print(f"{step} orig data after {label}: {tensor.shape}")
  return tensor


# Region embedding followed by two pre-activation convolutions.
# The printed labels are kept verbatim; steps 6 and 9 apply `conv` even though
# their label reads "conv_region".
x = _show(1, "embed", torch.rand(64,32,300))
x = _show(2, "unsqueeze", x.unsqueeze(1))
x = _show(3, "conv_region", conv_region(x))
x = _show(4, "padding1", padding1(x))
x = _show(5, "relu", relu(x))
x = _show(6, "conv_region", conv(x))
x = _show(7, "padding1", padding1(x))
x = _show(8, "relu", relu(x))
x = _show(9, "conv_region", conv(x))

print("---------------------------------------------------------------")
# One down-sampling block: pad, pool, two convolutions, shortcut.
x = _show(10, "padding2", padding2(x))

px = _show(11, "max_pool__px", max_pool(x))

x = _show(12, "padding1", padding1(px))
x = _show(13, "conv", conv(F.relu(x)))

x = _show(14, "padding1", padding1(x))
x = _show(15, "conv", conv(F.relu(x)))

x = _show(16, "x + px", x+px)
print("---------------------------------------------------------------")

1 orig data after embed: torch.Size([64, 32, 300])
2 orig data after unsqueeze: torch.Size([64, 1, 32, 300])
3 orig data after conv_region: torch.Size([64, 256, 30, 1])
4 orig data after padding1: torch.Size([64, 256, 32, 1])
5 orig data after relu: torch.Size([64, 256, 32, 1])
6 orig data after conv_region: torch.Size([64, 256, 30, 1])
7 orig data after padding1: torch.Size([64, 256, 32, 1])
8 orig data after relu: torch.Size([64, 256, 32, 1])
9 orig data after conv_region: torch.Size([64, 256, 30, 1])
---------------------------------------------------------------
10 orig data after padding2: torch.Size([64, 256, 31, 1])
11 orig data after max_pool__px: torch.Size([64, 256, 15, 1])
12 orig data after padding1: torch.Size([64, 256, 17, 1])
13 orig data after conv: torch.Size([64, 256, 15, 1])
14 orig data after padding1: torch.Size([64, 256, 17, 1])
15 orig data after conv: torch.Size([64, 256, 15, 1])
16 orig data after x + px: torch.Size([64, 256, 15, 1])
---------------------------

In [0]:
!pip install tensorboardX



#### 3. Train and evaluate

In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from tensorboardX import SummaryWriter

#权重初始化，默认Xavier
def init_network(model,method='xavier',exclude='embedding',seed=123):
  """Re-initialise the parameters of `model` in place.

  Parameters whose name contains `exclude` are left untouched. Parameters with
  'weight' in their name are initialised with Xavier-normal ('xavier'),
  Kaiming-normal ('kaiming') or a plain normal distribution (any other
  `method`); parameters with 'bias' in their name are set to 0. `seed` is
  accepted but not used by this function.
  """
  weight_initializers = {
    'xavier': nn.init.xavier_normal_,
    'kaiming': nn.init.kaiming_normal_,
  }
  init_weight = weight_initializers.get(method, nn.init.normal_)

  for name,param in model.named_parameters():
    if exclude in name:
      continue
    if 'weight' in name:
      init_weight(param)
    elif 'bias' in name:
      nn.init.constant_(param,0)

def train(config,model,train_iter,dev_iter,test_iter):
  """Train `model` with Adam, validating every 100 batches, then run the test set.

  The checkpoint at `config.save_path` is overwritten whenever the validation
  loss improves. Training stops early once more than
  `config.require_improvement` batches pass without improvement. Scalars are
  logged to a time-stamped directory under `config.log_path`.

  Args:
    config: object providing `learning_rate`, `num_epochs`, `log_path`,
      `save_path` and `require_improvement`.
    model: the network to optimise (modified in place).
    train_iter, dev_iter, test_iter: iterables yielding (inputs, labels).
  """
  start_time = time.time()
  model.train()
  optimizer = torch.optim.Adam(model.parameters(),lr=config.learning_rate)

  # Exponential learning-rate decay (disabled): lr = gamma * lr after each epoch
  #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

  total_batch = 0                   # number of batches processed so far
  dev_best_loss = float('inf')      # best validation loss seen so far
  last_improve = 0                  # batch index of the last validation improvement
  flag = False                      # set when early stopping triggers
  writer = SummaryWriter(log_dir=config.log_path+'/'+time.strftime('%m-%d_%H.%M',time.localtime()))

  # try/finally so the event file is closed even if training is interrupted
  # or raises.
  try:
    for epoch in range(config.num_epochs):
      print('Epoch [{}/{}]'.format(epoch+1,config.num_epochs))
      #scheduler.step()           # learning-rate decay (disabled)

      for i,(trains,labels) in enumerate(train_iter):
        outputs = model(trains)
        model.zero_grad()
        loss = F.cross_entropy(outputs,labels)
        loss.backward()
        optimizer.step()

        if total_batch % 100 == 0:
          # periodically report accuracy on the current batch and on the dev set
          true = labels.data.cpu()
          predic = torch.max(outputs.data,1)[1].cpu()

          train_acc = metrics.accuracy_score(true,predic)
          dev_acc,dev_loss = evaluate(config,model,dev_iter)
          if dev_loss < dev_best_loss:
            dev_best_loss = dev_loss
            torch.save(model.state_dict(),config.save_path)
            improve = '*'
            last_improve = total_batch
          else:
            improve = ''
          time_dif = get_time_dif(start_time)
          msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
          print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
          writer.add_scalar("loss/train", loss.item(), total_batch)
          writer.add_scalar("loss/dev", dev_loss, total_batch)
          writer.add_scalar("acc/train", train_acc, total_batch)
          writer.add_scalar("acc/dev", dev_acc, total_batch)
          # evaluate() switches the model to eval mode; switch back
          model.train()
        total_batch += 1
        if total_batch - last_improve > config.require_improvement:
          # validation loss has not improved for too many batches: stop
          print("No optimization for a long time, auto-stopping...")
          flag = True
          break
      if flag:
        break
  finally:
    writer.close()
  test(config,model,test_iter)

def test(config,model,test_iter):
  """Load the checkpoint at `config.save_path` and report test-set metrics.

  Prints the loss, accuracy, per-class precision/recall/F1 report, confusion
  matrix and the time spent evaluating.
  """
  model.load_state_dict(torch.load(config.save_path))
  model.eval()
  # Fixed: the original stored the timer in the misspelt `start_tiem`, so the
  # `get_time_dif(start_time)` call below read an unrelated global instead.
  start_time = time.time()
  test_acc, test_loss, test_report, test_confusion = evaluate(config,model,test_iter,test=True)
  msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
  print(msg.format(test_loss, test_acc))
  print("Precision, Recall and F1-Score...")
  print(test_report)
  print("Confusion Matrix...")
  print(test_confusion)
  time_dif = get_time_dif(start_time)
  print("Time usage:", time_dif)

def evaluate(config,model,data_iter,test=False):
  """Run `model` over `data_iter` without gradients and score its predictions.

  The model is left in eval mode. The reported loss is the mean of the
  per-batch cross-entropy losses.

  Returns:
    (accuracy, mean_loss), or (accuracy, mean_loss, report, confusion_matrix)
    when `test` is true; the report uses `config.class_list` as class names.
  """
  model.eval()
  running_loss = 0
  all_preds = np.array([],dtype=int)
  all_labels = np.array([],dtype=int)
  with torch.no_grad():
    for batch_inputs,batch_labels in data_iter:
      logits = model(batch_inputs)
      running_loss += F.cross_entropy(logits, batch_labels).item()
      batch_labels_np = batch_labels.data.cpu().numpy()
      # index of the largest logit = predicted class
      batch_preds = torch.max(logits.data,1)[1].cpu()
      all_labels = np.append(all_labels,batch_labels_np)
      all_preds = np.append(all_preds,batch_preds)

  acc = metrics.accuracy_score(all_labels,all_preds)
  if not test:
    return acc,running_loss / len(data_iter)

  report = metrics.classification_report(all_labels,all_preds,target_names=config.class_list,digits=4)
  confusion = metrics.confusion_matrix(all_labels,all_preds)
  return acc,running_loss / len(data_iter),report,confusion


In [0]:
# Extract the pretrained vectors that correspond to the vocabulary.
train_dir = "./THUCNews/data/train.txt"
vocab_dir = "./THUCNews/data/vocab.pkl"
pretrain_dir = "./THUCNews/data/sgns.sogou.word"
emb_dim = 300
filename_trimmed_dir = "./THUCNews/data/embedding_SougouNews.npz"

if os.path.exists(vocab_dir):
  # NOTE(review): pickle can execute arbitrary code; only load trusted files.
  with open(vocab_dir, 'rb') as vocab_file:
    word_to_id = pkl.load(vocab_file)
else:
  tokenizer = lambda x:[y for y in x]  # build the vocabulary at character level
  word_to_id = build_vocab(train_dir,tokenizer=tokenizer,max_size=MAX_VOCAB_SIZE,min_freq=1)
  # `with` guarantees the pickle is flushed and the handle is closed
  with open(vocab_dir,'wb') as vocab_file:
    pkl.dump(word_to_id,vocab_file)

print(f'shape of word_to_is is {len(word_to_id)}')

# Disabled: build the trimmed embedding matrix from the pretrained vectors
# embeddings = np.random.rand(len(word_to_id),emb_dim)
# f = open(pretrain_dir, "r", encoding='UTF-8')
# for i, line in enumerate(f.readlines()):
#   if i == 0:  # skip the first line if it is a header
#     continue
#   lin = line.strip().split(" ")
#   if lin[0] in word_to_id:
#     idx = word_to_id[lin[0]]
#     emb = [float(x) for x in lin[1:301]]
#     embeddings[idx] = np.asarray(emb, dtype='float32')
# f.close()
# np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)

# Load the trimmed embedding matrix; the context manager closes the .npz
# archive once the array has been read.
with np.load(filename_trimmed_dir) as npz_file:
  embeddings = npz_file["embeddings"]
print(f'shape of embedding vocab is : {embeddings.shape}')

shape of word_to_is is 4762
shape of embedding vocab is : (4762, 300)


In [0]:
# Dataset root directory and the pretrained-embedding archive inside <dataset>/data/.
dataset = 'THUCNews'
embedding = 'embedding_SougouNews.npz'
config = Config(dataset,embedding)

# Seed NumPy and PyTorch (CPU and all CUDA devices) before any data or model is created.
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # make results repeatable across runs

start_time = time.time()
print("Loading data...")

# Second argument False selects character-level tokenization in build_dataset.
vocab,train_data,dev_data,test_data = build_dataset(config,False)
train_iter = build_iterator(train_data, config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

0it [00:00, ?it/s]

Loading data...
Vocab size: 4762


180000it [00:03, 58850.59it/s]
10000it [00:00, 65334.69it/s]
10000it [00:00, 67072.05it/s]

Time usage: 0:00:03





In [0]:
# Train: the vocabulary size is only known after the data has been loaded,
# so it is filled in here before the model is built.
config.n_vocab = len(vocab)

model = Model(config).to(config.device)
init_network(model)
# Prints the repr of the bound method `parameters`, which embeds the module structure.
print(model.parameters)
train(config,model,train_iter,dev_iter,test_iter)

<bound method Module.parameters of Model(
  (embedding): Embedding(4762, 300)
  (conv_region): Conv2d(1, 250, kernel_size=(3, 300), stride=(1, 1))
  (conv): Conv2d(250, 250, kernel_size=(3, 1), stride=(1, 1))
  (max_pool): MaxPool2d(kernel_size=(3, 1), stride=2, padding=0, dilation=1, ceil_mode=False)
  (padding1): ZeroPad2d(padding=(0, 0, 1, 1), value=0.0)
  (padding2): ZeroPad2d(padding=(0, 0, 0, 1), value=0.0)
  (relu): ReLU()
  (fc): Linear(in_features=250, out_features=10, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)>
Epoch [1/20]
Iter:      0,  Train Loss:   2.3,  Train Acc:  7.81%,  Val Loss:   2.6,  Val Acc: 10.00%,  Time: 0:00:01 *
Iter:    100,  Train Loss:  0.84,  Train Acc: 67.97%,  Val Loss:  0.79,  Val Acc: 72.79%,  Time: 0:00:06 *
Iter:    200,  Train Loss:  0.74,  Train Acc: 77.34%,  Val Loss:  0.56,  Val Acc: 81.97%,  Time: 0:00:11 *
Iter:    300,  Train Loss:  0.45,  Train Acc: 85.94%,  Val Loss:  0.53,  Val Acc: 83.32%,  Time: 0:00:16 *
Iter:    400,  Trai

In [0]:
model

Model(
  (embedding): Embedding(4762, 300)
  (conv_region): Conv2d(1, 250, kernel_size=(3, 300), stride=(1, 1))
  (conv): Conv2d(250, 250, kernel_size=(3, 1), stride=(1, 1))
  (max_pool): MaxPool2d(kernel_size=(3, 1), stride=2, padding=0, dilation=1, ceil_mode=False)
  (padding1): ZeroPad2d(padding=(0, 0, 1, 1), value=0.0)
  (padding2): ZeroPad2d(padding=(0, 0, 0, 1), value=0.0)
  (relu): ReLU()
  (fc): Linear(in_features=250, out_features=10, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)