卷积神经网络基础知识参考 https://github.com/fengdu78/deeplearning_ai_books

text cnn 本质上是把序列数据看成图像类数据，用CNN处理图像的方式，使用卷积层来提取文本中的特征，从而实现分类任务

此jupyter notebook 运行在colab 可以节省空间

In [1]:
!pip install torch torchtext numpy



In [1]:
from torchtext import data,datasets

TEXT = data.Field(lower=True,batch_first=True)
LABEL = data.Field(sequential=False)

# make splits for data
train, val, test = datasets.SST.splits(TEXT, LABEL, 'data/',fine_grained=True)


ModuleNotFoundError: No module named 'torchtext'

In [3]:
# TEXT.build_vocab(train, vectors="fasttext.en.300d")
TEXT.build_vocab(train, vectors="glove.840B.300d")
LABEL.build_vocab(train,val,test)

.vector_cache/glove.840B.300d.zip: 2.18GB [01:06, 32.5MB/s]                            
100%|█████████▉| 2195414/2196017 [04:02<00:00, 10493.84it/s]

In [4]:
print('len(TEXT.vocab)', len(TEXT.vocab))
print(LABEL.vocab.itos)
print('len(LABEL.vocab)', len(LABEL.vocab)-1)   # vocab include '<unk>'
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())


len(TEXT.vocab) 16581
['<unk>', 'negative', 'positive', 'neutral', 'very positive', 'very negative']
len(LABEL.vocab) 5
TEXT.vocab.vectors.size() torch.Size([16581, 300])


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

_DEBUG=False

def ilog(*args,**kwargs):
    if _DEBUG:
        print(*args,**kwargs)
    
class textCNN(nn.Module):
    def __init__(self,args):
        super().__init__()
        dim = args['dim']
        n_class = args['n_class']
        embedding_matrix=args['embedding_matrix']
        kernels=[3,4,5]
        kernel_number=[100,100,100]
        self.embeding = nn.Embedding.from_pretrained(embedding_matrix)
        self.convs = nn.ModuleList([nn.Conv2d(1, number, (size, dim),padding=(size-1,0)) for (size,number) in zip(kernels,kernel_number)])
        self.dropout=nn.Dropout()
        self.out = nn.Linear(sum(kernel_number), n_class)
 
    def forward(self, x):
        ilog('ori input',x.size())
        x = self.embeding(x)
        ilog('after embeding',x.size())
        x = x.unsqueeze(1)
        ilog('unsqueeze',x.size())
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        ilog(x[0].size())
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.out(x)
        return x
    
class textCNNMulti(nn.Module):
    def __init__(self,args):
        super().__init__()
        dim = args['dim']
        n_class = args['n_class']
        embedding_matrix=args['embedding_matrix']
        kernels=[3,4,5]
        kernel_number=[100,100,100]
        self.static_embed = nn.Embedding.from_pretrained(embedding_matrix)
        self.non_static_embed = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.convs = nn.ModuleList([nn.Conv2d(2, number, (size, dim),padding=(size-1,0)) for (size,number) in zip(kernels,kernel_number)])
        self.dropout=nn.Dropout()
        self.out = nn.Linear(sum(kernel_number), n_class)
 
    def forward(self, x):
        ilog('ori input',x.size())
        non_static_input = self.non_static_embed(x)
        static_input = self.static_embed(x)
        x = torch.stack([non_static_input, static_input], dim=1)
        ilog('after embeding',x.size())
        ilog('unsqueeze',x.size())
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        ilog(x[0].size())
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.out(x)
        return x


class textCNNNonStatic(nn.Module):
    def __init__(self,args):
        super().__init__()
        dim = args['dim']
        n_class = args['n_class']
        embedding_matrix=args['embedding_matrix']
        kernels=[3,4,5]
        kernel_number=[100,100,100]
        self.embeding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.convs = nn.ModuleList([nn.Conv2d(1, number, (size, dim),padding=(size-1,0)) for (size,number) in zip(kernels,kernel_number)])
        self.dropout=nn.Dropout()
        self.out = nn.Linear(sum(kernel_number), n_class)
 
    def forward(self, x):
        ilog('ori input',x.size())
        x = self.embeding(x)
        ilog('after embeding',x.size())
        x = x.unsqueeze(1)
        ilog('unsqueeze',x.size())
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        ilog(x[0].size())
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.out(x)
        return x

In [0]:
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_sizes=(32, 256, 256),shuffle=True)


In [7]:
args={}
args['vocb_size']=len(TEXT.vocab)
args['dim']=300
args['n_class']=len(LABEL.vocab)-1
args['embedding_matrix']=TEXT.vocab.vectors
args['lr']=1e-5
args['epochs']=400
args['log_interval']=20
args['test_interval']=100
args['save_dir']='./'

print(args['vocb_size'])
print(args['n_class'])

16581
5


In [8]:
import os
import sys

import torch

import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(1)


<torch._C.Generator at 0x7f9b3c78a9d0>

In [9]:
from collections import deque

def save(model, save_dir, save_prefix, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
    torch.save(model.state_dict(), save_path)

model=textCNNMulti(args)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
criterion = nn.CrossEntropyLoss()

best_acc = 0
last_step = 0
model.train()
steps=0


def create_early_stopping(patience):
    recent_metric = deque(maxlen=patience)
    best_metric = None

    def check(metric, model):
        nonlocal best_metric
        is_stop = False
        if not best_metric or metric > best_metric:
            print('save best_model.pt, metric: {}'.format(metric))
            best_metric = metric
            torch.save(model, 'best_model.pt')

        recent_metric.append(metric)

        if all([i < best_metric for i in recent_metric]):
            is_stop = True
        return is_stop

    return check


def eval(data_iter, model, args):
    model.eval()
    corrects, avg_loss = 0, 0
    for i,data in enumerate(data_iter):
        x, target = data.text, data.label
        x=x.cuda()
 
        target.sub_(1)
        target=target.cuda()

        logit = model(x)
        loss = F.cross_entropy(logit, target, reduction='sum')

        avg_loss += loss.item()
        corrects += (torch.max(logit, 1)
                     [1].view(target.size()).data == target.data).sum()

    size = len(data_iter.dataset)
    avg_loss /= size
    accuracy = 100.0 * int(corrects)/size
    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss, 
                                                                       accuracy, 
                                                                       corrects, 
                                                                       size))
    model.train()
    return accuracy

early_stop = create_early_stopping(150)

for epoch in range(1, args['epochs']+1):
    for i,data in enumerate(train_iter):
        steps+=1

        x, target = data.text, data.label
        x=x.cuda()


        target.sub_(1)
        target=target.cuda()

        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if steps % args['log_interval'] == 0:
            corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).sum()
            accuracy = 100.0 * int(corrects)/data.batch_size
            print(
                'Epoch [{}] Batch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(epoch,
                                                                         steps, 
                                                                         loss.item(), 
                                                                         accuracy,
                                                                         corrects,
                                                                         data.batch_size))
        if steps % args['test_interval'] == 0:
            val_acc = eval(val_iter, model, args)
            is_stop = early_stop(val_acc, model)
            if is_stop:
                raise RuntimeError('early stop')

        model.train()
print('final_result')


Epoch [1] Batch[20] - loss: 1.603016  acc: 25.0000%(8/32)
Epoch [1] Batch[40] - loss: 1.610136  acc: 12.5000%(4/32)
Epoch [1] Batch[60] - loss: 1.583441  acc: 15.6250%(5/32)
Epoch [1] Batch[80] - loss: 1.560337  acc: 31.2500%(10/32)
Epoch [1] Batch[100] - loss: 1.565055  acc: 25.0000%(8/32)

Evaluation - loss: 1.575489  acc: 28.8828%(318/1101) 

save best_model.pt, metric: 28.88283378746594


  "type " + obj.__name__ + ". It won't be checked "


Epoch [1] Batch[120] - loss: 1.581214  acc: 15.6250%(5/32)
Epoch [1] Batch[140] - loss: 1.548652  acc: 34.3750%(11/32)
Epoch [1] Batch[160] - loss: 1.469564  acc: 46.8750%(15/32)
Epoch [1] Batch[180] - loss: 1.547225  acc: 25.0000%(8/32)
Epoch [1] Batch[200] - loss: 1.635803  acc: 31.2500%(10/32)

Evaluation - loss: 1.562015  acc: 28.4287%(313/1101) 

Epoch [1] Batch[220] - loss: 1.592762  acc: 25.0000%(8/32)
Epoch [1] Batch[240] - loss: 1.579985  acc: 18.7500%(6/32)
Epoch [1] Batch[260] - loss: 1.597851  acc: 12.5000%(4/32)
Epoch [2] Batch[280] - loss: 1.564279  acc: 21.8750%(7/32)
Epoch [2] Batch[300] - loss: 1.464458  acc: 43.7500%(14/32)

Evaluation - loss: 1.553691  acc: 31.3351%(345/1101) 

save best_model.pt, metric: 31.33514986376022
Epoch [2] Batch[320] - loss: 1.606509  acc: 15.6250%(5/32)
Epoch [2] Batch[340] - loss: 1.514509  acc: 40.6250%(13/32)
Epoch [2] Batch[360] - loss: 1.587173  acc: 34.3750%(11/32)
Epoch [2] Batch[380] - loss: 1.580979  acc: 34.3750%(11/32)
Epoch [2]

RuntimeError: ignored

In [10]:
best_model=torch.load('best_model.pt')
best_model.eval()
eval(test_iter, best_model, args)


Evaluation - loss: 1.215641  acc: 45.7466%(1011/2210) 



45.74660633484163