### 任务二：基于深度学习的文本分类

In [35]:
import torchtext
from torchtext import data,datasets
from torchtext.vocab import GloVe,FastText,CharNGram
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
#from torch.autograd import Variable
import torch
#import sys
#import numpy as np
#import pandas as pd
#from sklearn.model_selection import train_test_split

In [36]:
is_cuda = False

if torch.cuda.is_available():
    is_cuda=True
    print("Run on GPU.")

Run on GPU.


In [37]:
### 数据集分割 train_set.csv and test_set.csv
## 注意：只需运行一次

# df_train = pd.read_csv("data_task1/train.tsv", sep="\t")
# X, y = df_train['Phrase'],df_train['Sentiment']
# #print(X[:5],'\n')
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)
# #print(X_train[:5], '\n')
# train = pd.concat([X_train,y_train], axis = 1)
# #print(train[:5], '\n')
# train.to_csv('data_task1/train_set.csv')
# test = pd.concat([X_test, y_test], axis = 1)
# test.to_csv('data_task1/test_set.csv')

In [38]:
TEXT = data.Field(lower=True, batch_first=True, fix_length=100)
LABEL = data.Field(use_vocab = False, sequential=False)

In [39]:
train_data = data.TabularDataset(
        path='data_task1/train_set.csv',format='csv',
        skip_header=True,
        fields = [('Index', None), ('Phrase', TEXT),('Sentiment',LABEL)])
valid_data = data.TabularDataset(
        path='data_task1/test_set.csv',format='csv',
        skip_header=True,
        fields = [('Index', None), ('Phrase', TEXT),('Sentiment',LABEL)])

In [40]:
print(vars(train_data[25]))

{'Phrase': ['there', "'s", 'no', 'energy', '.'], 'Sentiment': '0'}


In [41]:
TEXT.build_vocab(train_data, vectors = GloVe(name='6B', dim=300), min_freq = 10)
LABEL.build_vocab(train_data, )

BATCH_SIZE = 54

device = torch.device('cuda' if is_cuda else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    device = device,
    shuffle = True,
    sort_key = lambda x: len(x.Phrase),
    repeat = False)
# train_iterator.repeat = False
# valid_iterator.repeat = False

In [42]:
## Test
batch = next(iter(train_iterator))
batch.Phrase
print(batch.Phrase.size())

torch.Size([54, 100])


In [43]:
batch.Sentiment

tensor([0, 0, 2, 1, 4, 1, 2, 2, 3, 3, 2, 2, 2, 2, 1, 2, 2, 3, 2, 1, 2, 3, 1, 1,
        2, 3, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 3, 4, 2, 3, 2, 2, 3, 2, 2,
        3, 2, 3, 2, 3, 2], device='cuda:0')

In [44]:
n_vocab = len(TEXT.vocab)
print(n_vocab)
n_hidden = 1000

8371


In [64]:
class SAMRRnn(nn.Module):
    '''  LSTM '''
    def __init__(self, n_vocab, n_hidden, n_cat, bs = 1):
        super().__init__()
        self.n_hidden = n_hidden
        self.bs = bs # batch_size
        self.e = nn.Embedding(n_vocab, n_hidden) # embedding,[vocab_size, embed_dim]
        self.rnn = nn.LSTM(n_hidden, n_hidden) # [input_size, output_size]
        self.fc2 = nn.Linear(n_hidden, n_cat) # [output_size, n_category] 5分类
        self.softmax = nn.LogSoftmax(dim = -1)
        
    def forward(self, x):
        bs = x.size()[0]
        if bs != self.bs:
            self.bs = bs
        e_out = self.e(x) # [batch_size,seqlen,dim]
        rnn_o, _ = self.rnn(e_out) # [batch_size,seqlen,output_dim]
        mean_token_output = torch.mean(rnn_o, dim=1) # Mean
        fc = F.dropout(self.fc2(mean_token_output), p = 0.1) # Dropout
        return self.softmax(fc) # softmax

In [65]:
net = SAMRRnn(n_vocab, n_hidden, 5, 54) ## LSTM
print(net)
if is_cuda:
    net.cuda()

SAMRRnn(
  (e): Embedding(8371, 1000)
  (rnn): LSTM(1000, 1000)
  (fc2): Linear(in_features=1000, out_features=5, bias=True)
  (softmax): LogSoftmax()
)


In [None]:
# ## 使用预训练Embedding
# net.e.weight.data = TEXT.vocab.vectors.cuda()
# net.e.weight.requires_grad = False
# optimizer = optim.Adam([param for param in net.parameters() if param.requires_grad == True],lr=1e-3)

In [66]:
optimizer = optim.Adam(net.parameters(),lr=1e-3)

In [67]:
## For training
for epoch in range(5):
    
    net.train() # 训练模式
    
    running_loss = 0.0
    running_correct = 0
    for batch_idx, batch in enumerate(train_iterator):
        text, label = batch.Phrase , batch.Sentiment
        if torch.cuda.is_available():
            text, label = text.cuda(), label.cuda()
        optimizer.zero_grad()
        prediction = net(text)
        
        loss = F.nll_loss(prediction,label)
        
        running_loss += F.nll_loss(prediction,label,size_average=False).item()
        preds = prediction.data.max(dim=1,keepdim=True)[1]
        running_correct += preds.eq(label.data.view_as(preds)).cpu().sum()
            
            
        loss.backward()
        optimizer.step()

    loss = running_loss / len(train_iterator.dataset)
    accuracy = 100.0 * running_correct / len(train_iterator.dataset)
    
    print(f'Training loss for epoch {epoch + 1} is {loss} and accuracy is {running_correct}/{len(train_iterator.dataset)} = {accuracy}%')
        
#     prediction = net(batch.Phrase)
    
#     loss = F.nll_loss(prediction,batch.Sentiment)
    
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
#     if t % 5 == 0:
#         print('Loss:', loss.detach().numpy())

Training loss for epoch 1 is 1.13603887535111 and accuracy is 59221/109242 = 54%
Training loss for epoch 2 is 0.9715498869099713 and accuracy is 66505/109242 = 60%
Training loss for epoch 3 is 0.916336085795003 and accuracy is 69105/109242 = 63%
Training loss for epoch 4 is 0.8851783196030995 and accuracy is 70590/109242 = 64%
Training loss for epoch 5 is 0.8619078225531286 and accuracy is 71719/109242 = 65%


In [68]:
## For validating
    
net.eval() #测试模式
volatile = True
    
running_loss = 0.0
running_correct = 0
    
for batch_idx, batch in enumerate(valid_iterator):
    text, label = batch.Phrase , batch.Sentiment
    if torch.cuda.is_available():
        text, label = text.cuda(), label.cuda()
            
#       optimizer.zero_grad()
    prediction = net(text)
        
    loss = F.nll_loss(prediction,label)
        
    running_loss += F.nll_loss(prediction,label,size_average=False).item()
    preds = prediction.data.max(dim=1,keepdim=True)[1]
    running_correct += preds.eq(label.data.view_as(preds)).cpu().sum()
            
            
#         loss.backward()
#         optimizer.step()

loss = running_loss/len(valid_iterator.dataset)
accuracy = 100. * running_correct/len(valid_iterator.dataset)
    
print(f'Valid loss is {loss} and accuracy is {running_correct}/{len(valid_iterator.dataset)} = {accuracy}%')


Valid loss is 1.0688864415721313 and accuracy is 28547/46818 = 60%


In [None]:
class SAMRCnn(nn.Module):
    '''CNN'''
    def __init__(self, n_vocab, n_hidden, n_cat, n_kernel, kernel_sizes, bs = 1):
        super().__init__()
        
        Vocab = n_vocab ## 已知词的数量
        Dim = n_hidden ##每个词向量长度
        Cla = n_cat  ##类别数
        Ci = 1  ##输入的channel数
        Knum = n_kernel ## 每种卷积核的数量
        Ks = kernel_sizes ## 卷积核list，形如[2,3,4]
        self.bs = bs
        self.embed = nn.Embedding(Vocab,Dim) ## 词向量，这里直接随机
        
        self.convs = nn.ModuleList([nn.Conv2d(Ci,Knum,(K,Dim)) for K in Ks]) ## 卷积层
        self.dropout = nn.Dropout(args.dropout) 
        self.fc = nn.Linear(len(Ks)*Knum,Cla) ##全连接层
        
    def forward(self,x):
        x = self.embed(x) #(N,W,D)
        
        x = x.unsqueeze(1) #(N,Ci,W,D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # len(Ks)*(N,Knum,W)
        x = [F.max_pool1d(line,line.size(2)).squeeze(2) for line in x]  # len(Ks)*(N,Knum)
        
        x = torch.cat(x,1) #(N,Knum*len(Ks))
        
        x = self.dropout(x)
        logit = self.fc(x)
        return logit
