### 使用Pytorch实现多层神经网络模型

In [None]:
pip install torch

In [2]:
import pprint
import torch
import torch.nn as nn
pp = pprint.PrettyPrinter()

## 数据

任务：给一句话中的每个单词打标签，将表示地点（location）的单词标记为1，将其它单词标记为0。

只考虑词长为1的情况。

In [65]:
#训练数据集
train_sents = [s.lower().split() for s in ["we 'll always have Paris",
                                           "I live in Germany",
                                           "He comes from Denmark",
                                           "The capital of Denmark is Copenhagen"]]
train_labels = [[0, 0, 0, 0, 1],
                [0, 0, 0, 1],
                [0, 0, 0, 1],
                [0, 0, 0, 1, 0, 1]]

#确认训练数据的长度与对应的标签数据是否一致。
assert all([len(train_sents[i]) == len(train_labels[i]) for i in range(len(train_sents))])


In [66]:
# 测试数据集
test_sents = [s.lower().split() for s in ["She comes from Paris"]]
test_labels = [[0, 0, 0, 1]]

assert all([len(test_sents[i]) == len(test_labels[i]) for i in range(len(test_sents))])

## 创建数据的分批张量


Pytorch 深度学习框架，可以针对张量（tensor）进行优化学习，这里的张量可以理解为任意维度的向量（vectors）和矩阵（matrices）。

接下来，将介绍把数据转换为词典索引的list以及构建分批张量（batch tensors）的方法。（分批的数据是模型的简便的输入形式）。

使用*torch.utils.data.DataLoader*对象处理数据的分批和迭代。

### 把分词后的句子list转换为词典索引。

假设有以下词典：

In [67]:
#id_2_word使用list存储词典，顺序即表示词在词典中的索引。
id_2_word = ["<pad>", "<unk>", "we", "always", "have", "paris",
              "i", "live", "in", "germany",
              "he", "comes", "from", "denmark",
              "the", "of", "is", "copenhagen"]

#word_2_id 使用dict表示词和词索引（word,word_indice）
word_2_id = {w:i for i,w in enumerate(id_2_word)}

In [68]:
#打印第一句话
instance = train_sents[0]
print(instance)

['we', "'ll", 'always', 'have', 'paris']


In [69]:
# 定义方法，将词（token）转换为在词典中的索引（indice）
def convert_tokens_to_inds(sentence, word_2_id):
    """Encode a tokenised sentence as a list of vocabulary indices.

    Args:
        sentence: iterable of tokens.
        word_2_id: mapping from token to index; it is indexed with "<unk>"
            once per token to obtain the fallback index.

    Returns:
        A list with one index per token; tokens missing from ``word_2_id``
        are replaced by the index stored under "<unk>".
    """
    indices = []
    for token in sentence:
        # The fallback is looked up for every token, like the eagerly
        # evaluated default argument of dict.get.
        unknown_id = word_2_id["<unk>"]
        indices.append(word_2_id.get(token, unknown_id))
    return indices

In [70]:
#打印一条替换成索引后的样本数据，也即使用词典编码后的序列。
token_inds = convert_tokens_to_inds(instance, word_2_id)
pp.pprint(token_inds)

[2, 1, 3, 4, 5]


In [71]:
#将使用索引编码的语句还原成单词表示，也即解码
print([id_2_word[tok_idx] for tok_idx in token_inds])

['we', '<unk>', 'always', 'have', 'paris']


### 对窗口进行填充。

在词窗口分类问题中，对于语句中的每个单词需要使用其左右两侧的n个单词。

为了使对任意单词的取上下文词操作都可以正常进行，需要在语句的开头之前和结尾之后进行填充。


In [72]:
#句首句尾填充方法，使用"<pad>"填充。
def pad_sentence_for_window(sentence, window_size, pad_token="<pad>"):
    """Surround a sentence with ``window_size`` padding tokens on each side.

    Args:
        sentence: list of tokens (it is concatenated with lists, so it must
            itself be a list).
        window_size: number of padding tokens added before and after.
        pad_token: token used for padding.

    Returns:
        A new list: padding, then the sentence, then padding.
    """
    padding = [pad_token] * window_size
    return padding + sentence + padding

In [73]:
#假定窗口大小为2，打印首尾填充后的语句样例。
window_size = 2
instance = pad_sentence_for_window(train_sents[0], window_size)
print(instance)

['<pad>', '<pad>', 'we', "'ll", 'always', 'have', 'paris', '<pad>', '<pad>']


测试验证在给定词典上的编码和解码操作没有问题：

In [74]:
for sent in train_sents:
    #将原始语句序列填充后转换为词典索引序列
    tok_idxs = convert_tokens_to_inds(pad_sentence_for_window(sent, window_size), word_2_id)
    #将词典索引序列还原为词序列
    print([id_2_word[idx] for idx in tok_idxs])

['<pad>', '<pad>', 'we', '<unk>', 'always', 'have', 'paris', '<pad>', '<pad>']
['<pad>', '<pad>', 'i', 'live', 'in', 'germany', '<pad>', '<pad>']
['<pad>', '<pad>', 'he', 'comes', 'from', 'denmark', '<pad>', '<pad>']
['<pad>', '<pad>', 'the', '<unk>', 'of', 'denmark', 'is', 'copenhagen', '<pad>', '<pad>']


### 使用DataLoader将语句批次化

在训练模型时，因为单条样本可能会对全局loss梯度的计算引入明显的噪声，很少每次针对一条单独的训练数据更新模型参数。因此，通常的做法是构建小规模的批数据（batches），针对每批数据更新参数。


给定batch大小，针对使用词典索引编码的样本数据进行分批化处理。

对于每个包含B条样本的输入，进行以下操作：

（1）对batch中的每条数据进行首尾填充。

（2）对batch中的每条数据的末尾进行额外填充，使得batch中的每条数据长度相等。

（3）将样本标签转换成预先设定的格式。

对于数据集：

（4）对每轮（epoch）训练，对整个训练数据进行shuffle操作。

（5）保证对输入和标签的shuffle同步进行，样本的相对顺序保持一一对应。

Pytorch提供了对象*torch.utils.data.DataLoader*，可以实现（4）和（5），对于（1），（2），（3），需要自行实现。

In [75]:
#将训练数据类别向量转换成一个LongTensor，打印张量的内容以及size。

l = torch.LongTensor(train_labels[0])
pp.pprint(("raw train label instance", l))
print(l.size())


('raw train label instance', tensor([0, 0, 0, 0, 1]))
torch.Size([5])


In [76]:
#声明一个2*len(l)的0张量，并打印其内容和维度（size）。
one_hots = torch.zeros((2, len(l)))
pp.pprint(("unfilled label instance", one_hots))
print(one_hots.size())

('unfilled label instance',
 tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]))
torch.Size([2, 5])


In [77]:
#对声明的0张量进行替换操作，转换成one-hot形式
one_hots[1] = l
pp.pprint(("one-hot labels", one_hots))

('one-hot labels', tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]]))


In [78]:
l_not = 1-l.byte()#0和1比特位转换
one_hots[0] = l_not
pp.pprint(("one-hot labels", one_hots))

('one-hot labels', tensor([[1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 1.]]))


In [79]:
from torch.utils.data import DataLoader
from functools import partial

In [93]:
def my_collate(data, window_size, word_2_id):
    """Collate (sentence, labels) pairs into padded batch tensors.

    For a batch of sentences and their per-word labels this:
      - pads every sentence with ``window_size`` "<pad>" tokens on both sides
        and encodes it with ``word_2_id``,
      - right-pads the encoded sentences to a common length,
      - turns every label sequence into one-hot rows and right-pads them.

    Args:
        data: sequence of ``(sentence, labels)`` pairs; ``sentence`` is a list
            of tokens and ``labels`` a list of 0/1 integers.
        window_size: number of padding tokens added on each side.
        word_2_id: mapping from token to vocabulary index.

    Returns:
        A tuple ``(inputs, one_hot_labels, lengths)``:
          - inputs: LongTensor, one row of token indices per sentence,
          - one_hot_labels: float tensor with a trailing dimension of 2
            (column 0 = "not location", column 1 = "location"); rows added
            by length padding are all zeros,
          - lengths: LongTensor with the unpadded label length of each
            sentence.
    """
    # Split the list of pairs back into a tuple of sentences and one of labels.
    x_s, y_s = zip(*data)

    # Window padding followed by index encoding.
    window_padded = [
        torch.LongTensor(
            convert_tokens_to_inds(pad_sentence_for_window(sentence, window_size), word_2_id)
        )
        for sentence in x_s
    ]

    # Length padding uses the vocabulary's own "<pad>" index so that it matches
    # the window padding; fall back to 0 (the previous implicit value) when the
    # vocabulary has no such entry.
    pad_id = word_2_id.get("<pad>", 0)
    padded = nn.utils.rnn.pad_sequence(window_padded, batch_first=True, padding_value=pad_id)

    # One-hot encode the labels: column 1 marks the positive class and
    # column 0 its complement. Rows created by pad_sequence stay all-zero.
    labels = []
    lengths = []
    for y in y_s:
        lengths.append(len(y))
        positive = torch.tensor(y, dtype=torch.float)
        labels.append(torch.stack((1 - positive, positive), dim=1))
    padded_labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return padded, padded_labels, torch.LongTensor(lengths)

In [94]:
# shuffle=True时对训练阶段的数据加载是有用的。
# 使用偏函数functools.partial构建一个指定了部分参数的方法。

example_loader = DataLoader(list(zip(train_sents, 
                                                      train_labels)), 
                                             batch_size=2, 
                                             shuffle=True, 
                                             collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [82]:
#对以上创建的数据加载方法进行测试：
for batched_input, batched_labels, batch_lengths in example_loader:
    pp.pprint(("inputs", batched_input, batched_input.size()))
    pp.pprint(("labels", batched_labels, batched_labels.size()))
    pp.pprint(batch_lengths)
    break

('inputs',
 tensor([[0, 0, 6, 7, 8, 9, 0, 0, 0],
        [0, 0, 2, 1, 3, 4, 5, 0, 0]]),
 torch.Size([2, 9]))
('labels',
 tensor([[[1., 0.],
         [1., 0.],
         [1., 0.],
         [0., 1.],
         [0., 0.]],

        [[1., 0.],
         [1., 0.],
         [1., 0.],
         [1., 0.],
         [0., 1.]]]),
 torch.Size([2, 5, 2]))
tensor([4, 5])


## 模型

### 词窗口的向量化

对于输入的每批语句中的每条语句i的每个单词j，我们需要基于其左右的上下文词构建一个张量表示这个单词j。

因此，我们需要一个维度为(B,L,2N+1)的词索引矩阵。

生成模拟的样本数据，使用以下数据迭代方法构建分批输入，对未填充的词索引序列构建窗口：

In [95]:
#生成一个模拟输入
dummy_input = torch.zeros(2, 8).long()
dummy_input[:,2:-2] = torch.arange(1,9).view(2,4)
pp.pprint(dummy_input)

tensor([[0, 0, 1, 2, 3, 4, 0, 0],
        [0, 0, 5, 6, 7, 8, 0, 0]])


In [54]:
#从输入数据中获取窗口对应的所有情况

dummy_output = [[[dummy_input[i, j-2+k].item() for k in range(2*2+1)] 
                                                     for j in range(2, 6)] 
                                                            for i in range(2)]
dummy_output = torch.LongTensor(dummy_output)
print(dummy_output.size())
pp.pprint(dummy_output)

torch.Size([2, 4, 5])
tensor([[[0, 0, 1, 2, 3],
         [0, 1, 2, 3, 4],
         [1, 2, 3, 4, 0],
         [2, 3, 4, 0, 0]],

        [[0, 0, 5, 6, 7],
         [0, 5, 6, 7, 8],
         [5, 6, 7, 8, 0],
         [6, 7, 8, 0, 0]]])


对于每批数据中的每条数据，对于原始语句中的每个单词（不包括窗口填充的位置），可以得到以该词为中心、包含左右上下文在内的5个词索引，但在实际应用中，这种逐元素的操作很慢。

作为更好的实现，pytorch的张量运算__Tensor.unfold__可以简便高效地实现这种操作。

In [84]:
dummy_input.unfold(1, 2*2+1, 1)

tensor([[[0, 0, 1, 2, 3],
         [0, 1, 2, 3, 4],
         [1, 2, 3, 4, 0],
         [2, 3, 4, 0, 0]],

        [[0, 0, 5, 6, 7],
         [0, 5, 6, 7, 8],
         [5, 6, 7, 8, 0],
         [6, 7, 8, 0, 0]]])

### 完整模型

使用pytorch，我们通过扩展nn.Module类实现模型。最简洁的，需要实现*\_\_init\_\_* 和*forward* function两个方法。


在*\_\_init\_\_*中存储模型参数（权重）和超参数（维度）。

In [96]:
class SoftmaxWordWindowClassifier(nn.Module):
    """Window-based word classifier with a single hidden layer.

    Every position of a window-padded sentence is classified from the
    concatenated embeddings of the tokens in a window centred on it.
    """

    def __init__(self, config, vocab_size, pad_idx=0):
        """Build the layers.

        Args:
            config: mapping providing "half_window", "embed_dim",
                "hidden_dim", "num_classes" and "freeze_embeddings".
            vocab_size: number of rows of the embedding matrix.
            pad_idx: index whose embedding is reserved for padding.
        """
        super().__init__()

        # Hyper-parameters. The full window covers the centre token plus
        # ``half_window`` tokens on each side.
        half_window = config["half_window"]
        self.window_size = 2 * half_window + 1
        self.embed_dim = config["embed_dim"]
        self.hidden_dim = config["hidden_dim"]
        self.num_classes = config["num_classes"]
        self.freeze_embeddings = config["freeze_embeddings"]

        # Embedding layer: one vector per vocabulary entry, with ``pad_idx``
        # set aside as the padding vector. Embeddings are trainable unless
        # the config asks for them to be frozen.
        self.embed_layer = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_idx)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad_(False)

        # Hidden layer: maps a flattened window of window_size * embed_dim
        # values to hidden_dim values, followed by a tanh non-linearity.
        flattened_window_dim = self.window_size * self.embed_dim
        self.hidden_layer = nn.Sequential(
            nn.Linear(flattened_window_dim, self.hidden_dim),
            nn.Tanh(),
        )

        # Output layer: maps the hidden representation to class scores.
        self.output_layer = nn.Linear(self.hidden_dim, self.num_classes)

        # Log-softmax over the class axis (axis 2 of the forward output), so
        # the result can be used directly in a negative log-likelihood loss.
        self.log_softmax = nn.LogSoftmax(dim=2)

    def forward(self, inputs):
        """Compute per-position log-probabilities.

        Notation: B = batch size, L = window-padded sentence length,
        S = self.window_size, D = self.embed_dim, H = self.hidden_dim,
        L~ = number of windows produced by the unfold below.

        Args:
            inputs: a (B, L) tensor of token indices.

        Returns:
            A (B, L~, num_classes) tensor of log-normalised class scores.
        """
        batch_size, _ = inputs.size()

        # (B, L) -> (B, L~, S): sliding windows of size S with step 1 along
        # the sentence axis.
        windows = inputs.unfold(1, self.window_size, 1)
        num_windows = windows.size(1)

        # Internal shape sanity check.
        assert windows.size() == (batch_size, num_windows, self.window_size)

        # (B, L~, S) -> (B, L~, S, D): embed every token of every window.
        embedded = self.embed_layer(windows)

        # (B, L~, S, D) -> (B, L~, S*D): concatenate the embeddings of each
        # window; -1 lets view infer the size of the last axis.
        flat_windows = embedded.view(batch_size, num_windows, -1)

        # (B, L~, S*D) -> (B, L~, H)
        hidden = self.hidden_layer(flat_windows)

        # (B, L~, H) -> (B, L~, num_classes): unnormalised class scores.
        scores = self.output_layer(hidden)

        # Log-normalise the scores over the class axis.
        return self.log_softmax(scores)

### 模型训练

使用以上设计的模型进行训练。

In [97]:
#损失函数

def loss_function(outputs, labels, lengths):
    """Average negative log-likelihood of a batch of predictions.

    Args:
        outputs: tensor of log-probabilities produced by the model.
        labels: tensor multiplied element-wise with ``outputs``; entries that
            are zero (e.g. padded positions) contribute nothing to the loss.
        lengths: tensor of per-sentence lengths; their sum is the number of
            elements the summed loss is averaged over.

    Returns:
        A scalar tensor: minus the sum of the selected log-probabilities,
        divided by the total number of elements.
    """
    # Clamp the denominator so that a batch without any real element yields
    # a loss of 0 instead of NaN (0 / 0).
    num_elems = lengths.sum().float().clamp(min=1)

    # Keep only the log-probabilities selected by non-zero label entries.
    selected = outputs * labels

    # Rescale the summed loss to an average.
    return -selected.sum() / num_elems

In [98]:
def train_epoch(loss_function, optimizer, model, train_data):
    """Run one pass over ``train_data`` and update the model after each batch.

    Args:
        loss_function: callable ``(outputs, labels, lengths) -> scalar tensor``.
        optimizer: optimizer whose ``zero_grad`` and ``step`` are called once
            per batch.
        model: module called on each input batch.
        train_data: iterable yielding ``(batch, labels, lengths)`` triples.

    Returns:
        The sum of the per-batch loss values (a Python number).
    """
    total_loss = 0
    for batch, labels, lengths in train_data:
        # Gradients accumulate across backward calls, so reset them first.
        optimizer.zero_grad()
        # Call the module itself rather than ``model.forward`` so that any
        # registered hooks run as well.
        outputs = model(batch)
        # Loss of this batch.
        loss = loss_function(outputs, labels, lengths)
        # Back-propagate starting from the loss value.
        loss.backward()
        # Update the parameters.
        optimizer.step()
        total_loss += loss.item()

    # The total lets the caller track progress from epoch to epoch.
    return total_loss
    

In [88]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 2,
          "freeze_embeddings": False,
         }
learning_rate = .0002
num_epochs = 10000
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [89]:
#训练数据加载器
# Training data loader. The collate window is taken from config["half_window"]
# so that it always matches the window the model is built with.
# NOTE(review): config["batch_size"] is 4 but the loader uses 2; the value 2 is
# kept here so the recorded training results stay comparable -- confirm which
# one is intended.
train_loader = torch.utils.data.DataLoader(
    list(zip(train_sents, train_labels)),
    batch_size=2,
    shuffle=True,
    collate_fn=partial(my_collate, window_size=config["half_window"], word_2_id=word_2_id),
)

In [90]:
#训练每迭代100次，记录一次模型损失值
losses = []
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    if epoch % 100 == 0:
        losses.append(epoch_loss)
print(losses)

[1.5760300159454346, 1.5034716129302979, 1.428852379322052, 1.3606516122817993, 1.294142723083496, 1.2307620644569397, 1.1729514002799988, 1.1177958846092224, 1.064444661140442, 1.0144432187080383, 0.9647728204727173, 0.9145443737506866, 0.8714427649974823, 0.827903687953949, 0.782893031835556, 0.7464790642261505, 0.7085845470428467, 0.672574907541275, 0.6383797824382782, 0.6020388603210449, 0.5740203559398651, 0.5464770495891571, 0.5153871178627014, 0.49353277683258057, 0.4680628776550293, 0.4465492516756058, 0.4240477979183197, 0.40504635870456696, 0.386140376329422, 0.36748890578746796, 0.34894420206546783, 0.3352568596601486, 0.32057951390743256, 0.3050563931465149, 0.29379647970199585, 0.2799808233976364, 0.27057111263275146, 0.2577608972787857, 0.24760308861732483, 0.23938872665166855, 0.23030203580856323, 0.2217102125287056, 0.213837131857872, 0.20592142641544342, 0.1975209042429924, 0.19177205860614777, 0.1852521151304245, 0.17805130779743195, 0.17332740873098373, 0.16762225329

### 预测

In [91]:
#测试数据加载器
test_loader = torch.utils.data.DataLoader(list(zip(test_sents, test_labels)), 
                                           batch_size=1, 
                                           shuffle=False, 
                                           collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [92]:
# Inference only: disable gradient tracking and call the module itself
# (instead of ``model.forward``) so that registered hooks run.
with torch.no_grad():
    for test_instance, labels, _ in test_loader:
        outputs = model(test_instance)
        # Predicted class per position, followed by the gold class per position.
        print(torch.argmax(outputs, dim=2))
        print(torch.argmax(labels, dim=2))

tensor([[0, 0, 0, 1]])
tensor([[0, 0, 0, 1]])


#### Text classification using torchtext

In [3]:
!pip install torchtext

Collecting torchtext
  Downloading https://files.pythonhosted.org/packages/e2/cc/286543384fef54588c7824803c296cbd0fa2338fb82292c5b9a35b1c96c8/torchtext-0.9.1-cp36-cp36m-win_amd64.whl (1.3MB)
Installing collected packages: torchtext
Successfully installed torchtext-0.9.1


distributed 1.21.8 requires msgpack, which is not installed.
twisted 20.3.0 has requirement attrs>=19.2.0, but you'll have attrs 18.1.0 which is incompatible.
automat 20.2.0 has requirement attrs>=19.2.0, but you'll have attrs 18.1.0 which is incompatible.
You are using pip version 10.0.1, however version 21.1.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [4]:
import torch
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')

C:\Users\zhang\OneDrive\projects\jupyter-notebook\text_mining\09-classifier\.data\train.csv: 29.5MB [00:01, 23.7MB/s]


In [5]:
next(train_iter)

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [6]:
next(train_iter)

(3,
 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.')

In [7]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')
counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))
vocab = Vocab(counter, min_freq=1)

In [8]:
[vocab[token] for token in ['here', 'is', 'an', 'example']]

[476, 22, 31, 5298]

In [9]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x) - 1

In [10]:
text_pipeline('here is the an example')

[476, 22, 3, 31, 5298]

In [11]:
label_pipeline('10')

9

In [12]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
def collate_batch(batch):
    """Collate ``(label, text)`` pairs into flat tensors for an EmbeddingBag model.

    Uses the notebook-level ``label_pipeline``, ``text_pipeline`` and
    ``device``.

    Args:
        batch: iterable of ``(label, text)`` pairs.

    Returns:
        A tuple ``(labels, text, offsets)`` of tensors moved to ``device``:
        the encoded labels, all encoded texts concatenated into one 1-D
        tensor, and the start position of every text inside that tensor.
    """
    encoded_labels = []
    encoded_texts = []
    # Leading 0 followed by the length of every text; the cumulative sum of
    # all but the last entry gives the start offset of each text.
    lengths = [0]
    for raw_label, raw_text in batch:
        encoded_labels.append(label_pipeline(raw_label))
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        encoded_texts.append(token_ids)
        lengths.append(token_ids.size(0))

    labels = torch.tensor(encoded_labels, dtype=torch.int64)
    starts = torch.tensor(lengths[:-1]).cumsum(dim=0)
    flat_text = torch.cat(encoded_texts)
    return labels.to(device), flat_text.to(device), starts.to(device)

train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

<img src='https://pytorch.org/tutorials/_images/text_sentiment_ngrams_model.png'>

In [14]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Text classifier made of an ``EmbeddingBag`` followed by a linear layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the layers and initialise their weights.

        Args:
            vocab_size: number of embeddings.
            embed_dim: size of each embedding vector.
            num_class: number of output scores.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices uniformly from [-0.5, 0.5]; zero the bias."""
        bound = 0.5
        # Embedding first, then the linear layer.
        for weight in (self.embedding.weight, self.fc.weight):
            weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Return the class scores for the bags described by ``text`` and ``offsets``."""
        pooled = self.embedding(text, offsets)
        return self.fc(pooled)

1 : World
2 : Sports
3 : Business
4 : Sci/Tec

In [15]:
train_iter = AG_NEWS(split='train')
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [16]:
import time

def train(dataloader):
    """Run one training pass over ``dataloader`` and log the running accuracy.

    Relies on the notebook-level names ``model``, ``optimizer``, ``criterion``
    and ``epoch`` (the last one is only used in the progress message).

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches; it
            must support ``len()`` for the progress message.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the gradient norm to 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            # Reset the counters so that each log line reports the accuracy
            # of the batches seen since the previous one.
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the accuracy of the notebook-level ``model`` on ``dataloader``.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Number of correct predictions divided by the number of examples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    # No gradients are needed for evaluation; the loss is not computed
    # because only the accuracy is reported.
    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [19]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 10  # number of passes over the training split
LR = 5  # initial learning rate for SGD
BATCH_SIZE = 64  # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# step_size counts scheduler steps, so it is passed as an integer.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
total_accu = None  # best validation accuracy seen so far
train_iter, test_iter = AG_NEWS()
train_dataset = list(train_iter)
test_dataset = list(test_iter)
# Hold out 5% of the training data for validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Decay the learning rate only when validation accuracy got worse;
    # otherwise remember the new best accuracy.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

C:\Users\zhang\OneDrive\projects\jupyter-notebook\text_mining\09-classifier\.data\test.csv: 1.86MB [00:00, 4.65MB/s]


| epoch   1 |   500/ 1782 batches | accuracy    0.679
| epoch   1 |  1000/ 1782 batches | accuracy    0.857
| epoch   1 |  1500/ 1782 batches | accuracy    0.875
-----------------------------------------------------------
| end of epoch   1 | time: 18.55s | valid accuracy    0.886 
-----------------------------------------------------------
| epoch   2 |   500/ 1782 batches | accuracy    0.895
| epoch   2 |  1000/ 1782 batches | accuracy    0.902
| epoch   2 |  1500/ 1782 batches | accuracy    0.902
-----------------------------------------------------------
| end of epoch   2 | time: 35.89s | valid accuracy    0.903 
-----------------------------------------------------------
| epoch   3 |   500/ 1782 batches | accuracy    0.915
| epoch   3 |  1000/ 1782 batches | accuracy    0.915
| epoch   3 |  1500/ 1782 batches | accuracy    0.914
-----------------------------------------------------------
| end of epoch   3 | time: 35.02s | valid accuracy    0.907 
-------------------------------

In [18]:
ag_news_label = {1: "World",
                 2: "Sports",
                 3: "Business",
                 4: "Sci/Tec"}

def predict(text, text_pipeline):
    """Classify one piece of text with the notebook-level ``model``.

    Args:
        text: raw input string.
        text_pipeline: callable turning the string into a list of token ids.

    Returns:
        The 1-based index of the highest-scoring class.
    """
    # Create the inputs on the device the model lives on, so the call works
    # whether or not the model has been moved to the CPU beforehand.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        # An explicit integer dtype keeps an empty token list from becoming a
        # float tensor.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64,
                                 device=model_device)
        # A single bag that starts at position 0.
        offsets = torch.tensor([0], device=model_device)
        output = model(token_ids, offsets)
        return output.argmax(1).item() + 1

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

model = model.to("cpu")

print("This is a %s news" %ag_news_label[predict(ex_text_str, text_pipeline)])

This is a Sci/Tec news
