In [3]:
import torch
import torchtext
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data

In [2]:
torch.__version__

'1.9.0'

In [3]:
torchtext.__version__

'0.10.0'

# TorchText文本分类数据集

In [4]:
# Load the IMDB train and test splits; each is an iterator over (label, text) pairs.
train_iter, test_iter = torchtext.datasets.IMDB()

In [5]:
# Peek at one training sample. Note: next() consumes it from the iterator.
next(train_iter)

('neg',
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [6]:
# Peek at the following training sample (this one is consumed as well).
next(train_iter)

('neg',
 '"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, 

In [7]:
# Re-create the iterators: the next() calls in the cells above already consumed
# samples from train_iter, so start over to get the complete splits.
train_iter, test_iter = torchtext.datasets.IMDB()
# Materialize both splits as lists so they can be traversed more than once.
train_data, test_data = list(train_iter), list(test_iter)

In [8]:
# Distinct labels present in the training split, and how many there are.
all_classes = {sentiment for sentiment, _ in train_data}
num_class = len(all_classes)

In [9]:
from torchtext.data.utils import get_tokenizer            # 分词工具
from torchtext.vocab import build_vocab_from_iterator     # 创建词表工具

In [10]:
tokenizer = get_tokenizer('basic_english')      # initialize the tokenizer

In [11]:
def yield_tokens(data):
    """Lazily yield the token list of each sample's text.

    Args:
        data: iterable of 2-element samples; the first element is ignored
            and the second is passed to the module-level ``tokenizer``.

    Yields:
        Whatever ``tokenizer`` returns for each sample's text, in order.
    """
    yield from (tokenizer(review) for _, review in data)

# Build the vocabulary from the training split only. "<pad>" and "<unk>" are
# registered as special tokens; min_freq=3 is the minimum token frequency
# required for a token to be included.
vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<pad>", "<unk>"], min_freq=3)

In [12]:
# Tokens that are not in the vocabulary are looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])

In [15]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` and look the tokens up in ``vocab``."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Encode a label as an int: 1 when ``x == 'pos'``, otherwise 0."""
    return int(x == 'pos')

In [16]:
text_pipeline('this is a book about pytorch')

[14, 10, 6, 276, 50, 1]

In [17]:
label_pipeline('pos')

1

In [18]:
from torch.utils.data import DataLoader

In [19]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [20]:
def collate_batch(batch):
    """Collate (label, text) samples into the tensors the model's forward expects.

    The encoded texts are concatenated into one flat 1-D tensor; ``offsets``
    holds the start position of each sample inside that flat tensor.

    Args:
        batch: sequence of (label, text) pairs as stored in the dataset lists.

    Returns:
        Tuple ``(labels, token_ids, offsets)``, each moved to ``device``.
    """
    encoded_labels, encoded_texts, lengths = [], [], []
    for raw_label, raw_text in batch:
        encoded_labels.append(label_pipeline(raw_label))
        ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        encoded_texts.append(ids)
        lengths.append(ids.size(0))

    labels = torch.tensor(encoded_labels)
    token_ids = torch.cat(encoded_texts)
    # Sample i starts where samples 0..i-1 end: the first starts at 0 and the
    # length of the last sample is never needed.
    offsets = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    return labels.to(device), token_ids.to(device), offsets.to(device)

In [21]:
# Training batches are reshuffled; test batches keep the dataset order.
# Both loaders build their batches with collate_batch.
train_dataloader = DataLoader(
    train_data,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    test_data,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_batch,
)

# 创建模型

In [22]:
class TextClassificationModel(nn.Module):
    """Text classifier: an ``nn.EmbeddingBag`` followed by one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # sparse=True makes the embedding table produce sparse gradients.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return class logits, one row per bag.

        Args:
            text: flat 1-D tensor of token indices for the whole batch.
            offsets: 1-D tensor with the start index of each bag in ``text``.
        """
        pooled = self.embedding(text, offsets)
        logits = self.fc(pooled)
        return logits

In [23]:
# Seed the global torch RNG right before the first stochastic step (weight
# initialization) so a fresh Restart & Run All starts from the same random
# state instead of producing different numbers on every run.
SEED = 42
torch.manual_seed(SEED)

vocab_size = len(vocab)   # one embedding row per vocabulary entry
emsize = 100              # embedding dimension
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [24]:
from torch.optim import lr_scheduler

# Cross-entropy loss on the class logits, optimized with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# StepLR scales the learning rate by gamma=0.1 every 20 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=20,
    gamma=0.1,
)

In [25]:
def train(dataloader):
    """Run one optimization pass over ``dataloader`` with the global model.

    Uses the module-level ``model``, ``loss_fn`` and ``optimizer``.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Tuple ``(loss, accuracy)``: the batch losses weighted by batch size
        and divided by the number of samples, and the fraction of samples
        whose argmax prediction equals the label.
    """
    correct = 0
    seen = 0
    loss_sum = 0
    model.train()
    for label, text, offsets in dataloader:
        logits = model(text, offsets)
        batch_loss = loss_fn(logits, label)

        # Clear stale gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Bookkeeping only; no gradients are needed for the metrics.
        with torch.no_grad():
            batch_len = label.size(0)
            correct += (logits.argmax(1) == label).sum().item()
            seen += batch_len
            loss_sum += batch_loss.item() * batch_len
    return loss_sum / seen, correct / seen

In [26]:
def test(dataloader):
    """Evaluate the global model on ``dataloader`` without updating it.

    Uses the module-level ``model`` and ``loss_fn``.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Tuple ``(loss, accuracy)`` computed the same way as in ``train``:
        size-weighted batch losses divided by the sample count, and the
        fraction of correct argmax predictions.
    """
    model.eval()
    correct = 0
    seen = 0
    loss_sum = 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            logits = model(text, offsets)
            batch_loss = loss_fn(logits, label)
            batch_len = label.size(0)
            correct += (logits.argmax(1) == label).sum().item()
            seen += batch_len
            loss_sum += batch_loss.item() * batch_len
    return loss_sum / seen, correct / seen

In [27]:
def fit(epochs, train_dl, test_dl):
    """Train and evaluate for ``epochs`` epochs, printing one line per epoch.

    Each epoch calls ``train(train_dl)`` and ``test(test_dl)``, records their
    results, and then steps the module-level ``exp_lr_scheduler`` once.

    Args:
        epochs: number of epochs to run.
        train_dl: dataloader passed to ``train``.
        test_dl: dataloader passed to ``test``.

    Returns:
        Tuple of four lists with one entry per epoch, in this order:
        ``(train_loss, test_loss, train_acc, test_acc)``.
    """
    train_losses, train_accs = [], []
    test_losses, test_accs = [], []
    line_format = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ,"
                   "test_loss: {:.5f}, test_acc: {:.1f}%")

    for epoch in range(epochs):
        tr_loss, tr_acc = train(train_dl)
        te_loss, te_acc = test(test_dl)

        train_losses.append(tr_loss)
        train_accs.append(tr_acc)
        test_losses.append(te_loss)
        test_accs.append(te_acc)

        # Advance the learning-rate schedule once per epoch.
        exp_lr_scheduler.step()

        # Accuracies are reported as percentages.
        print(line_format.format(epoch, tr_loss, tr_acc * 100, te_loss, te_acc * 100))
    print("Done!")

    return train_losses, test_losses, train_accs, test_accs

In [28]:
# Number of epochs passed to fit(); each epoch is one train() plus one test() call.
EPOCHS = 30

In [29]:
# Run the full training loop; fit() returns per-epoch histories in the order
# (train_loss, test_loss, train_acc, test_acc).
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)

epoch: 0, train_loss: 0.67152, train_acc: 59.5% ,test_loss: 0.65235, test_acc: 61.0%
epoch: 1, train_loss: 0.61600, train_acc: 68.2% ,test_loss: 0.58940, test_acc: 70.8%
epoch: 2, train_loss: 0.55567, train_acc: 73.5% ,test_loss: 0.53286, test_acc: 75.0%
epoch: 3, train_loss: 0.50354, train_acc: 77.1% ,test_loss: 0.48855, test_acc: 77.9%
epoch: 4, train_loss: 0.46238, train_acc: 79.7% ,test_loss: 0.45579, test_acc: 79.7%
epoch: 5, train_loss: 0.43155, train_acc: 81.4% ,test_loss: 0.42576, test_acc: 81.5%
epoch: 6, train_loss: 0.40552, train_acc: 82.8% ,test_loss: 0.40717, test_acc: 82.4%
epoch: 7, train_loss: 0.38623, train_acc: 83.9% ,test_loss: 0.39345, test_acc: 83.0%
epoch: 8, train_loss: 0.36868, train_acc: 84.7% ,test_loss: 0.37662, test_acc: 83.9%
epoch: 9, train_loss: 0.35405, train_acc: 85.3% ,test_loss: 0.36386, test_acc: 84.7%
epoch:10, train_loss: 0.34157, train_acc: 85.9% ,test_loss: 0.36500, test_acc: 84.4%
epoch:11, train_loss: 0.32946, train_acc: 86.7% ,test_loss: 0.348