## 文本分类

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter

In [44]:
# Request deterministic cuDNN algorithms.
# NOTE(review): the earlier note on this cell described `cudnn.benchmark` (auto-tuning
# that searches for the fastest kernels, which helps when the input shapes and the model
# never change and hurts otherwise). The flag actually set here is `deterministic`,
# not `benchmark`. No random seed is set in the visible cells — TODO confirm whether
# one is needed for reproducible runs.
torch.backends.cudnn.deterministic = True

In [45]:
# Tab-separated data splits, relative to the working directory. Each line is read
# by load_text_file as "<sentence><TAB><label>".
train_file = "data/senti.train.tsv"
eval_file = "data/senti.dev.tsv"
test_file = "data/senti.test.tsv"

In [46]:
def load_text_file(filename):
    """Read a tab-separated sentiment file into tokenized sentences and labels.

    Every non-blank line is expected to look like ``<sentence><TAB><label>``.
    The first field is split on single spaces into tokens and the last field
    is parsed with ``int``.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        A ``(sentences, label)`` tuple of parallel lists: ``sentences`` holds
        one list of string tokens per sample and ``label`` the matching
        integer labels.

    Raises:
        ValueError: If the last field of a non-blank line is not an integer.
    """
    sentences = []
    label = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale default.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. an extra newline at the end of the file)
                # carry no sample; previously they crashed on int('').
                continue
            fields = line.split('\t')
            sentences.append(fields[0].split(" "))
            label.append(int(fields[-1]))
    return sentences, label

In [47]:
# Parse each split into two parallel lists: tokenized sentences and integer labels.
train_sentences, train_label = load_text_file(train_file)
eval_sentences, eval_label = load_text_file(eval_file)
test_sentences, test_label = load_text_file(test_file)

In [48]:
# Peek at the first training sample as a (tokens, label) pair.
(train_sentences[0], train_label[0])

(['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'], 0)

In [49]:
# Sanity checks for every split: first the sentence/label counts (they must match),
# then the longest and shortest sentence length measured in tokens.
splits = (
    (train_sentences, train_label),
    (eval_sentences, eval_label),
    (test_sentences, test_label),
)

for split_sentences, split_labels in splits:
    print(len(split_sentences), len(split_labels))

for split_sentences, split_labels in splits:
    lengths = [len(tokens) for tokens in split_sentences]
    print(max(lengths), min(lengths))

67349 67349
872 872
1821 1821
52 1
47 2
56 2


In [50]:
def build_vocab(sentences_list):
    """Count case-insensitive word frequencies over tokenized sentences.

    Args:
        sentences_list: Iterable of sentences, each an iterable of string tokens.

    Returns:
        dict mapping every lower-cased token to its number of occurrences,
        in order of first appearance.
    """
    counts = {}
    for tokens in sentences_list:
        for token in tokens:
            key = token.lower()
            # A missing key counts as zero occurrences so far.
            counts[key] = counts.get(key, 0) + 1
    return counts

In [51]:
# Case-insensitive word counts for the training split.
# NOTE(review): apart from the size check in the next cell, words_dict is not
# referenced again in the visible cells.
words_dict = build_vocab(train_sentences)

In [52]:
# Number of distinct lower-cased tokens in the training split.
len(words_dict)

14828

In [53]:
# Flatten the training sentences into one token stream. Tokens keep their original
# case here, unlike build_vocab, which lower-cases them.
words_list = [w for line in train_sentences for w in line]
words_counter = Counter(words_list)
# Keep at most the TopN most frequent tokens as the vocabulary.
TopN = 15000
wds_dict = dict(words_counter.most_common(TopN))
# Word ids start at 2; ids 0 and 1 are reserved for the special tokens set just below.
word2idx = {w:i for i,w  in enumerate(wds_dict.keys(), 2)}

# <pad> fills unused positions of a fixed-length row; <unk> stands for unknown tokens.
word2idx['<pad>'] = 0
word2idx['<unk>'] = 1
print(len(word2idx))
# Reverse lookup (id -> token), used later to print words by embedding row index.
idx2word = {i:w for w,i in word2idx.items()} 
print(len(idx2word))

15002
15002


In [54]:
# Preview the first 19 (id, token) entries of the reverse vocabulary, in insertion order.
for k, v in list(idx2word.items())[:19]:
    print(k, v)

2 ,
3 the
4 and
5 a
6 of
7 .
8 to
9 's
10 is
11 that
12 in
13 it
14 as
15 with
16 film
17 its
18 for
19 an
20 movie


In [55]:
def build_x_y(word2idx, sentences_list, label_list, sent_len=30):
    """Encode tokenized sentences as fixed-length id rows paired with labels.

    Each sentence is cut to at most ``sent_len`` tokens; shorter sentences are
    right-padded with id 0. A token missing from ``word2idx`` is replaced by
    ``word2idx['<unk>']``.

    Args:
        word2idx: Mapping from token to integer id; must contain ``'<unk>'``
            whenever an unknown token is encountered.
        sentences_list: Sentences, each a list of string tokens.
        label_list: Labels, parallel to ``sentences_list``.
        sent_len: Length of every encoded row.

    Returns:
        ``(x, y)`` where ``x`` is a list of id rows (each of length
        ``sent_len``) and ``y`` is the list of matching labels.
    """
    def encode(tokens):
        # Truncate first, look every token up, then pad on the right with 0.
        ids = [
            word2idx[tok] if tok in word2idx else word2idx['<unk>']
            for tok in tokens[:sent_len]
        ]
        return ids + [0] * (sent_len - len(ids))

    x, y = [], []
    for tokens, target in zip(sentences_list, label_list):
        x.append(encode(tokens))
        y.append(target)
    return x, y

In [56]:
# Encode every split as fixed-length rows of 40 ids. build_x_y truncates any sentence
# longer than sent_len and right-pads shorter ones with 0.
train_x, train_y = build_x_y(word2idx, train_sentences, train_label,sent_len=40)
eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label,sent_len=40)
test_x, test_y = build_x_y(word2idx, test_sentences, test_label,sent_len=40)

In [57]:
# The encoded rows and the labels must have the same length.
(len(train_x), len(train_y))

(67349, 67349)

In [58]:
# Build batched data
def build_batch_data(data, label, batch_size=32, drop_last=True):
    """Split encoded samples into a list of ``(data, label)`` tensor batches.

    Args:
        data: Sequence of equal-length integer rows, one row per sample.
        label: Sequence of integer labels, parallel to ``data``.
        batch_size: Number of samples per batch.
        drop_last: When True (the default, which matches the previous
            behavior) a trailing batch holding fewer than ``batch_size``
            samples is discarded. When False it is kept as a final, smaller
            batch so that every sample is covered.

    Returns:
        list of ``(data_batch, label_batch)`` tuples of ``torch.long``
        tensors, in input order. Empty input yields an empty list.
    """
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.long)
    # Only the sample count is needed. size(0) also works for empty input, where
    # the tensor is 1-D and could not be unpacked into (n, dim).
    n = data_tensor.size(0)
    batch_data = []
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        if drop_last and end - start < batch_size:
            # Only the final chunk can be short, so nothing follows it.
            break
        batch_data.append((data_tensor[start:end], label_tensor[start:end]))
    return batch_data

In [59]:
# Pre-batch every split once; the resulting lists are iterated as-is by train/evaluate.
# NOTE(review): build_batch_data stops at the first incomplete batch, so up to
# BATCH_SIZE - 1 trailing samples of each split (eval and test included) are left
# out of the batches.
BATCH_SIZE = 64
train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)
eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)
test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)

In [60]:
## WordAvgModel

class WordAVGModel(nn.Module):
    """Word-averaging classifier: embed tokens, average them, apply one linear layer."""

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        """Create the embedding table and the output layer.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each word vector.
            output_dim: Number of scores produced per sentence.
            pad_idx: Id passed to ``nn.Embedding`` as ``padding_idx``.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, data):
        """Score a batch of id rows.

        Args:
            data: Integer id tensor. The pooling below averages over
                dimension 1, so the expected layout is [batch size, sent len].

        Returns:
            Tensor of raw scores with shape [batch size, output_dim].
        """
        token_vectors = self.embedding(data)  # [batch size, sent len, emb dim]
        sent_len = token_vectors.shape[1]
        # A (sent_len, 1) window averages over the whole sentence axis while leaving
        # the embedding axis untouched. Every position counts in the denominator,
        # padded ones included.
        averaged = F.avg_pool2d(token_vectors, (sent_len, 1))  # [batch size, 1, emb dim]
        sentence_vector = averaged.squeeze(1)  # [batch size, emb dim]
        return self.fc(sentence_vector)

In [61]:
# Vocabulary size, including the <pad> and <unk> entries added to word2idx.
INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 100
# One raw score (logit) per sentence for the binary label.
OUTPUT_DIM = 1
# Same id that word2idx assigns to '<pad>'.
PAD_IDX = 0
LEARNING_RATE = 0.01

model = WordAVGModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [62]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,500,301 trainable parameters


In [63]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Move the model to its device before building the optimizer, as the torch.optim
# documentation advises, so the optimizer is constructed over the parameters in
# the place where they will actually be trained.
model = model.to(device)
optimizer = optim.Adam(model.parameters(),lr=LEARNING_RATE)
# BCEWithLogitsLoss takes raw logits, so the model itself applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()

cuda


In [64]:
def binary_accuracy(preds, y):
    """
    Fraction of correct binary predictions in one batch.

    For example 8 correct out of 10 gives 0.8, not 8. ``preds`` holds raw
    logits and ``y`` the 0/1 targets; the result is a scalar tensor.
    """
    # Squash the logits to probabilities, then round to the nearest class (0. or 1.).
    predicted_labels = torch.sigmoid(preds).round()
    # Cast the matches to float so the division yields a fraction.
    hits = (predicted_labels == y).float()
    return hits.sum() / len(hits)

def train(model, device, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update the model in place.

    Args:
        model: Module to train; switched to training mode.
        device: Device the batches are moved to.
        iterator: Sized collection of ``(x, y)`` tensor batches.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(predictions, targets)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    model.train()
    loss_sum = 0
    acc_sum = 0

    for inputs, targets in iterator:
        inputs = inputs.to(device)
        # Labels are cast to float32 to match the float predictions in the loss.
        targets = targets.to(device, dtype=torch.float32)

        optimizer.zero_grad()
        # Drop dimension 1 of the output; assumes the model emits one score per sample.
        logits = model(inputs).squeeze(1)

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

def evaluate(model, device, iterator, criterion):
    """Measure loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        device: Device the batches are moved to.
        iterator: Sized collection of ``(x, y)`` tensor batches.
        criterion: Loss called as ``criterion(predictions, targets)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    model.eval()
    loss_sum = 0
    acc_sum = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, targets in iterator:
            inputs = inputs.to(device)
            targets = targets.to(device, dtype=torch.float32)
            # Drop dimension 1 of the output; assumes one score per sample.
            logits = model(inputs).squeeze(1)
            loss_sum += criterion(logits, targets).item()
            acc_sum += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [65]:
import time

def epoch_time(start_time, end_time):
    """Split the time elapsed between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to whole seconds.
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [66]:
N_EPOCHS = 10

# Lowest dev loss seen so far; decides when the checkpoint is overwritten.
best_valid_loss = float('inf')

# Each epoch: one pass over the pre-built training batches, then one dev evaluation.
# NOTE(review): train_data is a fixed list of batches, so the batch order is the same
# in every epoch (no shuffling).
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, device, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, device, eval_data, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the dev loss improves. torch.save(model, ...) pickles the
    # whole module object, not just its state_dict.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model, 'wordavg-model.pth')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "


Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.368 | Train Acc: 83.60%
	 Val. Loss: 0.534 |  Val. Acc: 81.49%
Epoch: 02 | Epoch Time: 0m 4s
	Train Loss: 0.217 | Train Acc: 91.75%
	 Val. Loss: 0.661 |  Val. Acc: 81.85%
Epoch: 03 | Epoch Time: 0m 4s
	Train Loss: 0.181 | Train Acc: 93.12%
	 Val. Loss: 0.771 |  Val. Acc: 80.77%
Epoch: 04 | Epoch Time: 0m 4s
	Train Loss: 0.163 | Train Acc: 93.93%
	 Val. Loss: 0.864 |  Val. Acc: 80.53%
Epoch: 05 | Epoch Time: 0m 4s
	Train Loss: 0.151 | Train Acc: 94.32%
	 Val. Loss: 0.938 |  Val. Acc: 80.77%
Epoch: 06 | Epoch Time: 0m 4s
	Train Loss: 0.144 | Train Acc: 94.58%
	 Val. Loss: 1.006 |  Val. Acc: 80.53%
Epoch: 07 | Epoch Time: 0m 4s
	Train Loss: 0.137 | Train Acc: 94.79%
	 Val. Loss: 1.072 |  Val. Acc: 80.77%
Epoch: 08 | Epoch Time: 0m 3s
	Train Loss: 0.132 | Train Acc: 94.98%
	 Val. Loss: 1.140 |  Val. Acc: 80.65%
Epoch: 09 | Epoch Time: 0m 4s
	Train Loss: 0.127 | Train Acc: 95.14%
	 Val. Loss: 1.213 |  Val. Acc: 80.29%
Epoch: 10 | Epoch Time: 0m 4

In [67]:
model_name = 'wordavg-model.pth'
# The checkpoint is a whole pickled module (written with torch.save(model, ...)), so
# torch.load returns the model object itself. map_location remaps its tensors onto the
# active device, so a checkpoint written on a GPU machine still loads on a CPU-only one.
# NOTE(review): unpickling can execute arbitrary code; only load checkpoints you created.
model = torch.load(model_name, map_location=device)
test_loss, test_acc = evaluate(model, device, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))

Test Loss: 0.48911639622279574 | Test Acc: 0.8225446428571429 |


In [68]:
# Inspect the learned embedding matrix, one row per vocabulary id (row 0 is PAD_IDX).
model.state_dict()['embedding.weight']

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0043,  1.5189, -0.1079,  ..., -0.6966, -0.4025,  0.5670],
        [ 0.4338,  0.1577, -0.1645,  ..., -0.4522,  0.1431, -1.5758],
        ...,
        [-1.1635, -2.2269,  0.1378,  ...,  0.5291,  0.7003,  2.4660],
        [ 0.8394, -0.8261, -1.5766,  ..., -1.5842, -1.6100,  0.0027],
        [-1.6016, -0.3822, -1.6694,  ..., -0.1092, -0.1342,  1.1631]],
       device='cuda:0')

In [69]:
# Compute one norm per embedding row (dim=1 reduces over the embedding axis).
# NOTE(review): p=None presumably falls back to torch.norm's default 2-norm for a
# dim-wise reduction — confirm against the installed PyTorch version.
embed = model.embedding.weight.data
embed_norm = torch.norm(embed, p=None, dim=1)
# sort() returns the sorted values and the original row indices (= word ids); the
# cells below treat the front of the result as the smallest norms.
sort_embed_norm, sort_embed_norm_idx = embed_norm.sort()

In [74]:
# The 15 words whose embedding rows have the smallest norm
print('norm 最小的 15 个单词：')
for idx in sort_embed_norm_idx[:15].tolist():
    print(idx2word[idx])

norm 最小的 15 个单词：
<pad>
came
morality
mind
Dummies
Wrote
Eisenstein
rewarded
Catcher
Errol
Solondz
unreligious
Rinzler
improvisation
hybrid


In [75]:
# The 15 words whose embedding rows have the largest norm
print('norm 最大的 15 个单词：')
for idx in sort_embed_norm_idx[-15:].tolist():
    print(idx2word[idx])

norm 最大的 15 个单词：
worse
wonderfully
refreshing
lacks
pointless
devoid
failure
lousy
terrific
worst
stupid
mess
wonderful
lacking
remarkable
