## Sentiment Classfication By Word Averaging
- 情感分类（2分类）示例
- 采用词向量平均
- 数据集-链接: https://pan.baidu.com/s/10iR2LvO_T_vp0eetMa6awQ  密码: tp29

In [1]:
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter

In [2]:
random.seed(2019)
# 使用benchmark以启动CUDNN_FIND自动寻找最快的操作，
# 当计算图不会改变的时候（每次输入形状相同，模型不改变）的情况下可以提高性能，反之则降低性能。
# torch.backends.cudnn.deterministic = True

In [3]:
VOCAB_SIZE = 14_828

EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.01
BEST_VALID_LOSS = float('inf')

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

train_file = "data/senti.train.tsv"
eval_file = "data/senti.dev.tsv"
test_file = "data/senti.test.tsv"

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda:0' if USE_CUDA else 'cpu')
NUM_CUDA = torch.cuda.device_count()

In [4]:
def load_text_file(filename):
    """将样本的特征与标签分开，并将样本特征分词"""
    sentences = []
    label = []
    with open(filename, "r") as f:
        sent_list  = [line.strip().split('\t') for line in f]
    for sample in sent_list:
        sentences.append(sample[0].lower().split(" "))
        label.append(int(sample[-1]))
    return sentences, label


def build_word_dic(sentences_list, vocab_size=20_000):
    """构建words_set, word2idx, idx2word"""
    words_list = [w for line in sentences_list for w in line]
    counter = Counter(words_list)
    words_topn = counter.most_common(vocab_size)
    words_set = [item[0] for item in words_topn]
    words_set = ['<pad>', "<unk>"] + words_set
    word2idx = {w:i for i, w in enumerate(words_set)}
    idx2word = {i:w for i, w in enumerate(words_set)}
    return words_topn, word2idx, idx2word


def build_x_y(word2idx, sentences_list, label_list, sent_len=30):
    """构建输入模型的数据，对每个单词编码，每个句子通过添加pading保持一样长"""
    x = []
    y = []
    for sent, label in zip(sentences_list, label_list):
        word_x = [0]*sent_len
        if len(sent) > sent_len:
            sent = sent[:sent_len]
        for i, w in enumerate(sent):
            if w in word2idx:
                word_x[i] = word2idx[w]
            else:
                word_x[i] = word2idx['<unk>']
        x.append(word_x)
        y.append(label)
    return x, y

# 构造批次数据
def build_batch_data(data, label, batch_size=32):
    """构建 batch tensor，返回 batch 列表，每个batch为二元组包含data和label"""
    batch_data = []
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.float)
    n, dim = data_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
            dbatch = data_tensor[start: ]
            lbatch = label_tensor[start: ]
            print("最后一个batch size:", dbatch.size())
        else:
            dbatch = data_tensor[start: end]
            lbatch = label_tensor[start: end]
        batch_data.append((dbatch, lbatch))
    return batch_data

In [5]:
train_sentences, train_label = load_text_file(train_file)
eval_sentences, eval_label = load_text_file(eval_file)
test_sentences, test_label = load_text_file(test_file)

In [6]:
print("处理后的样本与标签：", train_sentences[0], train_label[0])
print("各个数据集样本数量：")
print(len(train_sentences), len(train_label))
print(len(eval_sentences), len(eval_label))
print(len(test_sentences), len(test_label))

print("各数据集最长最短句子单词数：")
print(max([len(s) for s in train_sentences]), min([len(s) for s in train_sentences]))
print(max([len(s) for s in eval_sentences]), min([len(s) for s in eval_sentences]))
print(max([len(s) for s in test_sentences]), min([len(s) for s in test_sentences]))

处理后的样本与标签： ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0
各个数据集样本数量：
67349 67349
872 872
1821 1821
各数据集最长最短句子单词数：
52 1
47 2
56 2


In [7]:
max_seq_len = 56
words_set, word2idx, idx2word = build_word_dic(train_sentences, vocab_size=VOCAB_SIZE)
train_x, train_y = build_x_y(word2idx, train_sentences, train_label,sent_len=max_seq_len)
eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label,sent_len=max_seq_len)
test_x, test_y = build_x_y(word2idx, test_sentences, test_label,sent_len=max_seq_len)

In [8]:
print("词典长度:", len(words_set), len(word2idx), len(idx2word))
print("训练集样本数量:", len(train_x), len(train_y))

词典长度: 14828 14830 14830
训练集样本数量: 67349 67349


In [9]:
train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)
eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)
test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)

In [10]:
train_data[0][0].size(), train_data[0][1].size()

(torch.Size([32, 56]), torch.Size([32]))

In [11]:
train_data[0][1]

tensor([0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
        0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1.])

In [12]:
# Word Averaging Model
class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, data):
        # print("data", data.size())
        embedded = self.embedding(data) # [sent len, batch size, emb dim]
        # print("embdded", embedded.size())
        # embedded = embedded.permute(1, 0, 2) # [batch size, sent len, emb dim]
        # print("embdded2:", embedded.size())
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) # [batch size, embedding_dim]
        # print("poold:", pooled.size())
        score = self.fc(pooled)
        return score
    
def binary_accuracy(preds, y):
    """
    计算准确率
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  
    acc = correct.sum() / len(correct)
    return acc

def train(model, device, iterator, optimizer, criterion):
    """训练函数"""
    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for x, y in iterator:
        x, y = x.to(device), y.to(device) # torch.int64
        optimizer.zero_grad()
        predictions = model(x).squeeze(1)  # torch.float32 
        
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, device, iterator, criterion):
    """验证函数"""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for x, y in iterator:
            x, y = x.to(device), y.to(device)
            predictions = model(x).squeeze(1)
            loss = criterion(predictions, y)
            acc = binary_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def count_parameters(model):
    """统计模型的参数量"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def epoch_time(start_time, end_time):
    """计算时间差，单位秒"""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [13]:
INPUT_DIM = len(words_set) + 2
PAD_IDX = word2idx['<pad>']

model = WordAVGModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)
print(f'模型有{count_parameters(model):,}个可调节参数, 大约{count_parameters(model)*4/1024/1024} M.')

model = model.to(DEVICE)

# 使用多块GPU
# if NUM_CUDA > 1:
#     device_ids = list(range(NUM_CUDA))
#     print(device_ids)
#     model = nn.DataParallel(model, device_ids=device_ids)
optimizer = optim.Adam(model.parameters(),lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

模型有1,483,101个可调节参数, 大约5.657581329345703 M.


## 训练模型

In [14]:
model_name = 'wordavg-model.pth'
for epoch in range(1, EPOCHS+1):
    start_time = time.time()
    train_loss, train_acc = train(model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, DEVICE, eval_data, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print(f'***Save Best Model {model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


***Save Best Model wordavg-model.pth***
Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 0.370 | Train Acc: 83.19%
	 Val. Loss: 0.562 |  Val. Acc: 81.13%
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 0.226 | Train Acc: 91.20%
	 Val. Loss: 0.681 |  Val. Acc: 82.06%
Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 0.191 | Train Acc: 92.70%
	 Val. Loss: 0.788 |  Val. Acc: 82.41%
Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 0.171 | Train Acc: 93.52%
	 Val. Loss: 0.888 |  Val. Acc: 81.48%
Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 0.157 | Train Acc: 94.11%
	 Val. Loss: 0.993 |  Val. Acc: 80.67%


## 测试集上的表现

In [15]:
model = torch.load(model_name)
test_loss, test_acc = evaluate(model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))

Test Loss: 0.5002752636958446 | Test Acc: 0.8097098214285714 |


## 计算词向量L2 Norm

In [16]:
embed = model.embedding.weight.data
print("Embed size:", embed.size())
word_l2norm = torch.norm(embed,  dim=1)
embed_l2norm, embed_l2normnorm_idx = word_l2norm.sort()

Embed size: torch.Size([14830, 100])


In [17]:
print('L2 norm 最小的 15 个单词：')
for i,s in zip(embed_l2normnorm_idx[:15].tolist(), embed_l2norm[:15].tolist()):
    print(idx2word[i], s)

L2 norm 最小的 15 个单词：
<pad> 0.11879125237464905
times 7.761016845703125
expeditious 7.798865795135498
nights 7.943765163421631
fallible 7.999438285827637
cheering 8.00982666015625
freak-outs 8.024674415588379
ol' 8.026588439941406
steeped 8.085253715515137
prophet 8.100972175598145
ennui 8.163886070251465
besides 8.185270309448242
independent 8.185609817504883
showing 8.20890998840332
loquacious 8.211852073669434


In [18]:
print('L2 norm 最大的 15 个单词：')
for i,s in zip(embed_l2normnorm_idx[-15:].tolist(), embed_l2norm[-15:].tolist()):
    print(idx2word[i],s)

L2 norm 最大的 15 个单词：
annoying 22.8407039642334
wonderfully 23.056434631347656
touching 23.10053062438965
pointless 23.324430465698242
pretentious 23.505430221557617
terrific 23.599756240844727
devoid 23.609289169311523
lousy 24.04730987548828
hilarious 24.437118530273438
failure 24.47695541381836
stupid 24.875703811645508
worst 25.027503967285156
lacking 25.51972007751465
remarkable 25.560626983642578
mess 26.684070587158203
