## Sentiment Classfication By Word Averaging

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
import random

In [2]:
random.seed(2019)
# 使用benchmark以启动CUDNN_FIND自动寻找最快的操作，
# 当计算图不会改变的时候（每次输入形状相同，模型不改变）的情况下可以提高性能，反之则降低性能。
torch.backends.cudnn.deterministic = True

In [3]:
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.01
BEST_VALID_LOSS = float('inf')

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

train_file = "data/senti.train.tsv"
eval_file = "data/senti.dev.tsv"
test_file = "data/senti.test.tsv"

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')
NUM_CUDA = torch.cuda.device_count()

In [4]:
def load_text_file(filename):
    sentences = []
    label = []
    with open(filename, "r") as f:
        sent_list  = [line.strip().split('\t') for line in f]
    for sample in sent_list:
        sentences.append(sample[0].split(" "))
        label.append(int(sample[-1]))
    return sentences, label

In [5]:
train_sentences, train_label = load_text_file(train_file)
eval_sentences, eval_label = load_text_file(eval_file)
test_sentences, test_label = load_text_file(test_file)

In [6]:
(train_sentences[0], train_label[0])

(['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'], 0)

In [7]:
print(len(train_sentences), len(train_label))
print(len(eval_sentences), len(eval_label))
print(len(test_sentences), len(test_label))

print(max([len(s) for s in train_sentences]), min([len(s) for s in train_sentences]))
print(max([len(s) for s in eval_sentences]), min([len(s) for s in eval_sentences]))
print(max([len(s) for s in test_sentences]), min([len(s) for s in test_sentences]))

67349 67349
872 872
1821 1821
52 1
47 2
56 2


In [8]:
def build_word_dic(sentences_list):
    """构建words_set, word2idx, idx2wor"""
    words_list = [w for line in sentences_list for w in line]
    counter = Counter(words_list)
    words_set = list(counter)
    # print(words_set[:10])
    words_set = [ '<pad>', "<unk>"] + words_set
    # print(words_set[0], words_set[1])
    word2idx = {w:i for i, w in enumerate(words_set)}
    idx2word = {i:w for i, w in enumerate(words_set)}
    return words_set, word2idx, idx2word

In [9]:
words_set, word2idx, idx2word = build_word_dic(train_sentences)

In [10]:
(len(words_set), len(word2idx), len(idx2word))

(16284, 16284, 16284)

In [11]:
i = 0
for k,v in idx2word.items():
    i += 1
    if i < 20:
        print(k, v)

0 <pad>
1 <unk>
2 hide
3 new
4 secretions
5 from
6 the
7 parental
8 units
9 contains
10 no
11 wit
12 ,
13 only
14 labored
15 gags
16 that
17 loves
18 its


In [12]:
def build_x_y(word2idx, sentences_list, label_list, sent_len=30):
    x = []
    y = []
    for sent, label in zip(sentences_list, label_list):
        word_x = [0]*sent_len
        if len(sent) > sent_len:
            sent = sent[:sent_len]
        for i, w in enumerate(sent):
            if w in word2idx:
                word_x[i] = word2idx[w]
            else:
                word_x[i] = word2idx['<unk>']
        x.append(word_x)
        y.append(label)
    return x, y

In [13]:
train_x, train_y = build_x_y(word2idx, train_sentences, train_label,sent_len=40)
eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label,sent_len=40)
test_x, test_y = build_x_y(word2idx, test_sentences, test_label,sent_len=40)

In [14]:
(len(train_x), len(train_y))

(67349, 67349)

In [15]:
# 构造批次数据
def build_batch_data(data, label, batch_size=32):
    """构建 batch tensor，返回 batch 列表，每个batch为二元组包含data和label"""
    batch_data = []
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.float)
    n, dim = data_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
            dbatch = data_tensor[start: ]
            lbatch = label_tensor[start: ]
            print("最后一个batch size:", dbatch.size())
        else:
            dbatch = data_tensor[start: end]
            lbatch = label_tensor[start: end]
        batch_data.append((dbatch, lbatch))
    return batch_data

In [16]:
train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)
eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)
test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)

In [17]:
# Word Averaging Model
class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, data):
        # print("data", data.size())
        embedded = self.embedding(data) # [sent len, batch size, emb dim]
        # print("embdded", embedded.size())
        # embedded = embedded.permute(1, 0, 2) # [batch size, sent len, emb dim]
        # print("embdded2:", embedded.size())
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) # [batch size, embedding_dim]
        # print("poold:", pooled.size())
        score = self.fc(pooled)
        return score

In [18]:
INPUT_DIM = len(words_set)
PAD_IDX = word2idx['<pad>']
model = WordAVGModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,628,501 trainable parameters


In [20]:
model = model.to(DEVICE)

# 使用多块GPU
if NUM_CUDA > 1:
    device_ids = list(range(NUM_CUDA))
    print(device_ids)
optimizer = optim.Adam(model.parameters(),lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

[0, 1, 2, 3]


In [21]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

def train(model, device, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for x, y in iterator:
        x, y = x.to(device), y.to(device) # torch.int64
        optimizer.zero_grad()
        predictions = model(x).squeeze(1)  # torch.float32 
        
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, device, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for x, y in iterator:
            x, y = x.to(device), y.to(device)
            predictions = model(x).squeeze(1)
            loss = criterion(predictions, y)
            acc = binary_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [23]:
model_name = 'wordavg-model.pth'
for epoch in range(1, EPOCHS+1):

    start_time = time.time()
    
    train_loss, train_acc = train(model, DEVICE, train_data, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, DEVICE, eval_data, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < BEST_VALID_LOSS:
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print(f'***Save Best Model {model_name}***')
    
    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  "type " + obj.__name__ + ". It won't be checked "


***Save Best Model wordavg-model.pth***
Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 0.355 | Train Acc: 84.57%
	 Val. Loss: 0.585 |  Val. Acc: 81.02%
Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: 0.214 | Train Acc: 91.84%
	 Val. Loss: 0.723 |  Val. Acc: 80.79%
Epoch: 03 | Epoch Time: 0m 7s
	Train Loss: 0.177 | Train Acc: 93.32%
	 Val. Loss: 0.854 |  Val. Acc: 79.98%
Epoch: 04 | Epoch Time: 0m 6s
	Train Loss: 0.156 | Train Acc: 94.25%
	 Val. Loss: 0.976 |  Val. Acc: 79.63%
Epoch: 05 | Epoch Time: 0m 8s
	Train Loss: 0.140 | Train Acc: 94.84%
	 Val. Loss: 1.089 |  Val. Acc: 79.51%


In [24]:
model = torch.load(model_name)
test_loss, test_acc = evaluate(model, DEVICE, test_data, criterion)
print('Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))

Test Loss: 0.5327801055141858 | Test Acc: 0.8063616071428571 |


In [25]:
embed = model.embedding.weight.data
print("Embed size:", embed.size())
word_l2norm = torch.norm(embed,  dim=1)
embed_l2norm, embed_l2normnorm_idx = word_l2norm.sort()

Embed size: torch.Size([16284, 100])


In [26]:
print('L2 norm 最小的 15 个单词：')
for i in embed_l2normnorm_idx[:15].tolist():
    print(idx2word[i])
print(embed_l2norm[:15])

L2 norm 最小的 15 个单词：
<pad>
martial
Weeks
fetishes
core
Earnhart
released
Rabbit-Proof
Confusion
embarking
aiming
tackling
Am
nervous
Sade
tensor([0.0000, 7.3888, 7.7593, 7.7802, 7.8499, 7.8826, 7.9168, 7.9480, 7.9485,
        7.9650, 7.9883, 8.0346, 8.0370, 8.0452, 8.1170], device='cuda:0')


In [27]:
print('L2 norm 最大的 15 个单词：')
for i in embed_l2normnorm_idx[-15:].tolist():
    print(idx2word[i])
print(embed_l2norm[-15:])

L2 norm 最大的 15 个单词：
depressing
wonderfully
devoid
pretentious
annoying
lousy
failure
unnecessary
pointless
worst
stupid
lacking
remarkable
mess
wonderful
tensor([20.5986, 20.6516, 20.7279, 21.0944, 21.1322, 21.2453, 21.2597, 21.4071,
        21.7164, 22.6244, 23.3707, 23.4475, 23.6621, 23.8790, 25.6098],
       device='cuda:0')
