# Task2

In [1]:
import numpy as np # linear algebra
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import collections
import random
import time
from tqdm import tqdm

import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchtext.vocab as Vocab
import torch.utils.data as Data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1 导入数据

In [2]:
train = pd.read_csv('./train.tsv.zip', sep="\t")
test = pd.read_csv('./test.tsv.zip', sep="\t")
print(train.shape,test.shape)

(156060, 4) (66292, 3)


In [3]:
y_num = train['Sentiment'].max() + 1
y_num

5

## 2.1 复现2-gram

在task1中我们已经看到N-gram模型因为采集了一定的语序信息要优于词袋模型，因此这里采用pytorch复现2-gram。

In [4]:
# 导入停用词库
file = open("stopwords.txt", "r")
stopwords = []
try:
    while True:
        text_line = file.readline()
        if text_line:
            stopwords.append(text_line.strip())
        else:
            break
finally:
    file.close()

In [5]:
'd' in stopwords

True

In [6]:
words_dic = {} # 词库
dic_length = 0
# 训练集
for i in train.index:
    words = train['Phrase'][i].split(' ')
    words_ = ""
    for word in words:
        if word not in stopwords:
            words_ =  words_ + " " + word
    words = words_.strip().split()
    words_length = len(words)
    for j in range(words_length-1):
        word = words[j] + " " + words[j+1]
        if word not in words_dic.keys():
            words_dic[word] = dic_length
            dic_length += 1

In [7]:
dic_length

63405

In [8]:
words_dic

{'A series': 0,
 'series escapades': 1,
 'escapades demonstrating': 2,
 'demonstrating adage': 3,
 'adage goose': 4,
 'goose gander': 5,
 'gander ,': 6,
 ', occasionally': 7,
 'occasionally amuses': 8,
 'amuses amounts': 9,
 'amounts story': 10,
 'story .': 11,
 'This quiet': 12,
 'quiet ,': 13,
 ', introspective': 14,
 'introspective entertaining': 15,
 'entertaining independent': 16,
 'independent worth': 17,
 'worth seeking': 18,
 'seeking .': 19,
 'Even fans': 20,
 'fans Ismail': 21,
 'Ismail Merchant': 22,
 'Merchant ,': 23,
 ', I': 24,
 'I suspect': 25,
 'suspect ,': 26,
 ', hard': 27,
 'hard time': 28,
 'time sitting': 29,
 'sitting .': 30,
 'A positively': 31,
 'positively thrilling': 32,
 'thrilling combination': 33,
 'combination ethnography': 34,
 'ethnography intrigue': 35,
 'intrigue ,': 36,
 ', betrayal': 37,
 'betrayal ,': 38,
 ', deceit': 39,
 'deceit murder': 40,
 'murder Shakespearean': 41,
 'Shakespearean tragedy': 42,
 'tragedy juicy': 43,
 'juicy soap': 44,
 'soap 

In [9]:
torch.manual_seed(1)
word_to_ix = words_dic
EMBEDDING_DIM = 100 # 词向量的维度
embeds = nn.Embedding(dic_length, EMBEDDING_DIM)

In [10]:
import torch.autograd as autograd
embeds(autograd.Variable(torch.LongTensor([word_to_ix["A series"]])))

tensor([[-1.5256, -0.7502, -0.6540, -1.6095, -0.1002, -0.6092, -0.9798, -1.6091,
         -0.7121,  0.3037, -0.7773, -0.2515, -0.2223,  1.6871,  0.2284,  0.4676,
         -0.6970, -1.1608,  0.6995,  0.1991,  0.8657,  0.2444, -0.6629,  0.8073,
          1.1017, -0.1759, -2.2456, -1.4465,  0.0612, -0.6177, -0.7981, -0.1316,
          1.8793, -0.0721,  0.1578, -0.7735,  0.1991,  0.0457,  0.1530, -0.4757,
         -0.1110,  0.2927, -0.1578, -0.0288,  2.3571, -1.0373,  1.5748, -0.6298,
         -0.9274,  0.5451,  0.0663, -0.4370,  0.7626,  0.4415,  1.1651,  2.0154,
          0.1374,  0.9386, -0.1860, -0.6446,  1.5392, -0.8696, -3.3312, -0.7479,
         -0.0255, -1.0233, -0.5962, -1.0055, -0.2106, -0.0075,  1.6734,  0.0103,
         -0.7040, -0.1853, -0.9962, -0.8313, -0.4610, -0.5601,  0.3956, -0.9823,
         -0.5065,  0.0998, -0.6540,  0.7317, -1.4344, -0.5008,  0.1716, -0.1600,
          0.2546, -0.5020, -1.0412,  0.7323, -1.0483, -0.4709,  0.2911,  1.9907,
          0.6614,  1.1899,  

In [11]:
# 把y转换成one-hot向量
def to_y(y):
    y_num = 5
    res = np.zeros((y_num, 1))
    res[y] = 1
    return res

In [12]:
y = []
for i in train.index:
    y.append(train['Sentiment'][i])

In [21]:
# 把x转换成one-hot向量
def to_X(words, length):
    res =torch.zeros([1, 100])
    words_length = len(words)
#     print(res)
    for j in range(words_length-1):
        word = words[j] + " " + words[j+1]
        res += embeds(autograd.Variable(torch.LongTensor([word_to_ix[word]])))
    return res

In [22]:
X = []
for i in train.index:
    words = train['Phrase'][i].split(' ')
    words_ = ""
    for word in words:
        if word not in stopwords:
            words_ =  words_ + " " + word
    words = words_.strip().split()
    X.append(to_X(words,EMBEDDING_DIM))

In [23]:
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.3)

In [25]:
# 定义模型
class n_gram(nn.Module):
    def __init__(self, n_dim, tag_size):
        super(n_gram, self).__init__()
        self.fc = nn.Linear(n_dim, tag_size)
        
    def forward(self, x):
        x = self.fc(x)
        x = nn.functional.dropout(x, p=0.2)
        output = nn.functional.log_softmax(x, dim=-1)
        return output
        
model = n_gram(EMBEDDING_DIM, y_num)
print(model)

optimizer = torch.optim.Adam(model.parameters())
loss_func = torch.nn.CrossEntropyLoss()

n_gram(
  (fc): Linear(in_features=100, out_features=5, bias=True)
)


In [59]:
model=model.cuda()

In [64]:
for epoch in range(5):
    print('epoch {}'.format(epoch + 1))
    # training-----------------------------
    train_loss = 0.
    train_acc = 0.
    for (batch_x, batch_y) in list(zip(X_train, y_train)):
        batch_y_ = torch.zeros([1])
        batch_y_[0] = batch_y
        batch_y = batch_y_
        batch_x, batch_y = Variable(batch_x).cuda(), Variable(batch_y).cuda()
        
        out = model(batch_x)
#         print(out.shape, batch_y.shape)
        
        loss = loss_func(out, batch_y.long())
        train_loss += loss.item()
        pred = torch.max(out, 1)[1]
        train_correct = (pred == batch_y).sum()
        train_acc += train_correct.item()
        optimizer.zero_grad()
        loss.backward()
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
        optimizer.step()
    print('Train Loss: {:.6f}, Acc: {:.6f}'.format(train_loss / (len(
        y_train)), train_acc / (len(y_train))))

    # evaluation--------------------------------
    model.eval()
    eval_loss = 0.
    eval_acc = 0.
    for (batch_x, batch_y) in list(zip(X_test, y_test)):
        batch_y_ = torch.zeros([1])
        batch_y_[0] = batch_y
        batch_y = batch_y_
        batch_x, batch_y = Variable(batch_x).cuda(), Variable(batch_y).cuda()
        
        out = model(batch_x)
        
        loss = loss_func(out, batch_y.long())
        eval_loss += loss.item()
        pred = torch.max(out, 1)[1]
        num_correct = (pred == batch_y).sum()
        eval_acc += num_correct.item()
    print('Test Loss: {:.6f}, Acc: {:.6f}'.format(eval_loss / (len(
        y_test)), eval_acc / (len(y_test))))


epoch 1
Train Loss: 1.432325, Acc: 0.455841




Test Loss: 1.421885, Acc: 0.464650
epoch 2
Train Loss: 1.431565, Acc: 0.460400
Test Loss: 1.420933, Acc: 0.450980
epoch 3
Train Loss: 1.434614, Acc: 0.457928
Test Loss: 1.424881, Acc: 0.447264
epoch 4
Train Loss: 1.432096, Acc: 0.457928
Test Loss: 1.414831, Acc: 0.462173
epoch 5
Train Loss: 1.433252, Acc: 0.456930
Test Loss: 1.419857, Acc: 0.462557


## 2.2 双向LSTM网络进行情感分类

In [4]:
X_train, X_test, y_train, y_test =train_test_split(train['Phrase'], train['Sentiment'], test_size=0.3)

In [5]:
train_data = list(zip(X_train, y_train))

In [6]:
test_data = list(zip(X_test, y_test))

In [7]:
train_data[:5]

[('A strangely stirring experience that finds warmth in the coldest environment and makes each crumb of emotional comfort',
  3),
 ('psychedelic', 2),
 ('of the actress-producer and writer', 2),
 ('The Bard as black comedy -- Willie would have loved it', 3),
 ("if Argento 's Hollywood counterparts", 2)]

In [8]:
def get_tokenized(data):
    '''
    @params:
        data: 数据的列表，列表中的每个元素为 [文本字符串，0/1标签] 二元组
    @return: 切分词后的文本的列表，列表中的每个元素为切分后的词序列
    '''
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    
    return [tokenizer(review) for review, _ in data]

In [9]:
def get_vocab(data):
    '''
    @params:
        data: 同上
    @return: 数据集上的词典，Vocab 的实例（freqs, stoi, itos）
    '''
    tokenized_data = get_tokenized(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)

vocab = get_vocab(train_data)
print('# words in vocab:', len(vocab))

# words in vocab: 14141


In [10]:
def preprocess(data, vocab):
    '''
    @params:
        data: 同上，原始的读入数据
        vocab: 训练集上生成的词典
    @return:
        features: 单词下标序列，形状为 (n, max_l) 的整数张量
        labels: 情感标签，形状为 (n,) 的0/1整数张量
    '''
    max_l = 500  # 将每条评论通过截断或者补0，使得长度变成500

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [11]:
train_set = Data.TensorDataset(*preprocess(train_data, vocab))
test_set = Data.TensorDataset(*preprocess(test_data, vocab))

# 上面的代码等价于下面的注释代码
# train_features, train_labels = preprocess(train_data, vocab)
# test_features, test_labels = preprocess(test_data, vocab)
# train_set = Data.TensorDataset(train_features, train_labels)
# test_set = Data.TensorDataset(test_features, test_labels)

# len(train_set) = features.shape[0] or labels.shape[0]
# train_set[index] = (features[index], labels[index])

In [12]:
batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

X torch.Size([64, 500]) y torch.Size([64])
#batches: 1707


In [13]:
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        '''
        @params:
            vocab: 在数据集上创建的词典，用于获取词典大小
            embed_size: 嵌入维度大小
            num_hiddens: 隐藏状态维度大小
            num_layers: 隐藏层个数
        '''
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        
        # encoder-decoder framework
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embed_size, 
                                hidden_size=num_hiddens, 
                                num_layers=num_layers,
                                bidirectional=True)
        self.decoder = nn.Linear(4*num_hiddens, 5) # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        
    def forward(self, inputs):
        '''
        @params:
            inputs: 词语下标序列，形状为 (batch_size, seq_len) 的整数张量
        @return:
            outs: 对文本情感的预测，形状为 (batch_size, 2) 的张量
        '''
        # 因为LSTM需要将序列长度(seq_len)作为第一维，所以需要将输入转置
        embeddings = self.embedding(inputs.permute(1, 0)) # (seq_len, batch_size, d)
        # rnn.LSTM 返回输出、隐藏状态和记忆单元，格式如 outputs, (h, c)
        outputs, _ = self.encoder(embeddings) # (seq_len, batch_size, 2*h)
        encoding = torch.cat((outputs[0], outputs[-1]), -1) # (batch_size, 4*h)
        outs = self.decoder(encoding) # (batch_size, 5)
        return outs

embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

加载预训练的词向量

由于预训练词向量的词典及词语索引与我们使用的数据集并不相同，所以需要根据目前的词典及索引的顺序来加载预训练词向量。

In [17]:
cache_dir = "./"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

def load_pretrained_embedding(words, pretrained_vocab):
    '''
    @params:
        words: 需要加载词向量的词语列表，以 itos (index to string) 的词典形式给出
        pretrained_vocab: 预训练词向量
    @return:
        embed: 加载到的词向量
    '''
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它


  0%|          | 0/400000 [00:00<?, ?it/s][A
  0%|          | 1118/400000 [00:00<00:35, 11144.04it/s][A
  1%|          | 2130/400000 [00:00<00:36, 10801.49it/s][A
  1%|          | 3769/400000 [00:00<00:32, 12022.64it/s][A
  1%|▏         | 5231/400000 [00:00<00:31, 12683.79it/s][A
  2%|▏         | 6867/400000 [00:00<00:28, 13590.65it/s][A
  2%|▏         | 8232/400000 [00:00<00:28, 13608.27it/s][A
  2%|▏         | 9894/400000 [00:00<00:27, 14358.64it/s][A
  3%|▎         | 11367/400000 [00:00<00:26, 14466.67it/s][A
  3%|▎         | 12896/400000 [00:00<00:26, 14699.77it/s][A
  4%|▎         | 14339/400000 [00:01<00:26, 14492.36it/s][A
  4%|▍         | 15896/400000 [00:01<00:25, 14797.83it/s][A
  4%|▍         | 17590/400000 [00:01<00:24, 15346.57it/s][A
  5%|▍         | 19284/400000 [00:01<00:24, 15782.16it/s][A
  5%|▌         | 21070/400000 [00:01<00:23, 16314.06it/s][A
  6%|▌         | 22895/400000 [00:01<00:22, 16833.65it/s][A
  6%|▌         | 24741/400000 [00:01<00:21, 1

There are 916 oov words.



100%|█████████▉| 398535/400000 [00:38<00:00, 16418.01it/s][A

In [18]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

In [19]:
lr, num_epochs = 0.01, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9642, train acc 0.602, test acc 0.625, time 255.5 sec
epoch 2, loss 0.4367, train acc 0.639, test acc 0.638, time 254.3 sec
epoch 3, loss 0.2765, train acc 0.655, test acc 0.647, time 255.1 sec
epoch 4, loss 0.2010, train acc 0.666, test acc 0.642, time 257.2 sec
epoch 5, loss 0.1595, train acc 0.670, test acc 0.647, time 259.6 sec


In [22]:
def predict_sentiment(net, vocab, sentence):
    '''
    @params：
        net: 训练好的模型
        vocab: 在该数据集上创建的词典，用于将给定的单词序转换为单词下标的序列，从而输入模型
        sentence: 需要分析情感的文本，以单词序列的形式给出
    @return: 预测的结果
    0 - negative
    1 - somewhat negative
    2 - neutral
    3 - somewhat positive
    4 - positive
    '''
    device = list(net.parameters())[0].device # 读取模型所在的环境
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    sentiments = {0: "negative", 1: "somewhat negative", 2: "neutral", 3: "somewhat positive", 4: "positive"}
    return sentiments[label.item()]

In [23]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'somewhat positive'

## 2.3 卷积神经网络进行情感分类

**TextCNN 模型**

TextCNN 模型主要使用了一维卷积层和时序最大池化层。假设输入的文本序列由 n 个词组成，每个词用 d 维的词向量表示。那么输入样本的宽为 n，输入通道数为 d。TextCNN 的计算主要分为以下几步。

1. 定义多个一维卷积核，并使用这些卷积核对输入分别做卷积计算。宽度不同的卷积核可能会捕捉到不同个数的相邻词的相关性。
2. 对输出的所有通道分别做时序最大池化，再将这些通道的池化输出值连结为向量。
3. 通过全连接层将连结后的向量变换为有关各类别的输出。这一步可以使用丢弃层应对过拟合。

下面我们来实现 TextCNN 模型。与上一节相比，除了用一维卷积层替换循环神经网络外，这里我们还使用了两个嵌入层，一个的权重固定，另一个则参与训练。

In [29]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d,self).__init__()
    def forward(self, x):
        return torch.max_pool1d(x,kernel_size=x.shape[2])

class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        '''
        @params:
            vocab: 在数据集上创建的词典，用于获取词典大小
            embed_size: 嵌入维度大小
            kernel_sizes: 卷积核大小列表
            num_channels: 卷积通道数列表
        '''
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size) # 参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab), embed_size) # 不参与训练的嵌入层
        
        self.pool = GlobalMaxPool1d() # 时序最大池化层没有权重，所以可以共用一个实例
        self.convs = nn.ModuleList()  # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                                        out_channels = c, 
                                        kernel_size = k))
            
        self.decoder = nn.Linear(sum(num_channels), 5)
        self.dropout = nn.Dropout(0.5) # 丢弃层用于防止过拟合

    def forward(self, inputs):
        '''
        @params:
            inputs: 词语下标序列，形状为 (batch_size, seq_len) 的整数张量
        @return:
            outputs: 对文本情感的预测，形状为 (batch_size, 5) 的张量
        '''
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2) # (batch_size, seq_len, 2*embed_size)
        # 根据一维卷积层要求的输入格式，需要将张量进行转置
        embeddings = embeddings.permute(0, 2, 1) # (batch_size, 2*embed_size, seq_len)
        
        encoding = torch.cat([
            self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # encoding = []
        # for conv in self.convs:
        #     out = conv(embeddings) # (batch_size, out_channels, seq_len-kernel_size+1)
        #     out = self.pool(F.relu(out)) # (batch_size, out_channels, 1)
        #     encoding.append(out.squeeze(-1)) # (batch_size, out_channels)
        # encoding = torch.cat(encoding) # (batch_size, out_channels_sum)
        
        # 应用丢弃法后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs

embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

In [30]:
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.1583, train acc 0.544, test acc 0.604, time 145.4 sec
epoch 2, loss 0.4847, train acc 0.614, test acc 0.629, time 144.0 sec
epoch 3, loss 0.2920, train acc 0.650, test acc 0.649, time 143.8 sec
epoch 4, loss 0.2059, train acc 0.673, test acc 0.659, time 143.6 sec
epoch 5, loss 0.1566, train acc 0.687, test acc 0.662, time 143.5 sec


In [72]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])

'somewhat negative'