参考地址：[唐国梁Tommy](https://space.bilibili.com/474347248/channel/index)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from string import punctuation

In [2]:
with open('./reviews.txt', 'r') as file:
    text = file.read()

In [3]:
with open('./labels.txt', 'r') as file:
    labels = file.read()

In [4]:
labels[:10]

'positive\nn'

#### 1. 处理text

In [5]:
# 去标点符号
clean_text = ''.join(  [cha for cha in text if cha not in punctuation]     )

In [7]:
clean_text = clean_text.split('\n')

In [8]:
len(clean_text)

25001

In [9]:
clean_text[0]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   '

#### 2. 处理label

In [10]:
labels = labels.split('\n')

In [11]:
labels[:5]

['positive', 'negative', 'positive', 'negative', 'positive']

In [22]:
# positive :1 negative: 0
label_int = np.array(  [1 if x == 'positive' else 0 for x in labels]   )

In [23]:
len(label_int)

25001

In [24]:
from collections import Counter
Counter(label_int)

Counter({1: 12500, 0: 12501})

#### 3. 构建vocab

In [14]:
words = [word.lower() for sentence in clean_text for word in sentence.split(' ')]

In [15]:
words[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '', 'it', 'ran', 'at']

In [16]:
various_words = [word  for word in set(words)  if word]  # 清理调空字符

In [17]:
len(various_words)

74072

#### 4. int2word and word2int

In [20]:
int_word = dict(enumerate(various_words, 1))
word_int = {w: i for i, w in int_word.items()}

In [21]:
len(int_word), len(word_int)

(74072, 74072)

#### 5. padding

In [25]:
# 清理文本太短及过长的样本
# 统计文本中，每条评论的长度
sentence_length = [  len(sentence.split()) for sentence in clean_text  ]

In [26]:
counts = Counter(sentence_length)

In [28]:
min_sen = min(   sorted(counts.items() )   )

In [29]:
min_sen

(0, 1)

In [30]:
# 最大评论长度
max_sen = max(sorted(counts.items()))

In [31]:
max_sen

(2514, 1)

In [32]:
counts[2514]

1

In [33]:
# 获取 min 和 max 对应的索引
min_index = [i for i, length in enumerate(sentence_length) if length == min_sen[0]]
max_index = [i for i, length in enumerate(sentence_length) if length == max_sen[0]]

In [34]:
min_index

[25000]

In [35]:
max_index

[3908]

In [36]:
# 根据索引删除文本中过短，或过长的评论
new_text = np.delete(clean_text, min_index)

In [37]:
len(clean_text), len(new_text)

(25001, 25000)

In [38]:
new_text2 = np.delete(new_text, max_index)

In [40]:
len(clean_text), len(new_text2)

(25001, 24999)

In [41]:
# 同样需要在标签集中根据索引删除对应的标签
new_labels = np.delete(label_int, min_index)
new_labels = np.delete(new_labels, max_index)

In [42]:
len(label_int), len(new_labels)

(25001, 24999)

In [43]:
new_text2[0]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   '

In [44]:
# 2.6 将单词映射为整型

text_ints = []
for sentence in new_text2:
    sample = list()
    for word in sentence.split():
        int_value = word_int[word] # 获取到单词对应的键
        sample.append(int_value)
    text_ints.append(sample)

In [45]:
# 设定统一的文本长度，对整个文本数据中的每条评论进行填充或截断
# 设定每条评论固定长度为200个单词，不足的评论用0填充，超过的直接截断

def reset_text(text, seq_len):
    dataset = np.zeros(  (len(text), seq_len)  )  # 超长的补0
    
    for index, sentence in enumerate(text):
        if len(sentence) < seq_len:
            dataset[index, :len(sentence)] = sentence
        else:
            dataset[index, :] = sentence[: seq_len] # 截断
            
    return dataset

In [46]:
dataset = reset_text(text_ints, seq_len=200)

In [47]:
dataset.shape

(24999, 200)

In [48]:
dataset[0, :]

array([62697., 72211., 33323., 12552., 31884., 17251., 73299., 27908.,
       33626., 23906., 60435., 29069., 55246., 45865., 59719., 18591.,
        9946., 57217.,  8754., 36693., 55246., 47129.,  4762., 20176.,
       27068., 23906.,  8903., 17817.,  7732., 32514., 38070.,  1023.,
       17776., 62697., 72211., 64350., 39526., 33323., 12335., 10611.,
       38070., 46924., 25639., 33323., 47129., 23906.,  6058., 38070.,
         794., 73159., 23906.,  7431., 12469., 68376., 70877., 33260.,
       62543., 47388.,  1837.,  1701., 47129., 69580., 23906., 34713.,
       55853., 23906., 13278., 27898.,  1045., 53736., 32514., 55853.,
       23906., 49258., 47547., 37983., 27266.,  1837., 12469., 48599.,
       47547., 39491., 23906., 23727., 27068., 10909., 12552., 15379.,
        3572., 26751., 38070., 25736., 16270., 23906., 57217., 47547.,
        9347., 30407., 33626., 72211., 12552., 48905., 66388.,  8085.,
       47547., 30558., 24592., 38070., 65745., 54996., 55853., 67667.,
      

#### 6. 划分训练集和验证集和测试集

In [52]:
import torch
import torch.nn as nn

In [78]:
type(dataset), type(label_int)

(numpy.ndarray, numpy.ndarray)

In [54]:
# 数据类型转换
dataset_tensor = torch.from_numpy(dataset)
label_tensor = torch.from_numpy(new_labels)

In [55]:
dataset_tensor.shape, label_tensor.shape

(torch.Size([24999, 200]), torch.Size([24999]))

In [56]:
# 数据分割，train, val, test

# 总样本数
all_samples = len(dataset_tensor)
print("总样本数：",all_samples)

# 设置比例
ratio = 0.8
train_size = int(all_samples * 0.8) # 训练样本数
print("训练样本数：",train_size)

rest_size = all_samples - train_size # 剩余样本数

val_size = int(rest_size * 0.5) # 验证样本数
print("验证样本数：", val_size)

test_size = int(rest_size * 0.5) # 测试样本数
print("测试样本数：", test_size)

总样本数： 24999
训练样本数： 19999
验证样本数： 2500
测试样本数： 2500


In [57]:
# 获取train, val, test 样本

# train
train = dataset_tensor[:train_size]
train_labels = label_tensor[:train_size]

# 剩余样本
rest_samples = dataset_tensor[train_size:]
rest_labels = label_tensor[train_size:]

# val
val = rest_samples[:val_size]
val_labels = rest_labels[:val_size]

# test
test = rest_samples[val_size:]
test_labels = rest_labels[val_size:]

In [58]:
train.shape, train_labels.shape

(torch.Size([19999, 200]), torch.Size([19999]))

In [59]:
val.shape, val_labels.shape

(torch.Size([2500, 200]), torch.Size([2500]))

In [60]:
test.shape, test_labels.shape

(torch.Size([2500, 200]), torch.Size([2500]))

In [61]:
# 通过dataLoadder 按批处理数据
from torch.utils.data import TensorDataset, DataLoader

In [62]:
# 对数据进行封装 （评论，标签）
train_dataset = TensorDataset(train, train_labels)
val_dataset = TensorDataset(val, val_labels)
test_dataset = TensorDataset(test, test_labels)

batch_size = 128
# 批处理
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

In [63]:
# 获取train中的第一批数据
data, label = next(iter(train_loader))

In [65]:
data.shape, label.shape

(torch.Size([128, 200]), torch.Size([128]))

In [66]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

### 定义模型

In [88]:
class sentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout=0.5):
        super(sentiment, self).__init__()
        
        # batch_size = 128
        # seq_len=200
        
        # # 初始化超参数
        # vocab_size = len(word_int) + 1 # 输入 不同的单词个数  因为我们的index是从1开始的# 既word_embedding索引为0的 不会被更新。
        # output_size = 1 # 输出
        # embedding_dim = 400 # 词嵌入维度
        # hidden_dim = 128 # 隐藏层节点个数
        # num_layers = 2 # lstm的层数
        
        self.hidden_dim = hidden_dim
        self.ouput_size = output_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden): 
        #print('x.shape:', x.shape)
        batch_size = x.size(0) # 获取batch_size
        x = x.long()
        #print('x.long().shape:', x.shape)
        embeds = self.embedding(x)
        
        # print('Embedding.out.shape:', embeds.shape)
        # print('h_0.shape:',   hidden[0].shape)
        # print('h_0:',   hidden[0])
        # print('c_0.shape:',   hidden[1].shape)
        # print('C_0:',   hidden[1])
        
        #embeds [batch_size, seq_size,embedding_dim(fea_num)] = [128 * 200 * 400]
        out, hidden = self.lstm(embeds, hidden)
        # out: [batch_size, seq_size, hidden_dim] = [128 * 200 * 128]
        
        # print('lstm.out:', out.shape)  # [128 * 200 * 128]
        # print('lstm.h_n:', hidden[0].shape) #h_n.shape[num_layers, batch_size, hidden_dim] = [2 * 128 * 128]
        # print('lstm.c_n:', hidden[1].shape) #c_n.shape[num_layers, batch_size, hidden_dim] = [2 * 128 * 128]
        
        out = out.reshape(-1, self.hidden_dim)
        # out: [25600=128*200, 128]
        # print('reshape.out.shape', out.shape)
        
        out = self.linear(out)
        # out: [output_size] = [25600 * 1]
        # print('linear.out.shape:', out.shape)
        
        sigmoid_out = self.sigmoid(out) 
        # sigmoid_out: [25600 * 1] 维度不变的
        # print('Sigmoid.shape:', sigmoid_out.shape)
        
        sigmoid_out = sigmoid_out.reshape(batch_size, -1)
        # sigmoid_out: [128 * 200] 维度不变的
        # print('Sigmoid.reshape.shape:', sigmoid_out.shape)
        # print('Sigmoid.reshape.shape:', sigmoid_out)
        
        sigmoid_out = sigmoid_out[:, -1]
        # print('last out.shape:', sigmoid_out.shape)
        # print('last out.shape:', sigmoid_out)
        
        # raise Exception('err')
        
        return sigmoid_out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device),
                  weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden 

In [89]:
# 初始化超参数
# seq_len = 200
# batch_size = 128
# vocab_size = 74073
vocab_size = len(word_int) + 1 # 输入 不同的单词个数  因为我们的index是从1开始的# 既word_embedding索引为0的 不会被更新。
output_size = 1 # 输出
embedding_dim = 400 # 词嵌入维度
hidden_dim = 128 # 隐藏层节点个数
num_layers = 2 # lstm的层数

In [90]:
# 创建模型
model = sentiment(vocab_size, embedding_dim, hidden_dim, output_size, num_layers)

In [91]:
model

sentiment(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 128, num_layers=2, batch_first=True, dropout=0.5)
  (linear): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

### 模型参数分析

#### 一：嵌入层

* 输入：$\rm [batch\_size, seq\_len] = [128, 200]$

```python
batch_size = x.size(0) # 获取batch_size
x = x.long()

self.embedding = nn.Embedding(vocab_size, embedding_dim)
embeds = self.embedding(x)
```
* 输出: $\rm [batch\_size, seq\_len, embedding\_size] = [128, 200, 400]$

#### 二: LSTM

[LSTM文档](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM)

* 输入：接上面输出
```python
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
out, hidden = self.lstm(embeds, hidden)
```


* 输出-out: $\rm [N,L,D*H_{out}] = [batch\_size, seq\_len, 1 \times hidden\_size] =  [128, 200, 128]$
* 输出-h_n: $\rm [D \times num\_layers, N, H_{out}] = [1 \times num\_layers, batch\_size, hidden\_size] = [2, 128, 128]$
* 输出-c_h: $\rm [D \times num\_layers, N, H_{cell}] = [1 \times num\_layers, batch\_size, hidden\_size] = [2, 128, 128]$

* 作用：
1. 相当于把400维的特征，压缩成128维特征
2. 当然中间使用了各种门来克服RNN的短时记忆. 尽量做到无损压缩？

#### 三：全连接层

* [文档torch.reshape](https://pytorch.org/docs/stable/generated/torch.reshape.html#torch.reshape)

```python
out = out.reshape(-1, self.hidden_dim)
```
输出：$[128\times 200, 128] = [25600, 128]$

* [文档nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#torch.nn.Linear)

```python
self.linear = nn.Linear(hidden_dim, output_size) # input_size=hidden_dim=128 这个参数是对接上一层的输入的, 必须对齐上一次的输出
out = self.linear(out)
```
输出：$\rm [*, H_{out}=out\_features=1] = [25600, 1]$

 where all but the last dimension are the same shape as the input
 
 除了最后一个维度，其他的跟输入维度一致

#### 四：输出层（分类）


```python
self.sigmoid = nn.Sigmoid()
sigmoid_out = self.sigmoid(out) # 输出每个句子的类别概率分布 sigmoid不改变输入数据的形状
```

输出：$\rm [25600, 1]$

```python
sigmoid_out = sigmoid_out.reshape(batch_size, -1)
```
输出：$\rm [128, ?] = [128, 200]$

```python
sigmoid_out = sigmoid_out[:, -1]
```
输出: $\rm [128]$ 取了最后一列， 为啥取最后一列？

[二元交叉熵损失](https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html#torch.nn.BCELoss)

#### 五：总结

1. 嵌入层：128 * 200的句子阵列，经过嵌入层，变成 128 * 200 * 400 。 相当于完成了word2vec。  1个单词->400特征维度
2. lsmt层：128 * 200 * 400， 将特征压缩为 128 * 200 * 128                              400特征维度-> 变128维特征
3. 全连接层：将lstm的输出out, reshape到最后一个维度是128维特征（是为了保持语义对齐？半个单词扔入下一层?。）输出特征维度必须是1，便于接sogmoid完成二分类。

#### 六：参数日志
```python
x.shape: torch.Size([128, 200])
x.long().shape: torch.Size([128, 200])
Embedding.out.shape: torch.Size([128, 200, 400])
h_0.shape: torch.Size([2, 128, 128])
h_0: tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])
c_0.shape: torch.Size([2, 128, 128])
C_0: tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])
lstm.out: torch.Size([128, 200, 128])
lstm.h_n: torch.Size([2, 128, 128])
lstm.c_n: torch.Size([2, 128, 128])
reshape.out.shape torch.Size([25600, 128])
linear.out.shape: torch.Size([25600, 1])
Sigmoid.shape: torch.Size([25600, 1])
Sigmoid.reshape.shape: torch.Size([128, 200])
Sigmoid.reshape.shape: tensor([[0.4760, 0.4678, 0.4703,  ..., 0.4949, 0.4902, 0.4752],
        [0.4901, 0.4839, 0.4789,  ..., 0.4729, 0.4694, 0.4691],
        [0.4876, 0.4865, 0.4863,  ..., 0.4751, 0.4749, 0.4814],
        ...,
        [0.4786, 0.4785, 0.4747,  ..., 0.4837, 0.4819, 0.4820],
        [0.4817, 0.4824, 0.4814,  ..., 0.4794, 0.4776, 0.4740],
        [0.4742, 0.4786, 0.4758,  ..., 0.4801, 0.4867, 0.4926]],
       grad_fn=<ReshapeAliasBackward0>)
last out.shape: torch.Size([128])
last out.shape: tensor([0.4752, 0.4691, 0.4814, 0.4681, 0.4697, 0.4922, 0.4769, 0.4800, 0.4899,
        0.4674, 0.4897, 0.4811, 0.4813, 0.4744, 0.4880, 0.4758, 0.4850, 0.4868,
        0.4915, 0.4682, 0.4884, 0.4724, 0.4829, 0.4729, 0.4691, 0.4802, 0.4709,
        0.4707, 0.4906, 0.4702, 0.4837, 0.4818, 0.4754, 0.4862, 0.4534, 0.4722,
        0.4651, 0.4451, 0.4810, 0.4856, 0.4892, 0.4785, 0.4865, 0.4763, 0.4716,
        0.4931, 0.4601, 0.4717, 0.4765, 0.4740, 0.4830, 0.4815, 0.4913, 0.4778,
        0.4749, 0.4700, 0.4896, 0.4720, 0.4680, 0.4752, 0.4854, 0.4779, 0.4686,
        0.4798, 0.4728, 0.4898, 0.4675, 0.4702, 0.4727, 0.4803, 0.4757, 0.4830,
        0.4788, 0.4707, 0.4817, 0.4762, 0.4861, 0.4802, 0.4785, 0.4819, 0.4918,
        0.4630, 0.4900, 0.4705, 0.4633, 0.4680, 0.4946, 0.4583, 0.4794, 0.4937,
        0.4784, 0.4671, 0.4887, 0.4868, 0.4811, 0.4697, 0.4739, 0.4957, 0.4735,
        0.4730, 0.4699, 0.4693, 0.4680, 0.4783, 0.4637, 0.4902, 0.4606, 0.4828,
        0.4750, 0.4750, 0.4787, 0.4750, 0.4773, 0.4816, 0.4640, 0.4849, 0.4769,
        0.4760, 0.4776, 0.4766, 0.4827, 0.4769, 0.4860, 0.4850, 0.4831, 0.4820,
        0.4740, 0.4926], grad_fn=<SelectBackward0>)
```

### 训练模型

In [96]:
criterion = torch.nn.BCELoss() # 损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # 优化器
num_epochs = 10 # 循环次数
# num_epochs = 1 # 循环次数

In [97]:
model = model.to(device)

In [98]:
# 定义训练模型
def train(model, device, data_loader, criterion, optimizer, num_epochs, val_loader):
    history = []
    for epoch in range(num_epochs):
        hs = model.init_hidden(batch_size)
        train_loss = []
        train_correct = 0.0
        model.train()
        for data, target in data_loader:
            data = data.to(device)
            target = target.to(device)
            
            optimizer.zero_grad()
            output, hs = model(data, hs) # 模型训练
            hs = tuple(  [h.data for h in hs]  )
            # output.shape) # torch.Size([128])
            loss = criterion(output, target.float()) # 计算损失
            train_loss.append(loss.item()) # 累计损失
            loss.backward() # 反向传播
            optimizer.step() # 参数更新
            train_correct += torch.sum(output==target) # 比较
            
        # 模型严重
        model.eval()
        hs = model.init_hidden(batch_size)
        val_loss = []
        val_correct = 0.0
        with torch.no_grad():
            for data, target in val_loader:
                data = data.to(device)
                target = target.to(device)
                preds, hs = model(data, hs) # 验证
                hs = tuple([h.data for h in hs])
                
                losss = criterion(preds, target.float()) # 计算损失
                
                val_loss.append(loss.item()) # 累计损失
                
                val_correct += torch.sum(preds==target) # 比较
                
        print(f'Epoch {epoch}/{num_epochs} --- train loss {np.round(np.mean(train_loss), 5)} --- val loss {np.round(np.mean(val_loss),5)}')
    

In [99]:
train(model, device, train_loader, criterion, optimizer, num_epochs, val_loader)

Epoch 0/10 --- train loss 0.47734 --- val loss 0.40428
Epoch 1/10 --- train loss 0.25807 --- val loss 0.35364
Epoch 2/10 --- train loss 0.16648 --- val loss 0.19105
Epoch 3/10 --- train loss 0.10943 --- val loss 0.13632
Epoch 4/10 --- train loss 0.09124 --- val loss 0.04716
Epoch 5/10 --- train loss 0.08076 --- val loss 0.04988
Epoch 6/10 --- train loss 0.09673 --- val loss 0.12098
Epoch 7/10 --- train loss 0.07418 --- val loss 0.17525
Epoch 8/10 --- train loss 0.06416 --- val loss 0.06349
Epoch 9/10 --- train loss 0.05212 --- val loss 0.01537


In [100]:
# 测试
def test(model, data_loader, device, criterion):
    test_losses = []
    num_correct = 0
    # 初始化隐藏状态
    hs = model.init_hidden(batch_size)
    model.eval()
    for i, dataset in enumerate(data_loader):
        data = dataset[0].to(device) # 部署到device
        target = dataset[1].to(device)
        output, hs = model(data, hs) # 测试
        loss = criterion(output, target.float()) # 计算损失
        pred = torch.round(output) # 将预测值进行四舍五入，转换为0 或 1
        test_losses.append(loss.item()) # 保存损失
        correct_tensor = pred.eq(target.float().view_as(pred)) # 返回一堆True 或 False
        correct = correct_tensor.cpu().numpy()
        result = np.sum(correct)
        num_correct += result
        #print("num correct : ", num_correct)
        print(f'Batch {i}')
        print(f'loss : {np.round(np.mean(loss.item()), 3)}')
        print(f'accuracy : {np.round(result / len(data), 3) * 100} %')
        print()
    print("总的测试损失 test loss : {:.2f}".format(np.mean(test_losses)))
    print("总的测试准确率 test accuracy : {:.2f}".format(np.mean(num_correct / len(data_loader.dataset))))

In [101]:
test(model, test_loader, device, criterion)

Batch 0
loss : 0.759
accuracy : 84.39999999999999 %

Batch 1
loss : 1.245
accuracy : 70.3 %

Batch 2
loss : 1.191
accuracy : 73.4 %

Batch 3
loss : 1.121
accuracy : 71.89999999999999 %

Batch 4
loss : 0.927
accuracy : 75.0 %

Batch 5
loss : 1.022
accuracy : 78.10000000000001 %

Batch 6
loss : 0.965
accuracy : 74.2 %

Batch 7
loss : 0.837
accuracy : 77.3 %

Batch 8
loss : 1.059
accuracy : 72.7 %

Batch 9
loss : 0.947
accuracy : 78.10000000000001 %

Batch 10
loss : 1.221
accuracy : 74.2 %

Batch 11
loss : 0.723
accuracy : 76.6 %

Batch 12
loss : 0.98
accuracy : 80.5 %

Batch 13
loss : 1.237
accuracy : 70.3 %

Batch 14
loss : 0.918
accuracy : 74.2 %

Batch 15
loss : 1.062
accuracy : 75.8 %

Batch 16
loss : 1.031
accuracy : 76.6 %

Batch 17
loss : 0.894
accuracy : 78.10000000000001 %

Batch 18
loss : 0.89
accuracy : 76.6 %

总的测试损失 test loss : 1.00
总的测试准确率 test accuracy : 0.74


### 预测predict

In [111]:
text = 'this movie is so amazing. the plot is attractive. and I really like it.'
text = """this film lacked something i couldn  t put my finger on at first charisma on the part of the leading actress . this inevitably translated to lack of chemistry when she shared the screen with her leading man . even the romantic scenes came across as being merely the actors at play . it could very well have been the director who miscalculated what he needed from the actors . i just don  t know .  br    br   but could it have been the screenplay  just exactly who was the chef in love with  he seemed more enamored of his culinary skills and restaurant  and ultimately of himself and his youthful exploits  than of anybody or anything else . he never convinced me he was in love with the princess .  br    br   i was disappointed in this movie . but  don  t forget it was nominated for an oscar  so judge for yourself .  """

In [112]:
def converts(text):
    # 去除标点符号
    new_text = ''.join([char for char in text if char not in punctuation])
    print('new text:', new_text)
    # 文本映射为索引
    text_ints = [word_int[word.lower()] for word in new_text.split()]
    print("文本映射为索引：\n", text_ints)
    return text_ints

In [113]:
text_ints = converts(text)

new text: this film lacked something i couldn  t put my finger on at first charisma on the part of the leading actress  this inevitably translated to lack of chemistry when she shared the screen with her leading man  even the romantic scenes came across as being merely the actors at play  it could very well have been the director who miscalculated what he needed from the actors  i just don  t know   br    br   but could it have been the screenplay  just exactly who was the chef in love with  he seemed more enamored of his culinary skills and restaurant  and ultimately of himself and his youthful exploits  than of anybody or anything else  he never convinced me he was in love with the princess   br    br   i was disappointed in this movie  but  don  t forget it was nominated for an oscar  so judge for yourself   
文本映射为索引：
 [48840, 56853, 24852, 10937, 47547, 3644, 8941, 15753, 4762, 14110, 11467, 33626, 65442, 57132, 11467, 23906, 46379, 55853, 23906, 5607, 47267, 48840, 48955, 24785, 3

In [114]:
# 文本对齐，sequence_length = 200
new_text_ints = reset_text([text_ints], seq_len=200) # 注意这里要添加一个[]，因为，reset_text处理的二维数据

In [115]:
new_text_ints

array([[48840., 56853., 24852., 10937., 47547.,  3644.,  8941., 15753.,
         4762., 14110., 11467., 33626., 65442., 57132., 11467., 23906.,
        46379., 55853., 23906.,  5607., 47267., 48840., 48955., 24785.,
        38070., 24314., 55853., 42406., 48599., 65118., 17250., 23906.,
        43077., 21600., 20566.,  5607., 68727., 33061., 23906.,  7497.,
        61439., 41821., 52397., 55246., 30498., 19773., 23906., 65721.,
        33626., 67051., 73299., 44709., 61249., 40806., 56442., 53448.,
        23906., 25412., 68376., 67938., 66837., 66076.,  8454., 27500.,
        23906., 65721., 47547., 63309., 49171.,  8941., 33669., 25187.,
        25187., 15418., 44709., 73299., 56442., 53448., 23906., 29603.,
        63309., 24485., 68376., 18552., 23906., 59151., 27068., 63856.,
        21600., 66076., 54163., 40865., 58370., 55853.,  6411., 34154.,
        34767., 27266., 42203., 27266., 22085., 55853., 27071., 27266.,
         6411., 57018., 61280., 25639., 55853., 46145., 18473., 

In [116]:
# numpy to tensor
text_tensor  = torch.from_numpy(new_text_ints)

In [117]:
text_tensor.shape

torch.Size([1, 200])

In [118]:
def predict(model, text_tensor, device):
    batch_size = text_tensor.size(0) # 这里是1
    hs = model.init_hidden(batch_size) # 初始化隐状态
    text_tensor = text_tensor.to(device)
    
    pred, hs = model(text_tensor, hs) #判断
    print('概率值', pred.item())
    # 将pred概率值转换为0或1
    pred = torch.round(pred)
    print('判定值', pred.item())
    # 判断
    if pred.data == 1:
        print('正面评论')
    else:
        print('反面评论')

In [119]:
predict(model, text_tensor, device)

概率值 0.0018330179154872894
判定值 0.0
反面评论
