In [1]:
%pwd
%ls
import numpy as np

__notebook__.ipynb


# 1.读取数据

In [2]:
with open("../input/movie-review-data/reviews.txt") as f:
    reviews = f.read()
    
with open("../input/movie-review-data/labels.txt") as f:
    labels = f.read()

# 2.数据预处理和序列化

In [3]:
import re

def preprocess(w):
    # w = unicode_to_ascii(w.lower().strip())
    w = w.lower().strip()

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Záéíóúâêôãõàèìò?.!,¿\n]+", " ", w)

    w = w.strip()
    

    return w

In [4]:
reviews = preprocess(reviews)

In [5]:
reviews[:50]


'bromwell high is a cartoon comedy . it ran at the '

In [6]:
from collections import Counter
words = reviews.split(' ')
word_count = Counter(words)
vocab_size = len(word_count)+2 # 1 for unknow word and 1 for padding
print(vocab_size)

76638


In [7]:
keys = list(word_count.keys())
keys[:5]

['bromwell', 'high', 'is', 'a', 'cartoon']

In [8]:
dictionary = {keys[i]:i+1 for i in range(len(keys))}
dictionary

{'bromwell': 1,
 'high': 2,
 'is': 3,
 'a': 4,
 'cartoon': 5,
 'comedy': 6,
 '.': 7,
 'it': 8,
 'ran': 9,
 'at': 10,
 'the': 11,
 'same': 12,
 'time': 13,
 'as': 14,
 'some': 15,
 'other': 16,
 'programs': 17,
 'about': 18,
 'school': 19,
 'life': 20,
 'such': 21,
 'teachers': 22,
 'my': 23,
 'years': 24,
 'in': 25,
 'teaching': 26,
 'profession': 27,
 'lead': 28,
 'me': 29,
 'to': 30,
 'believe': 31,
 'that': 32,
 's': 33,
 'satire': 34,
 'much': 35,
 'closer': 36,
 'reality': 37,
 'than': 38,
 'scramble': 39,
 'survive': 40,
 'financially': 41,
 'insightful': 42,
 'students': 43,
 'who': 44,
 'can': 45,
 'see': 46,
 'right': 47,
 'through': 48,
 'their': 49,
 'pathetic': 50,
 'pomp': 51,
 'pettiness': 52,
 'of': 53,
 'whole': 54,
 'situation': 55,
 'all': 56,
 'remind': 57,
 'schools': 58,
 'i': 59,
 'knew': 60,
 'and': 61,
 'when': 62,
 'saw': 63,
 'episode': 64,
 'which': 65,
 'student': 66,
 'repeatedly': 67,
 'tried': 68,
 'burn': 69,
 'down': 70,
 'immediately': 71,
 'recalled':

In [9]:
'''
'i do' -> ['i','do']
'''
def text2seq(text):
    res = []
    for word in text.split(' '):
        res.append(dictionary.get(word,vocab_size-1))
        
    return res
        

In [10]:
def pad(seq,length):
    seqLen = len(seq)
    if seqLen < length:
        seq.extend([0]*(length-seqLen))
        
    elif seqLen > length:
        seq = seq[:length]
        
    return seq

# 3.准备Dataset和Dataloader

In [11]:
import torch
import torch.utils.data as data

class myDataset(data.Dataset):
    def __init__(self):
        self.review_list = reviews.split('\n')
        self.label_list = labels.split('\n')
    def __getitem__(self,idx):
        label = 1 if self.label_list[idx]=='positive' else 0
        return (pad(text2seq(self.review_list[idx]),255), label)
    def __len__(self):
        return len(self.review_list)

In [12]:
dataset=myDataset()
dataset[2]

([759,
  167,
  168,
  14,
  169,
  170,
  171,
  95,
  172,
  113,
  173,
  98,
  24,
  174,
  175,
  4,
  176,
  30,
  177,
  133,
  147,
  11,
  178,
  32,
  179,
  180,
  181,
  182,
  44,
  183,
  184,
  134,
  185,
  30,
  19,
  186,
  167,
  187,
  98,
  11,
  188,
  7,
  189,
  190,
  86,
  53,
  11,
  191,
  14,
  128,
  4,
  192,
  193,
  194,
  195,
  18,
  196,
  21,
  14,
  197,
  11,
  198,
  147,
  199,
  200,
  201,
  30,
  202,
  203,
  11,
  204,
  205,
  167,
  195,
  206,
  207,
  208,
  137,
  209,
  30,
  210,
  211,
  147,
  11,
  212,
  7,
  213,
  213,
  174,
  89,
  206,
  151,
  179,
  214,
  4,
  215,
  30,
  216,
  147,
  11,
  212,
  98,
  4,
  217,
  218,
  11,
  219,
  151,
  180,
  220,
  134,
  4,
  221,
  11,
  222,
  223,
  4,
  224,
  225,
  147,
  11,
  226,
  4,
  227,
  61,
  184,
  151,
  180,
  228,
  30,
  46,
  89,
  8,
  33,
  229,
  30,
  137,
  191,
  32,
  3,
  230,
  231,
  33,
  232,
  7,
  213,
  213,
  233,
  234,
  44,
  235,
  44,
 

## 划分训练集和测试集

In [13]:
def myfunc(batch_data):
    '''
    batch_data: 32x2
    '''
    resData = []
    resLabel = []
    for i in batch_data:
        resData.append(i[0])
        resLabel.append(i[1])
    return torch.tensor(resData,dtype=torch.int),torch.tensor(resLabel,dtype=torch.float)

In [14]:
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])


In [15]:
trainloader = data.DataLoader(train_dataset, batch_size=32,shuffle=True,collate_fn=myfunc)
testloader = data.DataLoader(test_dataset, batch_size=32,shuffle=True,collate_fn=myfunc)



In [16]:
# for i in trainloader:
#     trData, trLabel = i
#     print(type(trData))
#     print(type(trLabel))
#     print(trData.size())
#     print(trLabel.size())
#     break

'''
<class 'torch.Tensor'>
<class 'torch.Tensor'>
torch.Size([32, 255])
torch.Size([32])
'''


"\n<class 'torch.Tensor'>\n<class 'torch.Tensor'>\ntorch.Size([32, 255])\ntorch.Size([32])\n"

# 4. 定义模型

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

emb_dim = 30

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size,emb_dim,padding_idx=0)
        self.rnn = nn.RNN(input_size=emb_dim,hidden_size=32,num_layers=1,batch_first=True)
        self.fc1 = nn.Linear(32,128)
        self.fc2 = nn.Linear(128,32)
        self.fc3 = nn.Linear(32,1)

    def forward(self, x):
        x = self.embedding(x)                     # 32,255,30
#         h0 = torch.randn(1, x.size()[0], 32)               # D*num_layers, batch_size, hidden_size
        _,hns = self.rnn(x)                    # 1,32,32
        x = hns.view(hns.size()[1],-1)            # reshape to 32,32
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        outprob = torch.sigmoid(self.fc3(x))      # batch_size, 1
        return outprob.view(outprob.size()[0])    # reshape to 32
#         pass



In [18]:
# 创建网络模型
myModel = MyModel()

# 损失函数
loss_fn = nn.BCELoss()

# 优化器
learning_rate = 1e-2
optimizer = torch.optim.Adam(myModel.parameters(), lr=learning_rate)

# 训练的轮数
epoch = 100

In [19]:
def correct_num(vec1,vec2):
    result = (torch.abs(vec1-vec2)) <0.5
    return torch.sum(result).item()

In [20]:
test_data_size = len(test_dataset)
test_data_size

5000

# 5. 训练

In [21]:
for i in range(epoch):
    print("-------第 {} 轮训练开始-------".format(i+1))

    # 训练步骤开始
    myModel.train()
    step = 0
    for data in trainloader:
        trData, labels = data
        outputs = myModel(trData) # 求模型的输出
        loss = loss_fn(outputs, labels)  # 求loss
        step += 1
        
        if (step%100 ==0):
            print(f'第{i+1}轮第{step}训练step时的loss: {loss.item()}')
        

        # 优化器优化模型
        optimizer.zero_grad() # 梯度清零
        loss.backward()       # 求梯度
        optimizer.step()      # 更新参数


    # 测试步骤开始
    myModel.eval()
    total_test_loss = 0       # 每一轮总的loss
    total_accuracy = 0        # 每一轮总的精确度
    with torch.no_grad():     # 不求梯度，不更新参数
        for data in testloader:
            teData, teLabels = data
            outputs = myModel(teData)
            loss = loss_fn(outputs, teLabels)
            total_test_loss = total_test_loss + loss.item()
            total_accuracy = total_accuracy + correct_num(teLabels,outputs)

    print(f"第{i+1}轮整体测试集上的Loss: {total_test_loss}")
    print(f"第{i+1}轮整体测试集上的Accuracy: {total_accuracy/test_data_size}")
   

#     torch.save(tudui, "tudui_{}_epoch.pth".format(i))
#     print("模型已保存")

-------第 1 轮训练开始-------
第1轮第100训练step时的loss: 0.6935737729072571
第1轮第200训练step时的loss: 0.6904212236404419
第1轮第300训练step时的loss: 0.6965456008911133
第1轮第400训练step时的loss: 0.6885498762130737
第1轮第500训练step时的loss: 0.6987849473953247
第1轮第600训练step时的loss: 0.6940868496894836
第1轮整体测试集上的Loss: 108.82200574874878
第1轮整体测试集上的Accuracy: 0.5076
-------第 2 轮训练开始-------
第2轮第100训练step时的loss: 0.691878080368042
第2轮第200训练step时的loss: 0.6945547461509705
第2轮第300训练step时的loss: 0.6931473612785339
第2轮第400训练step时的loss: 0.6953945159912109
第2轮第500训练step时的loss: 0.6925694346427917
第2轮第600训练step时的loss: 0.6913613080978394
第2轮整体测试集上的Loss: 108.81661278009415
第2轮整体测试集上的Accuracy: 0.5052
-------第 3 轮训练开始-------
第3轮第100训练step时的loss: 0.6885122656822205
第3轮第200训练step时的loss: 0.6955881118774414
第3轮第300训练step时的loss: 0.6860030889511108
第3轮第400训练step时的loss: 0.6974500417709351
第3轮第500训练step时的loss: 0.6955376863479614
第3轮第600训练step时的loss: 0.6973163485527039
第3轮整体测试集上的Loss: 108.85424464941025
第3轮整体测试集上的Accuracy: 0.5052
-------第 4 轮训练开始-------

In [22]:
# for i in testloader:
#     trData, trLabel = i
#     prob=myModel(trData)
#     print(prob)
#     print(prob.size())
#     print(trLabel.size())
    
#     print(correct_num(prob,trLabel))
#     break

# '''
# tensor([0.5114, 0.5114, 0.5048, 0.5181, 0.5114, 0.5114, 0.5114, 0.5114, 0.5130,
#         0.4825, 0.5268, 0.5114, 0.5114, 0.5114, 0.5114, 0.5114, 0.5114, 0.5114,
#         0.5114, 0.5114, 0.5242, 0.5115, 0.5067, 0.4983, 0.5114, 0.5114, 0.5114,
#         0.5114, 0.5108, 0.5033, 0.5114, 0.5114], grad_fn=<ViewBackward>)
# torch.Size([32])
# torch.Size([32])
# '''