In [3]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys
os.environ['CUDA_VISIBLE_DEVICES']='0'
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT='./S1/CSCL/tangss/Datasets'

# 读取数据

In [4]:
fname=os.path.join(DATA_ROOT,'aclImdb_v1.tar.gz')
if not os.path.exists(os.path.join(DATA_ROOT,'aclImdb')):
    print('从压缩包解压...')
    with tarfile.open(fname,'r') as f:
        f.extractall(DATA_ROOT)

In [5]:
from tqdm import tqdm
def read_imdb(folder='train',data_root="./S1/CSCL/tangss/Datasets/aclImdb"):
    data=[]
    for label in ['pos','neg']:
        folder_name=os.path.join(data_root,folder,label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name,file),'rb') as f:
                review=f.read().decode('utf-8').replace('\n','').lower()
                data.append([review,1 if label=='pos' else 0])
    random.shuffle(data)
    return data
train_data,test_data=read_imdb('train'),read_imdb('test')

100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:05<00:00, 2147.60it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:05<00:00, 2213.46it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:05<00:00, 2229.79it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:06<00:00, 1828.55it/s]


# 预处理数据

In [6]:
def get_tokenized_imdb(data):
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review,_ in data]

In [7]:
def get_vocab_imdb(data):
    tokenized_data=get_tokenized_imdb(data)
    counter=collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter,min_freq=5)
vocab=get_vocab_imdb(train_data)
'#words in vocab:',len(vocab)

('#words in vocab:', 46152)

In [8]:
def preprocess_imdb(data,vocab):
    max_l=500
    def pad(x):
        return x[:max_l] if len(x)>max_l else x+[0]*(max_l-len(x))
    tokenized_data=get_tokenized_imdb(data)
    features=torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels=torch.tensor([score for _,score in data])
    return features,labels

# 创建数据迭代器

In [9]:
batch_size=64
train_set=Data.TensorDataset(*preprocess_imdb(train_data,vocab))
test_set=Data.TensorDataset(*preprocess_imdb(test_data,vocab))
train_iter=Data.DataLoader(train_set,batch_size,shuffle=True)
test_iter=Data.DataLoader(test_set,batch_size)

In [10]:
for X,y in train_iter:
    print('X',X.shape,'y:',y.shape)
    break
'#batches:',len(train_iter)

X torch.Size([64, 500]) y: torch.Size([64])


('#batches:', 391)

# 使用循环神经网络的模型

In [11]:
class BiRNN(nn.Module):
    def __init__(self,vocab,embed_size,num_hiddens,num_layers):
        super(BiRNN,self).__init__()
        self.embedding=nn.Embedding(len(vocab),embed_size)
        # bidirectional设为True即得到双向循环神经⽹络
        self.encoder=nn.LSTM(input_size=embed_size,
                            hidden_size=num_hiddens,
                            num_layers=num_layers,
                            bidirectional=True)
        # 初始时间步和最终时间步的隐藏状态作为全连接层输⼊
        self.decoder=nn.Linear(4*num_hiddens,2)
    def forward(self,inputs):
        embeddings=self.embedding(inputs.permute(1,0))
        outputs,_=self.encoder(embeddings)
        encoding=torch.cat((outputs[0],outputs[-1]),-1)
        outs=self.decoder(encoding)
        return outs

In [23]:
embed_size,num_hiddens,num_layers=100,100,2
net=BiRNN(vocab,embed_size,num_hiddens,num_layers)

# 加载预训练的词向量

In [13]:
glove_vocab=Vocab.GloVe(name='6B',dim=100,cache='./Datasets/glove')

In [14]:
def load_pretrained_embedding(words,pretrained_vocab):
    embed=torch.zeros(len(words),pretrained_vocab.vectors[0].shape[0])
    oov_count=0
    for i,word in enumerate(words):
        try:
            idx=pretrained_vocab.stoi[word]
            embed[i,:]=pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count+=0
    if oov_count>0:
        print('there are %d oov words.'%oov_count)
    return embed
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos,glove_vocab))
net.embedding.weight.requires_grad=False

# 训练并评价模型

In [15]:
import time
def train_ch5(net,train_iter,test_iter,batch_size,optimizer,device,num_epochs):
    net=net.to(device)
    print('training on ',device)
    loss=torch.nn.CrossEntropyLoss()
    batch_count=0
    for epoch in range(num_epochs):
        train_l_sum,train_acc_sum,n,start=0.0,0.0,0,time.time()
        for X,y in train_iter:
            X=X.to(device)
            y=y.to(device)
            y_hat=net(X)
            l=loss(y_hat,y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum+=l.cpu().item()
            train_acc_sum+=(y_hat.argmax(dim=1)==y).sum().cpu().item()
            n+=y.shape[0]
            batch_count+=1
        test_acc=evaluate_accuracy(test_iter,net)
        print('epoch %d,loss %.4f,train acc %.3f,test acc %.3f,time %.lf sec'%(epoch+1,train_l_sum/batch_count,train_acc_sum/n,test_acc,time.time()-start))

In [16]:
def evaluate_accuracy(data_iter,net,device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    acc_sum,n=0.0,0
    with torch.no_grad():
        for X,y in data_iter:
            if isinstance(net,torch.nn.Module):
                net.eval()#评估模式，这会关闭dropout
                acc_sum+=(net(X.to(device)).argmax(dim=1)==y.to(device)).float().sum().cpu().item()
                net.train()#改回训练模式
            else:
                if('is_training' in net.__code__.co_varname):
                    acc_sum+=(net(X,is_training=False).argmax(dim=1)==y).float().sum().item()
            n+=y.shape[0]
    return acc_sum/n

In [17]:
lr,num_epochs=0.01,5
optimizer=torch.optim.Adam(filter(lambda p:p.requires_grad,net.parameters()),lr=lr)
loss=nn.CrossEntropyLoss()
train_ch5(net,train_iter,test_iter,batch_size,optimizer,device,num_epochs)

training on  cuda
epoch 1,loss 0.5873,train acc 0.680,test acc 0.803,time 221 sec
epoch 2,loss 0.2000,train acc 0.823,test acc 0.841,time 219 sec
epoch 3,loss 0.1137,train acc 0.854,test acc 0.851,time 219 sec
epoch 4,loss 0.0762,train acc 0.871,test acc 0.840,time 219 sec
epoch 5,loss 0.0522,train acc 0.895,test acc 0.855,time 219 sec


In [18]:
def predict_sentiment(net,vocab,sentence):
    device=list(net.parameters())[0].device
    sentence=torch.tensor([vocab.stoi[word] for word in sentence],device=device)
    label=torch.argmax(net(sentence.view((1,-1))),dim=1)
    return 'positive' if label.item()==1 else 'negative'

In [19]:
predict_sentiment(net,vocab,['this','movie','is','so','great'])

'positive'

In [21]:
predict_sentiment(net,vocab,['this','movie','is','bad'])

'negative'