In [1]:
"""
读取数据并对数据做预处理
统计出训练数据中出现频次最多的5k个单词，用这出现最多的5k个单词创建词表（词向量）
对于测试数据，直接用训练数据构建的词表
"""

import os
import torch
import torch.nn as nn
import argparse
import copy
from torch.autograd import Variable

class DataProcessor(object):
    """Load review text files and turn them into embedded tensor datasets.

    The directory layout read is ``<data_root>/<train|test>/<pos|neg>/``.
    Positive reviews are labelled ``[1, 0]`` and negative reviews ``[0, 1]``.

    :param data_root: root directory of the dataset.
    :param max_files_per_class: maximum number of files read from each of the
        ``pos`` / ``neg`` folders; ``None`` reads every file. The default of
        2001 matches the number of files the previous hard-coded loop
        (``if i > 2000: break``) admitted.
    """

    # Class-level defaults so instances created without __init__ still work.
    data_root = "./data/aclImdb"
    max_files_per_class = 2001

    def __init__(self, data_root="./data/aclImdb", max_files_per_class=2001):
        self.data_root = data_root
        self.max_files_per_class = max_files_per_class

    def read_text(self, is_train_data):
        """Read the raw review texts and their one-hot labels.

        :param is_train_data: True reads the ``train`` split, False reads the
            ``test`` split.
        :return: ``(datas, labels)`` where ``datas`` is a list of file
            contents (positives first, then negatives) and ``labels`` is the
            parallel list of ``[1, 0]`` / ``[0, 1]`` labels.
        :raises FileNotFoundError: if a split directory does not exist.
        """
        split = "train" if is_train_data else "test"
        datas = []
        labels = []
        for sub_dir, label in (("pos", [1, 0]), ("neg", [0, 1])):
            folder = os.path.join(self.data_root, split, sub_dir)
            if not os.path.isdir(folder):
                raise FileNotFoundError(
                    "Data directory not found: {!r}. Expected the dataset "
                    "to be extracted under {!r}.".format(
                        folder, self.data_root))
            # os.listdir returns entries in arbitrary order; sorting makes the
            # capped subset of files reproducible across runs and machines.
            file_names = sorted(os.listdir(folder))
            if self.max_files_per_class is not None:
                file_names = file_names[:self.max_files_per_class]
            for file_name in file_names:
                file_position = os.path.join(folder, file_name)
                with open(file_position, "r", encoding="utf-8") as f:
                    datas.append(f.read())
                labels.append(list(label))
        return datas, labels

    def word_count(self, datas):
        """Count lower-cased, whitespace-separated words.

        :param datas: iterable of text strings.
        :return: list of ``(word, count)`` pairs sorted by count, descending;
            ties keep first-seen order because the sort is stable.
        """
        counts = {}
        for data in datas:
            for word in data.split():
                word = word.lower()  # the vocabulary is case-insensitive
                counts[word] = counts.get(word, 0) + 1
        return sorted(counts.items(), key=lambda item: item[1], reverse=True)

    def word_index(self, datas, vocab_size):
        """Build the word -> index vocabulary from the most frequent words.

        Index 0 is reserved for ``<unk>`` (out-of-vocabulary words) and
        index 1 for ``<pad>`` (sentence padding); real words start at 2.

        :param datas: iterable of text strings used to count words.
        :param vocab_size: upper bound on the number of real words kept.
        :return: ``(word2index, vocab_size)`` where the returned size is the
            number of real words actually stored (excluding the two special
            tokens).
        """
        word_count_sorted = self.word_count(datas)
        word2index = {"<unk>": 0, "<pad>": 1}
        # The effective size is limited by both the request and the corpus;
        # clamp at 0 so a negative request cannot slice from the end.
        vocab_size = max(0, min(len(word_count_sorted), vocab_size))
        for offset, (word, _) in enumerate(word_count_sorted[:vocab_size]):
            word2index[word] = offset + 2
        return word2index, vocab_size

    def _encode(self, datas, word2index, max_len):
        """Map each text to a list of exactly ``max_len`` vocabulary indices."""
        unk_index = word2index["<unk>"]
        pad_index = word2index["<pad>"]
        features = []
        for data in datas:
            # Truncate long sentences, then right-pad short ones.
            tokens = data.split()[:max_len]
            feature = [word2index.get(token.lower(), unk_index)
                       for token in tokens]
            feature.extend([pad_index] * (max_len - len(feature)))
            features.append(feature)
        return features

    def get_datasets(self, vocab_size, embedding_size, max_len):
        """Build the train and test ``TensorDataset`` objects.

        Both splits are embedded with the same freshly initialised
        ``nn.Embedding`` because each new embedding layer has different random
        weights; the vocabulary comes from the training split only.

        :param vocab_size: upper bound on the number of real words kept.
        :param embedding_size: dimension of each word vector.
        :param max_len: number of tokens per sentence after
            truncation/padding.
        :return: ``(train_datasets, test_datasets)``; features have shape
            ``(num_samples, max_len, embedding_size)`` and carry no gradient.
        :raises ValueError: if ``max_len`` is negative.
        :raises FileNotFoundError: if a split directory does not exist.
        """
        if max_len < 0:
            raise ValueError("max_len must be non-negative")

        train_datas, train_labels = self.read_text(is_train_data=True)
        word2index, vocab_size = self.word_index(train_datas, vocab_size)
        test_datas, test_labels = self.read_text(is_train_data=False)

        # Every row has length max_len, so the nested lists form a tensor.
        train_ids = torch.LongTensor(
            self._encode(train_datas, word2index, max_len))
        test_ids = torch.LongTensor(
            self._encode(test_datas, word2index, max_len))
        train_labels = torch.FloatTensor(train_labels)
        test_labels = torch.FloatTensor(test_labels)

        # The two special tokens make the table vocab_size + 2 rows.
        embed = nn.Embedding(vocab_size + 2, embedding_size)
        # The embedded features are fixed inputs, not trainable parameters,
        # so they are computed without recording an autograd graph.
        with torch.no_grad():
            train_features = embed(train_ids)
            test_features = embed(test_ids)

        train_datasets = torch.utils.data.TensorDataset(
            train_features, train_labels)
        test_datasets = torch.utils.data.TensorDataset(
            test_features, test_labels)
        return train_datasets, test_datasets


In [2]:
import torch
import torch.nn as nn



class RNN(nn.Module):
    """Multi-layer vanilla RNN classifier over pre-embedded sequences.

    :param input_size: dimension of each word vector
    :param hidden_size: number of hidden units per layer
    :param output_size: number of output classes
    :param num_layers: number of stacked RNN layers
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.rnn = nn.RNN(
            self.input_size, self.hidden_size,
            self.num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Classify a batch of sequences.

        :param x: tensor of shape (batch, seq_len, input_size), as required
            by the ``batch_first=True`` RNN.
        :return: tensor of shape (batch, output_size) with softmax
            probabilities over the classes.
        """
        batch_size = x.size(0)  # read per call; the last batch may be smaller
        # Allocate the initial state on the input's device and dtype so the
        # module keeps working after .to(device) / .double().
        h0 = torch.zeros(
            self.num_layers, batch_size, self.hidden_size,
            device=x.device, dtype=x.dtype)
        output, _ = self.rnn(x, h0)
        # Keep only the last time step so the result fits the fc layer.
        last_step = output[:, -1, :]
        return self.softmax(self.fc(last_step))


In [3]:
class LSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier over pre-embedded sequences.

    :param embedding_size: dimension of each word vector
    :param hidden_size: number of hidden units per layer and direction
    :param num_layers: number of stacked LSTM layers
    :param num_classes: number of output classes
    :param device: stored on ``self.device`` for callers that read it; the
        forward pass itself follows the device of its input
    """
    def __init__(self, embedding_size, hidden_size, num_layers,
                 num_classes, device):
        super(LSTM, self).__init__()

        self.num_directions = 2
        self.input_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=num_layers,
                            bidirectional=(self.num_directions == 2))

        # The classifier consumes the final hidden state of every layer and
        # direction, flattened per sample.
        self.liner = nn.Linear(
            num_layers * self.num_directions * hidden_size, num_classes)
        self.act_func = nn.Softmax(dim=1)
        self.device = device

    def forward(self, input):
        """Classify a batch of sequences.

        :param input: tensor of shape (batch, seq_len, embedding_size).
        :return: tensor of shape (batch, num_classes) with softmax
            probabilities over the classes.
        """
        # nn.LSTM (not batch_first) expects [seq_len, batch_size, input_size].
        seq_first = input.permute(1, 0, 2)
        batch_size = seq_first.size(1)
        state_shape = (self.num_layers * self.num_directions,
                       batch_size, self.hidden_size)
        # Zero initial states keep inference deterministic; random states
        # would make the same input produce different predictions per call.
        # They follow the input's device/dtype instead of the constructor-time
        # device, which can be stale after the module is moved.
        h_0 = torch.zeros(state_shape, device=input.device, dtype=input.dtype)
        c_0 = torch.zeros(state_shape, device=input.device, dtype=input.dtype)
        _, (h_n, _) = self.lstm(seq_first, (h_0, c_0))
        # h_n is [layers * directions, batch, hidden]; put batch first and
        # flatten the rest into one feature vector per sample.
        features = h_n.permute(1, 0, 2).contiguous().view(
            batch_size,
            self.num_layers * self.num_directions * self.hidden_size)
        return self.act_func(self.liner(features))


In [4]:
def test(model, test_loader, loss_func, device):
    model.eval()
    loss_val = 0.0
    corrects = 0.0
    for datas, labels in test_loader:
        datas = datas.to(device)
        labels = labels.to(device)

        preds = model(datas)
        loss = loss_func(preds, labels)

        loss_val += loss.item() * datas.size(0)

        # 获取预测的最大概率出现的位置
        preds = torch.argmax(preds, dim=1)
        labels = torch.argmax(labels, dim=1)
        corrects += torch.sum(preds == labels).item()
    test_loss = loss_val / len(test_loader.dataset)
    test_acc = corrects / len(test_loader.dataset)
    print("Test Loss: {}, Test Acc: {}".format(test_loss, test_acc))
    return test_acc


def train(model, train_loader, test_loader, epochs, device, lr):
    """Train ``model`` with Adam and keep the best-evaluated weights.

    The model is evaluated with ``test`` on every even-numbered epoch; the
    weights with the highest accuracy seen are restored before returning.

    :param model: module to train; expected to already live on ``device``.
    :param train_loader: iterable of ``(datas, labels)`` training batches
        exposing a ``dataset`` attribute with a length; labels are one-hot.
    :param test_loader: loader passed to ``test`` for evaluation.
    :param epochs: number of passes over ``train_loader``.
    :param device: device the batches are moved to before the forward pass.
    :param lr: Adam learning rate.
    :return: the same ``model`` object, loaded with the best weights.
    """
    best_val_acc = 0.0
    best_model_params = copy.deepcopy(model.state_dict())
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # NOTE(review): the RNN and LSTM models in this file end with nn.Softmax,
    # while nn.CrossEntropyLoss applies its own log-softmax to its input.
    # Confirm whether the double normalisation is intended.
    loss_fn = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        model.train()
        loss_val = 0.0
        corrects = 0.0
        for datas, labels in train_loader:
            # Batches must be on the same device as the model, exactly as
            # the evaluation loop does.
            datas = datas.to(device)
            labels = labels.to(device)

            preds = model(datas)
            loss = loss_fn(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # loss is a batch mean; weight it by batch size for a dataset mean.
            loss_val += loss.item() * datas.size(0)

            # Compare the index of the largest score with the one-hot label.
            pred_classes = torch.argmax(preds, dim=1)
            true_classes = torch.argmax(labels, dim=1)
            corrects += torch.sum(pred_classes == true_classes).item()
        train_loss = loss_val / len(train_loader.dataset)
        train_acc = corrects / len(train_loader.dataset)
        if epoch % 2 == 0:
            print("Train Loss: {}, Train Acc: {}".format(
                train_loss, train_acc))
            test_acc = test(model, test_loader, loss_fn, device)
            if best_val_acc < test_acc:
                best_val_acc = test_acc
                best_model_params = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_params)
    return model





def select_model(config):
    """Instantiate the network named by ``config.net``.

    :param config: object with ``net``, ``embedding_size``, ``hidden_size``,
        ``num_layers`` and ``num_classes`` attributes (plus ``device`` for
        the LSTM).
    :return: an ``LSTM`` or ``RNN`` instance.
    :raises NameError: if ``config.net`` is neither "LSTM" nor "RNN".
    """
    net_name = config.net
    if net_name == "LSTM":
        return LSTM(
            config.embedding_size,
            config.hidden_size,
            config.num_layers,
            config.num_classes,
            config.device,
        )
    if net_name == "RNN":
        return RNN(
            config.embedding_size,
            hidden_size=config.hidden_size,
            output_size=config.num_classes,
            num_layers=config.num_layers,
        )
    # Guard clause: anything else is an unknown network name.
    raise NameError("No defined net found")


def get_dataloader(config):
    """Build shuffled train and test data loaders from the configuration.

    :param config: object with ``vocab_size``, ``embedding_size``,
        ``sentence_max_len`` and ``batch_size`` attributes.
    :return: ``(train_loader, test_loader)``.
    """
    train_set, test_set = DataProcessor().get_datasets(
        vocab_size=config.vocab_size,
        embedding_size=config.embedding_size,
        max_len=config.sentence_max_len,
    )

    def _shuffled_loader(dataset):
        # Both splits use the same batch size and are shuffled.
        return torch.utils.data.DataLoader(
            dataset, batch_size=config.batch_size, shuffle=True)

    return _shuffled_loader(train_set), _shuffled_loader(test_set)


In [5]:
def parse_args():
    """Return the default run configuration as an ``argparse.Namespace``.

    An empty argument list is parsed on purpose, so the process command line
    is ignored and every option takes its default value.
    """
    # (flag, default, type) for every supported option.
    option_specs = (
        ('--net', 'RNN', str),
        ('--vocab_size', 10000, int),
        ('--embedding_size', 100, int),
        ('--num_classes', 2, int),
        ('--sentence_max_len', 64, int),
        ('--num_layers', 4, int),
        ('--lr', 1e-4, float),
        ('--batch_size', 32, int),
        ('--num_epochs', 256, int),
        ('--hidden_size', 32, int),
        ('--device', 'cpu', str),
    )
    parser = argparse.ArgumentParser(description="config")
    for flag, default_value, value_type in option_specs:
        parser.add_argument(flag, default=default_value, type=value_type)
    return parser.parse_args(args=[])

# Entry point: build the configuration, data loaders and model, then train.
config = parse_args()
# Resolve the device BEFORE the model is built: select_model hands
# config.device to the LSTM constructor, so it must already be the validated
# device (falling back to CPU when CUDA is unavailable), not the raw string.
config.device = torch.device(
    config.device
    if torch.cuda.is_available() and config.device != 'cpu'
    else 'cpu')
train_loader, test_loader = get_dataloader(config)
model = select_model(config)
model = model.to(config.device)
model = train(model, train_loader, test_loader,
              config.num_epochs, config.device, config.lr)

FileNotFoundError: [Errno 2] No such file or directory: './data/aclImdb/train/pos/'