改进，添加无标签数据


1. 基础代码在进行word2vec时，使用的是train_x和text_x的语料数据，下面根据train_x+train_x_no_label+test_x的语料数据来建立词典，得到新的词嵌入向量。

修改:

```
if __name__ == "__main__":
    print("loading traning data...")
    x_train, y_train = load_training_data("training_label.txt")
    print("loading testing data...")
    x_test = load_testing_data("testing_data.txt")
    wv_model = train_word2vec(x_train + x_test)
    print("saving model...")
    wv_model.save("w2v.model")
```

为:

```
if __name__ == "__main__":
    print("loading traning data...")
    x_train, y_train = load_training_data("training_label.txt")
    print("loading training nolabel data...")
    x_train_no_label = load_training_data("training_nolabel.txt")
    print("loading testing data...")
    x_test = load_testing_data("testing_data.txt")
    wv_model = train_word2vec(x_train + x_train_nolabel + x_test)
    print("saving model...")
    wv_model.save("w2v_all.model")
    
```

2. 增加epoches次数

# util

In [1]:
# 加载训练集方法
def load_training_data(path="traning_label.txt"):
    """
    输出每句话的分词（按英文空格分）列表 和 标签列表
    """
    # 带label的训练集
    if "training_label" in path:
        with open(path, 'r') as f:
            lines = f.readlines()
            lines= [line.strip('\n').split(' ') for line in lines]
        x = [line[2:] for line in lines]
        y = [line[0] for line in lines]
        return x, y
    # 不带label的训练集    
    else:
        with open(path, 'r') as f:
            lines = f.readlines()
            x = [line.strip('\n').split(' ') for line in lines]
        return x
    
# 加载测试集方法
def load_testing_data(path="testing_data.txt"):
    with open(path, 'r') as f:
        lines = f.readlines()
        X = [','.join(line.strip('\n').split(',')[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ') for sen in X]
    return X

# 评估方法
def evaluation(outputs, labels):
    """
    outputs 和 labels是tensor类型。
    sigmoid激活后输出，所以outputs的范围是(0,1)。
    """
    outputs[outputs>=0.5] = 1
    outputs[outputs<0.5] = 0
    correct = torch.sum(torch.eq(outputs,labels)).item()
    return correct

# word2vec

In [3]:
from gensim.models import word2vec
def train_word2vec(x):
    model = word2vec.Word2Vec(x, vector_size=250, min_count=5, window=5, workers=12, epochs=10, sg=1)  # sg=1 为Skip-Gram, sg=0(默认) 为CBOW。
    return model
if __name__ == "__main__":
    print("loading traning data...")
    x_train, y_train = load_training_data("training_label.txt")
    print("loading training nolabel data...")
    x_train_no_label = load_training_data("training_nolabel.txt")
    print("loading testing data...")
    x_test = load_testing_data("testing_data.txt")
    wv_model = train_word2vec(x_train + x_train_no_label + x_test)
    print("saving model...")
    wv_model.save("w2v_all.model")

loading traning data...
loading training nolabel data...
loading testing data...
saving model...



# data prepropress

In [4]:
from torch import nn
from gensim.models import Word2Vec
class Prepropress:
    def __init__(self, sentences, sen_len, w2v_path="w2v.model"):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
    
    def get_w2v_model(self):
        self.embedding = Word2Vec.load(self.w2v_path)  # 加载词向量预训练模型
        self.embedding_size = self.embedding.vector_size
        
    def add_embedding(self, word):
        # 将<PAD> 和 <UNK> 添加到embedding_matrix, 为这两个word赋予一个随机的向量
        vector = torch.empty(1, self.embedding_size)
        nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
#         self.embedding_matrix.append(vector)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], dim=0)
        
    def make_embedding(self, load=True):
        if load:
            self.get_w2v_model()
        else:
            raise NotImplementedError
#         for i, word in enumerate(self.embedding.wv.key_to_index.key()):
#             self.word2idx[word] = len(self.word2idx)
#             self.idx2word.append(word)
#             self.embedding_matrix.append(self.embedding[word])
        self.word2idx = self.embedding.wv.key_to_index
        self.idx2word = self.embedding.wv.index_to_key
        self.embedding_matrix = self.embedding.wv.vectors
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        # 添加<pad>和<unk>
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        return self.embedding_matrix
    
    def pad_sentence(self, sentence):
        if len(sentence) < self.sen_len:
            pad_len = self.sen_len - len(sentence)
#             print(f'pad_len:{pad_len}')
            sentence += [self.word2idx['<PAD>']] * pad_len
#             print(f'sentence: {sentence}')
        else:
            sentence = sentence[:self.sen_len]
#         print(f'len(sentence):{len(sentence)}')
#         print(f'self.sen_len: {self.sen_len}')
        assert len(sentence) == self.sen_len
        return sentence
    
    def sentence_word2idx(self):
        # 把句子中的单词转换为index
        sentence_list = []
        for i, sentence in enumerate(self.sentences):
            sentence_i = []
            for word in sentence:
#                 if word in self.word2idx:
#                     word_vector = self.embedding_matrix(self.word2idx[word])
#                 else:
#                     word_vector = self.embedding_matrix(self.word2idx['<UNK>'])
                if word in self.word2idx:
                    sentence_i.append(self.word2idx[word])
                else:
                    sentence_i.append(self.word2idx['<UNK>'])
#                 sentence_i.append(word_vector)
            sentence_i = self.pad_sentence(sentence_i)
            sentence_list.append(sentence_i)
        return torch.LongTensor(sentence_list)
    
    def labels_to_tensor(self, y):
        y = [int(label) for label in y]
        return torch.LongTensor(y)

# dataset

In [6]:
from torch.utils import data
class MyDataset(data.Dataset):
    """
    将数据包装为Dataset类，然后传入DataLoader，使用DataLoader这个类对数据操作
    """
    def __init__(self, X, y):
        self.data = X
        self.label = y
        
    def __getitem__(self, index):
        if self.label is not None:
            return self.data[index], self.label[index]
        else:
            return self.data[index]
        
    def __len__(self):
        return len(self.data)

# Model

In [7]:
import torch
from torch import nn
class LSTM_net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, n_layer, dropout, fix_embedding):
        super(LSTM_net, self).__init__()
        # embedding layer
        self.embedding = nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = nn.Parameter(embedding)  # 将一个变量转换为可训练的参数，并将这个参数绑定到module的net.parameter()中
        self.embedding.weight.reqiure_grad = False if fix_embedding else True  # 是否将embedding fix住，如果fix_embedding为False，那么在训练过程中embedding也会跟着训练
        # 疑问：embedding.size(1)和embedding_dim不一样吗？
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.n_layer = n_layer
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layer, batch_first=True)
        self.classifer = nn.Sequential(nn.Dropout(dropout),
                                      nn.Linear(hidden_dim, 1),
                                      nn.Sigmoid())
        
        
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # 取用lstm最优一层的hidden state
        x = x[:, -1, :]
        x = self.classifer(x)
        return x

# Train

In [10]:
import torch
from torch.nn import BCELoss
# from torch.nn.function import binary_cross_entropy
import torch.optim as optim 
    
def train(train, val, epoches, model, lr, batch_size, device):
    """
    训练集train, 验证集val，轮数epoches，模型model，学习率lr, 批大小batch_size
    """
    # 损失函数
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("total parameters: {} ; trainable parameters: {}".format(total, trainable))
    criterion = BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    t_batches = len(train)
    v_batches = len(val)
    best_acc = 0
    for epoch in range(epoches):
        model.train()
        total_loss, total_acc = 0, 0
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            # inputs: (batch_size, sen_len)
            # inputs 中的值是word2id 的id值
#             print(f'training  inputs: {inputs}')
#             print(f'training inputs.shape: {inputs.shape}')
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze()  # 去掉张量内元素的外层的所有中括号
            loss = criterion(outputs, labels)  # 计算损失
            loss.backward()  # 反向传播梯度
            optimizer.step()
            
            acc = evaluation(outputs, labels)
            # end = '\r' 滚动输出到屏幕
            print("[Epoch_{} {}/{}], loss: {:.3f}, acc: {:.3f}".format(
                epoch, i, t_batches, loss.item(), acc/batch_size*100), end="\r")

            total_loss += loss.item()
            total_acc += acc / batch_size
        print("[Epoch_{}], total_loss: {:.3f}, total_acc: {:.3f}".format(
            epoch, total_loss / t_batches, total_acc / t_batches * 100))

        model.eval()
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(val):
                inputs = inputs.to(device, dtype=torch.long)
                labels  = labels.to(device, dtype=torch.float)
                outputs = model(inputs)
                outputs = outputs.squeeze()
                loss = criterion(outputs, labels)
                acc = evaluation(outputs, labels)
                total_loss += loss.item()
                total_acc += acc / batch_size
            print("Valid:[Epoch_{}], total_loss: {:.3f}, total_acc: {:.3f}".format(
                epoch, total_loss / v_batches, total_acc / v_batches * 100))
            if total_acc > best_acc:
#                 print("best_acc: {:.3f}".format(best_acc))
                best_acc = total_acc
                torch.save(model, 'ckpt_rnn1.model')
                print(f'saving model with acc {total_acc/v_batches*100 :.3f}')
        print("---------------------------------------------")

# Test

In [9]:
import torch
def test(test_x, model, batch_size, device):
    model.eval()
    res_list = []
    with torch.no_grad():
        for i, inputs in enumerate(test_x):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs >= 0.5] = 1
            outputs[outputs < 0.5] = 0
            res_list += outputs.int().tolist()
    return res_list  

# main

In [12]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 设置运算的设备

# - 运行word2vec模块，由训练集和测试集训练生成w2v.model预训练模型。
# - 处理预训练的词向量，生成word2idx, idx2word, embedding_matrix。处理数据集，将句子处理成word2idx的格式，每个句子一个列表，并转为张量；将label处理成列表并转为张量。
train_x, train_y = load_training_data('training_label.txt')
train_x_no_label = load_training_data('training_nolabel.txt')
test_x = load_testing_data('testing_data.txt')

sen_len = 20
batch_size = 128
epoches=15
lr = 0.001
w2v_path = "w2v_all.model"

print(f'preprocess data ... ...')
preprocess = Prepropress(train_x, sen_len=sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
train_y = preprocess.labels_to_tensor(train_y)

print(f'dataset ... ...')
# x_train, y_train, x_val, y_val = train_x[:180000], y[:180000], train_x[180000:], y[180000:]
X_train, X_val, y_train, y_val = train_test_split(
    train_x, train_y, test_size=0.1, random_state=1, stratify=train_y)
# 将数据集包装成dataset类
train_dataset = MyDataset(X_train, y_train)
val_dataset = MyDataset(X_val, y_val)

print(f'dataloader ... ...')
# 将数据集分批成dataloader类
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

print(f'traning ... ...')
model = LSTM_net(embedding=embedding, embedding_dim=250, hidden_dim=150, n_layer=1, dropout=0.5, fix_embedding=True)
model = model.to(device)
train(train=train_loader, val=val_loader, epoches=epoches, model=model, lr=lr, batch_size=batch_size, device=device)

preprocess data ... ...
dataset ... ...
dataloader ... ...
traning ... ...
total parameters: 14186101 ; trainable parameters: 14186101
[Epoch_0], total_loss: 0.478, total_acc: 76.548
Valid:[Epoch_0], total_loss: 0.429, total_acc: 79.837
saving model with acc 79.837
---------------------------------------------
[Epoch_1], total_loss: 0.394, total_acc: 82.038
Valid:[Epoch_1], total_loss: 0.425, total_acc: 79.757
---------------------------------------------
[Epoch_2], total_loss: 0.338, total_acc: 85.066
Valid:[Epoch_2], total_loss: 0.460, total_acc: 79.145
---------------------------------------------
[Epoch_3], total_loss: 0.278, total_acc: 88.053
Valid:[Epoch_3], total_loss: 0.542, total_acc: 78.583
---------------------------------------------
[Epoch_4], total_loss: 0.224, total_acc: 90.491
Valid:[Epoch_4], total_loss: 0.591, total_acc: 77.941
---------------------------------------------
[Epoch_5], total_loss: 0.182, total_acc: 92.381
Valid:[Epoch_5], total_loss: 0.724, total_acc: 7

# predict and write to csv file

In [13]:
import pandas as pd 

print(f'loading testing data ... ...')
test_x = load_testing_data('testing_data.txt')
preprocess = Prepropress(test_x, sen_len=50)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = MyDataset(test_x, None)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=128, 
                                          shuffle=False, num_workers=8)

print(f'loading model ... ...')
model = torch.load('ckpt_rnn1.model')
outputs = test(test_loader, model, batch_size=batch_size, device=device)

# 保存为csv
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))], "label": outputs})
print(f'saving csv ... ...')
tmp.to_csv('predict_rnn1.csv', index=False)
print(f'finish predicting!')

loading testing data ... ...
loading model ... ...
saving csv ... ...
finish predicting!


# 结果
[Epoch_0], total_loss: 0.478, total_acc: 76.548

Valid:[Epoch_0], total_loss: 0.429, total_acc: 79.837