# baseline1.3

## 导入库

In [1]:
from sklearn import model_selection
from sklearn import metrics
import torch
from torch import optim
import time
import re
import copy
import pandas as pd
import nltk
import numpy as np
from torch.utils import data
from torch import nn
from torch.nn import functional as f

## 参数初始化

In [2]:
is_ensemble = False

In [3]:
flag = True
if flag:
    train_file = 'data/train.csv'
    embedding_file = 'data/glove.840B.300d.txt'
    test_file = 'data/test.csv'
else:
    train_file = '../input/train.csv'
    embedding_file = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    test_file = '../input/test.csv'

In [4]:
max_len = 100
batch_size = 256
test_batch_size = 1024
epochs = 1
LR = 5e-3
lr_decay = 0.5

In [5]:
config_model_rnn_1 = {
    'mode': 'LSTM',
    'hidden_size': 150,
    'dropout_p': 0.2,
    'encoder_dropout_p': 0.1,
    'encoder_layer_num': 1,
    'is_bn': False
}

In [6]:
config_model_rnn_2 = {
    'mode': 'GRU',
    'hidden_size': 150,
    'dropout_p': 0.2,
    'encoder_dropout_p': 0.1,
    'encoder_layer_num': 1,
    'is_bn': False
}

## 数据预处理
 - 分词
 - train：长度筛选

In [7]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
specials_d = ["’", "‘", "´", "`"]
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
specials_c = {'\u200b': ' ','…': ' ... ','\ufeff': '','करना': '','है': ''}
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

In [8]:
def pre_data(sentence_list):

    result = []
    for i in sentence_list:
        i = re.sub(r'\s+', ' ', i)

        # 缩写词替换
        for s in specials_d:
            i = i.replace(s, "'")
        for word in contraction_mapping.keys():
            i = i.replace(word, contraction_mapping[word])

        # 错误词替换
        for s in specials_c:
            i = i.replace(s, specials_c[s])
        for word in mispell_dict.keys():
            i = i.replace(word, mispell_dict[word])

        i = re.sub(r'\s+', ' ', i)
        i = nltk.word_tokenize(i)

        result.append(i)

    return result

In [9]:
def deal_data(data, max_len=100, is_train=True):
    df = pd.read_csv(data)
    questions = df['question_text'].values
    question_word_lists = pre_data(questions)
    question_word_list_len = [len(q) for q in question_word_lists]
    if is_train:
        target = df['target'].values
        question_os = []
        target_os = []
        for q, t, l in zip(question_word_lists, target, question_word_list_len):
            if l <= max_len:
                question_os.append(q)
                target_os.append(t)
        print('deal_data, retain data:%d/%d' % (len(question_os), len(questions)))
        return question_os, target_os

    else:
        question_os = question_word_lists
        return question_os

In [10]:
train_questions, train_targets = deal_data(train_file, max_len=max_len)
print('train_len:%d' % (len(train_questions)))
test_questions = deal_data(test_file, max_len=max_len, is_train=False)
print('test_len:%d' % (len(test_questions)))

deal_data, retain data:1306111/1306122
train_len:1306111
test_len:56370


In [11]:
# fake
# train_questions, train_targets = train_questions[: 1000], train_targets[: 1000]

## 建立词表
 - glove

In [12]:
def build_word_embedding(questions, glove_file):

    # 初始化embedding字典
    def get_matrixs(word, *nums):
        return word, np.asarray(nums, dtype='float32')
    embedding_dict = dict([get_matrixs(*o.split(' ')) for o in open(glove_file, 'r')])

    # 初始化词表
    word_set = set()
    for q in questions:
        for word in q:
            word_set.add(word)
    vocab_all_size = len(word_set)

    # 词表删选
    word_set = set()
    for q in questions:
        for word in q:
            if word in embedding_dict:
                word_set.add(word)
    vocab_size = len(word_set)

    print('words in pre-embedding, num:%d/%d, radio:%.4f' % (vocab_size, vocab_all_size, vocab_size/vocab_all_size))

    # 构建词表、embedding矩阵
    w2i = {'<pad>': 0}
    count = 1
    embedding = np.zeros([len(word_set)+2, 300])
    for word in word_set:
        if word not in w2i:
            w2i[word] = count
            embedding[count] = embedding_dict[word]
            count += 1
    w2i['<unk>'] = count
    assert len(w2i) == len(embedding)

    print('build_word_embedding,  vocab size:%d' % len(w2i))

    return w2i, embedding

In [13]:
def build_word_embedding(questions, glove_file):

    # 初始化embedding字典
    def get_matrixs(word, *nums):
        return word, np.asarray(nums, dtype='float32')
    embedding_dict = dict([get_matrixs(*o.split(' ')) for o in open(glove_file, 'r')])

    # 初始化词表
    vocab = {}
    for q in questions:
        for word in q:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1

    # 检查覆盖率、词表删选
    known_num = 0
    all_num = 0
    word_set = []
    for word in vocab.keys():
        if word in embedding_dict:
            known_num += vocab[word]
            word_set.append(word)
        all_num += vocab[word]

    print('words in pre-embedding, num:%d/%d, radio:%.4f' % (len(word_set), len(vocab), len(word_set)/len(vocab)))
    print('known words in all text:%.4f' % (known_num/all_num))

    # 构建词表、embedding矩阵
    w2i = {'<pad>': 0}
    count = 1
    embedding = np.zeros([len(word_set)+2, 300])
    for word in word_set:
        if word not in w2i:
            w2i[word] = count
            embedding[count] = embedding_dict[word]
            count += 1
    w2i['<unk>'] = count
    assert len(w2i) == len(embedding)

    print('build_word_embedding,  vocab size:%d' % len(w2i))

    return w2i, embedding

In [14]:
w2i, embedding = build_word_embedding(train_questions+test_questions, embedding_file)

words in pre-embedding, num:200419/313313, radio:0.6397
known words in all text:0.9923
build_word_embedding,  vocab size:200421


## index

In [15]:
def word2indexs(words, lang):

    def word2index(word_list):
        return [lang[word] if word in lang else lang['<unk>'] for word in word_list]

    return [word2index(word_list) for word_list in words]

In [16]:
train_questions = word2indexs(train_questions, w2i)
test_questions = word2indexs(test_questions, w2i)

## padding

In [17]:
def padding(words, max_len, pad_index=0):

    def padd(word_list):
        if len(word_list) > max_len:
            tmp = word_list[: max_len]
        else:
            tmp = word_list + [pad_index] * (max_len - len(word_list))
        return tmp

    results = [padd(word_list) for word_list in words]
    return results

In [18]:
train_questions = padding(train_questions, max_len)
test_questions = padding(test_questions, max_len)

## 随机划分训练集、验证集

In [19]:
train_questions, val_questions, train_targets, val_targets = model_selection.train_test_split(
        train_questions, train_targets, test_size=0.1, random_state=333)
assert len(train_questions) == len(train_targets)
assert len(val_questions) == len(val_targets)
print('train_len:%d, val_len:%d' % (len(train_questions), len(val_questions)))

train_len:1175499, val_len:130612


## 构建train、val dataloader

In [20]:
def get_dataloader(dataset, batch_size, shuffle, drop_last):
    dataset = [torch.LongTensor(d) for d in dataset]
    dataset = data.TensorDataset(*dataset)
    data_iter = data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last
    )
    return data_iter

In [21]:
train_loader = get_dataloader(
    dataset=[train_questions, train_targets],
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)

val_loader = get_dataloader(
    dataset=[val_questions, val_targets],
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

test_loader = get_dataloader(
    dataset=[test_questions],
    batch_size=test_batch_size,
    shuffle=False,
    drop_last=False
)

## model

### embedding
 - 基础embedding
 - <unk> 可训练

In [22]:
class Embedding(nn.Module):
    """ standard embedding """
    def __init__(self, embedding):
        super(Embedding, self).__init__()

        self.vocab_size = embedding.shape[0]
        self.w2v_size = embedding.shape[1]

        self.embedding_fix = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.w2v_size,
            padding_idx=0,
            _weight=torch.Tensor(embedding)
        )
        self.embedding_fix.weight.requires_grad = False

        self.embedding_v = nn.Embedding(
            num_embeddings=2,
            embedding_dim=self.w2v_size,
            padding_idx=0
        )

        self.embedding_dim = self.embedding_fix.embedding_dim

    def forward(self, tensor):
        """
        :param tensor: (batch_size, c_len)
        :return: (batch_size, c_len, w2v)
        """
        embedding_1 = self.embedding_fix(tensor)

        tensor = tensor - (self.vocab_size - self.embedding_v.num_embeddings)
        tensor = f.relu(tensor)
        embedding_2 = self.embedding_v(tensor)

        embedding = embedding_1 + embedding_2

        return embedding

### encoder
 - LSTM、 GRU

In [23]:
class Rnn(nn.Module):

    def __init__(self, param):
        super(Rnn, self).__init__()

        self.mode = param['mode']
        self.input_size = param['input_size']
        self.hidden_size = param['hidden_size']
        self.dropout_p = param['encoder_dropout_p']
        self.directional = True
        self.layer_num = param['encoder_layer_num']
        self.is_bn = param['is_bn']

        if self.mode == 'LSTM':
            self.rnn = nn.LSTM(
                input_size=self.input_size,
                hidden_size=self.hidden_size,
                num_layers=self.layer_num,
                bidirectional=self.directional,
                dropout=self.dropout_p if self.layer_num > 1 else 0
            )
        elif self.mode == 'GRU':
            self.rnn = nn.GRU(
                input_size=self.input_size,
                hidden_size=self.hidden_size,
                num_layers=self.layer_num,
                bidirectional=self.directional,
                dropout=self.dropout_p if self.layer_num > 1 else 0
            )

        if self.is_bn:
            self.layer_norm = nn.LayerNorm(self.input_size)

        self.dropout = nn.Dropout(p=self.dropout_p)
        self.reset_parameters()

    def reset_parameters(self):
        """ use xavier_uniform to initialize rnn weights """
        ih = (param for name, param in self.named_parameters() if 'weight_ih' in name)
        hh = (param for name, param in self.named_parameters() if 'weight_hh' in name)
        b = (param for name, param in self.named_parameters() if 'bias' in name)

        for t in ih:
            torch.nn.init.xavier_uniform_(t)
        for t in hh:
            torch.nn.init.orthogonal_(t)
        for t in b:
            torch.nn.init.constant_(t, 0)

    def forward(self, vec, mask):
        """
        :param vec: (seq_len, batch_size, input_size)
        :param mask: (batch_size, seq_len)
        :return: (seq_len, batch_size, hidden_size*directional_num)
        """

        # layer normalization
        if self.is_bn:
            seq_len, batch_size, input_size = vec.size
            vec = vec.contiguous().view(-1, input_size)
            vec = self.layer_norm(vec)
            vec = vec.view(seq_len, batch_size, input_size)

        # dropout
        vec = self.dropout(vec)

        # forward
        lengths = mask.long().sum(1)
        length_sort, idx_sort = torch.sort(lengths, descending=True)
        _, idx_unsort = torch.sort(idx_sort)

        v_sort = vec.index_select(1, idx_sort)
        v_pack = nn.utils.rnn.pack_padded_sequence(v_sort, length_sort)
        outputs, _ = self.rnn(v_pack, None)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs.index_select(1, idx_unsort)

        # 未填充， outputs的第一维可能小于seq_len
        return outputs

### model：Bi-Rnn

In [24]:
class Model_Rnn(nn.Module):
    """ rnn """
    def __init__(self, param):
        super(Model_Rnn, self).__init__()

        self.hidden_size = param['hidden_size']
        self.dropout_p = param['dropout_p']

        # embedding
        self.embedding = Embedding(param['embedding'])

        # encoder
        param['input_size'] = self.embedding.embedding_dim
        self.encoder = Rnn(param)

        # outputs
        self.fc1 = nn.Sequential(
            nn.Linear(self.hidden_size*2, self.hidden_size),
            nn.ReLU()
        )

        self.fc2 = nn.Sequential(
            nn.Linear(self.hidden_size, 1),
            nn.Sigmoid()
        )

        # dropout
        self.dropout = nn.Dropout(param['dropout_p'])

    def forward(self, batch):

        questions = batch[0]

        # mask
        def get_mask(tensor): return torch.ne(tensor, 0)
        question_mask = get_mask(questions)

        # embedding
        question_vec = self.embedding(questions)
        question_vec = question_vec.transpose(0, 1)

        # encoder (seq_len, batch_size, h*2)
        question_vec = self.encoder(question_vec, question_mask)

        # output
        question_vec = torch.sum(question_vec, dim=0)
        question_mask = question_mask.long().sum(1)
        question_mask = question_mask.view(-1, 1).float()
        question_vec = question_vec / question_mask  # (batch_size, h*2)

        question_vec = self.dropout(question_vec)
        output = self.fc1(question_vec)
        output = self.dropout(output)
        output = self.fc2(output)  # (batch_size, 1)

        return output

## Model: Bi-LSTM

In [25]:
param = config_model_rnn_1
param['embedding'] = embedding

model_lstm = Model_Rnn(param)
model_lstm = model_lstm.cuda()
model_best_state = None
loss_best = 999
accuracy_best = 0
lr = LR

criterion = torch.nn.BCELoss()
optimizer_param = filter(lambda p: p.requires_grad, model_lstm.parameters())
optimizer = optim.Adam(optimizer_param, lr=1e-4)

## train

In [26]:
model_param_num = 0
for parameter in model_lstm.parameters():
    if parameter.requires_grad:
        model_param_num += parameter.nelement()
print('start training, param_num:%d' % model_param_num)

start training, param_num:588301


In [27]:
train_loss = 0
train_c = 0
t_nums = len(train_questions) // batch_size
every_nums = t_nums // 10
time0 = time.time()
loss_val_last = 99999.0
for e in range(epochs):
    for i, batch in enumerate(train_loader):
        batch = [b.cuda() for b in batch]
        model_lstm.train()
        optimizer.zero_grad()
        outputs = model_lstm(batch)
        loss_value = criterion(outputs, batch[1].view(-1, 1).float())
        loss_value.backward()
        optimizer.step()

        train_loss += loss_value.item()
        train_c += 1

        if train_c % every_nums == 0:
            val_loss = 0
            val_c = 0
            correct_num = 0
            sum_num = 0
            with torch.no_grad():
                model_lstm.eval()
                for val_batch in val_loader:
                    val_batch = [b.cuda() for b in val_batch]
                    outputs = model_lstm(val_batch)
                    loss_value = criterion(outputs, val_batch[1].view(-1, 1).float())

                    correct_num += ((outputs > 0.5).long() == val_batch[1].view(-1, 1)).sum().item()
                    sum_num += outputs.size(0)

                    val_loss += loss_value.item()
                    val_c += 1
            print('training, epochs:%2d, steps:%2d/%2d, train_loss:%.4f, val_loss:%.4f, accuracy:%.4f, lr:%.4f, time:%4ds' %
                      (e, (i+1), t_nums, train_loss/train_c, val_loss/val_c, correct_num/sum_num, lr, time.time()-time0))

            train_loss = 0
            train_c = 0

            if loss_best > (val_loss / val_c):
                accuracy_best = correct_num/sum_num
                loss_best = val_loss / val_c
                model_best_state = copy.deepcopy(model_lstm.state_dict())

            # 动态调整lr
            if loss_val_last < (val_loss / val_c) and lr >= 1e-4:
                lr = lr * lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

            loss_val_last = val_loss / val_c

print('training, best_loss:%.4f, best_accuracy:%.4f' % (loss_best, accuracy_best))

model_lstm.load_state_dict(model_best_state)
model_lstm.eval()

training, epochs: 0, steps:459/4591, train_loss:0.2241, val_loss:0.1398, accuracy:0.9465, lr:0.0050, time:  15s
training, epochs: 0, steps:918/4591, train_loss:0.1329, val_loss:0.1279, accuracy:0.9495, lr:0.0050, time:  32s
training, epochs: 0, steps:1377/4591, train_loss:0.1271, val_loss:0.1236, accuracy:0.9510, lr:0.0050, time:  47s
training, epochs: 0, steps:1836/4591, train_loss:0.1258, val_loss:0.1212, accuracy:0.9517, lr:0.0050, time:  63s
training, epochs: 0, steps:2295/4591, train_loss:0.1223, val_loss:0.1228, accuracy:0.9503, lr:0.0050, time:  78s
training, epochs: 0, steps:2754/4591, train_loss:0.1256, val_loss:0.1185, accuracy:0.9541, lr:0.0025, time:  94s
training, epochs: 0, steps:3213/4591, train_loss:0.1186, val_loss:0.1118, accuracy:0.9552, lr:0.0025, time: 110s
training, epochs: 0, steps:3672/4591, train_loss:0.1140, val_loss:0.1113, accuracy:0.9566, lr:0.0025, time: 127s
training, epochs: 0, steps:4131/4591, train_loss:0.1137, val_loss:0.1066, accuracy:0.9584, lr:0.00

Model_Rnn(
  (embedding): Embedding(
    (embedding_fix): Embedding(200421, 300, padding_idx=0)
    (embedding_v): Embedding(2, 300, padding_idx=0)
  )
  (encoder): Rnn(
    (rnn): LSTM(300, 150, bidirectional=True)
    (dropout): Dropout(p=0.1)
  )
  (fc1): Sequential(
    (0): Linear(in_features=300, out_features=150, bias=True)
    (1): ReLU()
  )
  (fc2): Sequential(
    (0): Linear(in_features=150, out_features=1, bias=True)
    (1): Sigmoid()
  )
  (dropout): Dropout(p=0.2)
)

## Model: Bi-gru

In [28]:
if is_ensemble:
    param = config_model_rnn_2
    param['embedding'] = embedding

    model_gru = Model_Rnn(param)
    model_gru = model_gru.cuda()
    model_best_state = None
    loss_best = 999
    accuracy_best = 0
    lr = LR

    criterion = torch.nn.BCELoss()
    optimizer_param = filter(lambda p: p.requires_grad, model_gru.parameters())
    optimizer = optim.Adam(optimizer_param, lr=1e-4)

### train

In [29]:
if is_ensemble:
    model_param_num = 0
    for parameter in model_gru.parameters():
        if parameter.requires_grad:
            model_param_num += parameter.nelement()
    print('start training, param_num:%d' % model_param_num)

In [30]:
if is_ensemble:
    train_loss = 0
    train_c = 0
    t_nums = len(train_questions) // batch_size
    every_nums = t_nums // 10
    time0 = time.time()
    loss_val_last = 99999.0
    for e in range(epochs):
        for i, batch in enumerate(train_loader):
            batch = [b.cuda() for b in batch]
            model_gru.train()
            optimizer.zero_grad()
            outputs = model_gru(batch)
            loss_value = criterion(outputs, batch[1].view(-1, 1).float())
            loss_value.backward()
            optimizer.step()

            train_loss += loss_value.item()
            train_c += 1

            if train_c % every_nums == 0:
                val_loss = 0
                val_c = 0
                correct_num = 0
                sum_num = 0
                with torch.no_grad():
                    model_gru.eval()
                    for val_batch in val_loader:
                        val_batch = [b.cuda() for b in val_batch]
                        outputs = model_gru(val_batch)
                        loss_value = criterion(outputs, val_batch[1].view(-1, 1).float())

                        correct_num += ((outputs > 0.5).long() == val_batch[1].view(-1, 1)).sum().item()
                        sum_num += outputs.size(0)

                        val_loss += loss_value.item()
                        val_c += 1
                print('training, epochs:%2d, steps:%2d/%2d, train_loss:%.4f, val_loss:%.4f, accuracy:%.4f, lr:%.4f, time:%4ds' %
                          (e, (i+1), t_nums, train_loss/train_c, val_loss/val_c, correct_num/sum_num, lr, time.time()-time0))

                train_loss = 0
                train_c = 0

                if loss_best > (val_loss / val_c):
                    accuracy_best = correct_num/sum_num
                    loss_best = val_loss / val_c
                    model_best_state = copy.deepcopy(model_gru.state_dict())

                # 动态调整lr
                if loss_val_last < (val_loss / val_c) and lr >= 1e-4:
                    lr = lr * lr_decay
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

                loss_val_last = val_loss / val_c

    print('training, best_loss:%.4f， best_accuracy:%.4f' % (loss_best, correct_num/sum_num))

    model_gru.load_state_dict(model_best_state)
    print(model_gru.eval())

### 集成策略

In [31]:
# 均值
def ensemble_mean(model_result):
    y_pred = np.zeros(shape=[len(model_result[0])])
    for r in model_result:
        y_pred += np.array(r)
    y_pred = y_pred / len(model_result)
    return y_pred

### 阈值选择

In [32]:
val_loader = get_dataloader(
    dataset=[val_questions, val_targets],
    batch_size=test_batch_size,
    shuffle=False,
    drop_last=False
)

In [33]:
if is_ensemble is False:
    scores = np.arange(0.1, 0.501, 0.01)
    best_score = -1
    best_accuracy = 0
    for score in scores:
        y_true_tmp = []
        y_pred_tmp = []
        with torch.no_grad():
            for val_batch in val_loader:
                val_batch = [b.cuda() for b in val_batch]
                outputs = model_lstm(val_batch)
                y_pred_tmp += (outputs > score).long().view(-1).cpu().numpy().tolist()
                y_true_tmp += val_batch[1].view(-1).cpu().numpy().tolist()
        acc_tmp = metrics.f1_score(y_true_tmp, y_pred_tmp)
        print('score choosing, score:%.2f, accuracy:%.4f' % (score, acc_tmp))
        if best_accuracy < acc_tmp:
            best_score = score
            best_accuracy = acc_tmp
    print('valing, best_score:%.2f, best_accuracy:%.4f' % (best_score, best_accuracy))

score choosing, score:0.10, accuracy:0.5566
score choosing, score:0.11, accuracy:0.5643
score choosing, score:0.12, accuracy:0.5719
score choosing, score:0.13, accuracy:0.5790
score choosing, score:0.14, accuracy:0.5859
score choosing, score:0.15, accuracy:0.5918
score choosing, score:0.16, accuracy:0.5975
score choosing, score:0.17, accuracy:0.6026
score choosing, score:0.18, accuracy:0.6076
score choosing, score:0.19, accuracy:0.6109
score choosing, score:0.20, accuracy:0.6148
score choosing, score:0.21, accuracy:0.6203
score choosing, score:0.22, accuracy:0.6238
score choosing, score:0.23, accuracy:0.6270
score choosing, score:0.24, accuracy:0.6295
score choosing, score:0.25, accuracy:0.6319
score choosing, score:0.26, accuracy:0.6342
score choosing, score:0.27, accuracy:0.6358
score choosing, score:0.28, accuracy:0.6381
score choosing, score:0.29, accuracy:0.6403
score choosing, score:0.30, accuracy:0.6426
score choosing, score:0.31, accuracy:0.6445
score choosing, score:0.32, accu

In [34]:
if is_ensemble:
    model_group = [model_lstm, model_gru]
    scores = np.arange(0.1, 0.501, 0.01)
    best_score = -1
    best_accuracy = 0
    for score in scores:
        model_result = [[] for _ in range(len(model_group))]
        y_true = []
        with torch.no_grad():
            for val_batch in val_loader:
                val_batch = [b.cuda() for b in val_batch]

                for index in range(len(model_group)):
                    outputs = model_group[index](val_batch)
                    outputs = outputs.view(-1).cpu().numpy().tolist()
                    model_result[index] += outputs

                y_true += val_batch[1].view(-1).cpu().numpy().tolist()

        # 集成策略：均值
        y_pred = ensemble_mean(model_result)

        y_pred = (y_pred > score).astype(int).tolist()
        acc_tmp = metrics.f1_score(y_true, y_pred)
        print('score choosing, score:%.2f, accuracy:%.4f' % (score, acc_tmp))
        if best_accuracy < acc_tmp:
            best_score = score
            best_accuracy = acc_tmp
    print('valing, best_score:%.2f, best_accuracy:%.4f' % (best_score, best_accuracy))

## test预测

### test 数据处理

### 测试模型构建

In [35]:
model = model_lstm

### 结果生成

In [36]:
if is_ensemble is False:
    result = []
    with torch.no_grad():
        for test_batch in test_loader:
            test_batch = [b.cuda() for b in test_batch]
            outputs = model(test_batch)
            outputs = (outputs > best_score).long()
            result += outputs.view(-1).cpu().numpy().tolist()

In [37]:
if is_ensemble:
    result = []
    model_result = [[] for _ in range(len(model_group))]
    with torch.no_grad():
        for test_batch in test_loader:
            test_batch = [b.cuda() for b in test_batch]

            for index in range(len(model_group)):
                outputs = model_group[index](test_batch)
                outputs = outputs.view(-1).cpu().numpy().tolist()
                model_result[index] += outputs

        # 集成策略：均值
        y_pred = ensemble_mean(model_result)

        y_pred = (y_pred > best_score).astype(int).tolist()
        result = y_pred

### 文件输出

In [38]:
test_df = pd.read_csv(test_file)
submission = pd.DataFrame(
    {'qid': test_df['qid'], 'prediction': result},
    columns=['qid', 'prediction']
)
submission.to_csv('submission.csv', index=False)