# baseline

## 导入库

In [1]:
from sklearn import model_selection
import torch
from torch import optim
import time
import copy
import pandas as pd
import nltk
import numpy as np
from torch.utils import data
from torch import nn
from torch.nn import functional as f

## 参数初始化

In [2]:
# Switch between the local directory layout (True) and the Kaggle kernel layout (False).
flag = True
if flag:
    train_file = 'data/train.csv'
    embedding_file = 'data/glove.840B.300d.txt'
    test_file = 'data/test.csv'
else:
    train_file = '../input/train.csv'
    embedding_file = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    # Must point at the test split: the predictions and the submission qids are read from it.
    test_file = '../input/test.csv'

In [4]:
# Maximum number of tokens per question: longer training questions are dropped by
# deal_data, and every sequence is padded/truncated to this length by padding().
max_len = 100
# Mini-batch size shared by the train, validation and test DataLoaders.
batch_size = 256
# Number of passes over the training set.
epochs = 10

In [5]:
# Hyper-parameters of the bidirectional RNN baseline (read by Model_Rnn and Rnn).
config_model_rnn_1 = {
    # recurrent cell type: 'LSTM' or 'GRU'
    'mode': 'LSTM',
    # hidden units per direction (the encoder is bidirectional, so it outputs 2x this)
    'hidden_size': 150,
    # dropout applied in Model_Rnn around the fully connected layers
    'dropout_p': 0.2,
    # dropout applied to the encoder inputs (and between stacked layers when layer_num > 1)
    'encoder_dropout_p': 0.1,
    # number of stacked recurrent layers
    'encoder_layer_num': 1,
    # despite the name, this enables nn.LayerNorm on the encoder inputs
    'is_bn': False
}

In [6]:
# Active model configuration. This is an alias, not a copy: later writes to `param`
# (e.g. param['embedding']) also modify config_model_rnn_1.
param = config_model_rnn_1

## 数据预处理
 - 分词
 - train：长度筛选

In [7]:
def deal_data(data, max_len=100, is_train=True):
    """Read a question CSV and tokenize every question with NLTK.

    :param data: path of a CSV file with a 'question_text' column
                 (and a 'target' column when is_train is True)
    :param max_len: largest token count a training question may have to be kept
    :param is_train: when True, targets are read and over-long questions are dropped
    :return: (token lists, targets) when is_train is True, otherwise only the
             token lists of all questions (no length filtering)
    """
    frame = pd.read_csv(data)
    tokenized = [nltk.word_tokenize(text) for text in frame['question_text'].values]

    # Test data: every question is kept, whatever its length.
    if not is_train:
        return tokenized

    # Training data: keep only the (question, target) pairs within the length limit.
    labels = frame['target'].values
    kept = [(tokens, label) for tokens, label in zip(tokenized, labels) if len(tokens) <= max_len]
    question_os = [tokens for tokens, _ in kept]
    target_os = [label for _, label in kept]
    print('deal_data, retain data:%d/%d' % (len(question_os), len(tokenized)))
    return question_os, target_os

In [8]:
# Tokenize the training set and drop questions longer than max_len tokens.
train_questions, train_targets = deal_data(train_file, max_len=max_len)

deal_data, retain data:1306111/1306122


In [9]:
# Tokenize the test set; with is_train=False no question is filtered out.
test_questions = deal_data(test_file, max_len=max_len, is_train=False)

In [8]:
# Debug only: uncomment the line below to train on just the first 100,000 examples.
# train_questions, train_targets = train_questions[: 100000], train_targets[: 100000]

## 建立词表
 - glove

In [10]:
def build_word_embedding(questions, glove_file):
    """Build the vocabulary and the matching pretrained embedding matrix.

    Only corpus words that have a vector in ``glove_file`` receive their own index;
    every other word is expected to be mapped to '<unk>' by the caller.

    :param questions: iterable of token lists
    :param glove_file: path of a GloVe text file, one "word v1 v2 ... vd" entry per line
    :return: (w2i, embedding) where w2i maps word -> row index ('<pad>' is row 0,
             '<unk>' is the last row) and embedding is a float array of shape
             (len(w2i), d); the '<pad>' and '<unk>' rows are all zeros
    """
    reserved = ('<pad>', '<unk>')

    # All distinct tokens of the corpus.
    corpus_words = set()
    for question in questions:
        corpus_words.update(question)
    vocab_all_size = len(corpus_words)

    # Stream the GloVe file and keep only the vectors the corpus needs, instead of
    # holding every pretrained vector in memory. The file is split on a plain space
    # (not arbitrary whitespace) so that entries are parsed exactly as before.
    embedding_dict = {}
    embedding_dim = None
    with open(glove_file, 'r', encoding='utf-8') as glove:
        for line in glove:
            word, *nums = line.rstrip('\n').split(' ')
            if embedding_dim is None and nums:
                # vector width is taken from the first entry of the file
                embedding_dim = len(nums)
            if word in corpus_words:
                embedding_dict[word] = np.asarray(nums, dtype='float32')
    if embedding_dim is None:
        # empty file: fall back to the width of glove.840B.300d
        embedding_dim = 300
    vocab_size = len(embedding_dict)

    ratio = vocab_size / vocab_all_size if vocab_all_size else 0.0
    print('words in pre-embedding, num:%d/%d, radio:%.4f' % (vocab_size, vocab_all_size, ratio))

    # Build the vocabulary and the embedding matrix. Reserved tokens are excluded so a
    # literal '<pad>'/'<unk>' in the corpus cannot collide with the special rows, and the
    # words are sorted so the index assignment is reproducible across runs.
    known_words = sorted(word for word in embedding_dict if word not in reserved)
    w2i = {'<pad>': 0}
    embedding = np.zeros([len(known_words) + 2, embedding_dim])
    for index, word in enumerate(known_words, start=1):
        w2i[word] = index
        embedding[index] = embedding_dict[word]
    w2i['<unk>'] = len(known_words) + 1
    assert len(w2i) == len(embedding)

    print('build_word_embedding,  vocab size:%d' % len(w2i))

    return w2i, embedding

In [11]:
# Build the vocabulary and the GloVe matrix from the tokens of the train and test questions combined.
w2i, embedding = build_word_embedding(train_questions+test_questions, embedding_file)

KeyboardInterrupt: 

In [None]:
# Hand the pretrained matrix to the model through the shared config dict (read by Model_Rnn).
param['embedding'] = embedding

## index

In [None]:
def word2indexs(words, lang):
    """Convert token lists into lists of vocabulary indices.

    :param words: iterable of token lists
    :param lang: dict mapping word -> index; tokens missing from it are mapped
                 to lang['<unk>']
    :return: list of index lists, in the same order and with the same lengths
    """
    indexed = []
    for sentence in words:
        ids = []
        for token in sentence:
            if token in lang:
                ids.append(lang[token])
            else:
                # out-of-vocabulary token
                ids.append(lang['<unk>'])
        indexed.append(ids)
    return indexed

In [None]:
# Replace every training token by its vocabulary index (unknown tokens become '<unk>').
train_questions = word2indexs(train_questions, w2i)

## padding

In [None]:
def padding(words, max_len, pad_index=0):
    """Bring every index list to exactly max_len entries.

    :param words: iterable of index lists
    :param max_len: target length
    :param pad_index: value appended to lists that are too short
    :return: list of new lists; longer inputs are cut at max_len, shorter ones
             are right-padded with pad_index
    """
    results = []
    for sequence in words:
        missing = max_len - len(sequence)
        if missing < 0:
            results.append(sequence[: max_len])
        else:
            results.append(sequence + [pad_index] * missing)
    return results

In [None]:
# Pad (with index 0, i.e. '<pad>') or truncate every training question to max_len indices.
train_questions = padding(train_questions, max_len)

## 随机划分训练集、验证集

In [None]:
# Hold out 10% of the training data for validation; the fixed random_state makes the split repeatable.
train_questions, val_questions, train_targets, val_targets = model_selection.train_test_split(
        train_questions, train_targets, test_size=0.1, random_state=333)

In [None]:
# Sanity check: questions and targets must stay aligned after the split.
assert len(train_questions) == len(train_targets)
assert len(val_questions) == len(val_targets)
print('train size:%d, val size:%d' % (len(train_questions), len(val_questions)))

## 构建train、val dataloader

In [None]:
def get_dataloader(dataset, batch_size, shuffle, drop_last):
    """Wrap parallel integer sequences in a DataLoader.

    :param dataset: list of equally long sequences (e.g. [questions, targets]);
                    each one is converted to a LongTensor
    :param batch_size: number of samples per batch
    :param shuffle: whether the sample order is reshuffled on every pass
    :param drop_last: whether an incomplete final batch is discarded
    :return: DataLoader whose batches hold one tensor per entry of dataset
    """
    tensors = tuple(torch.LongTensor(column) for column in dataset)
    return data.DataLoader(
        dataset=data.TensorDataset(*tensors),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last
    )

In [None]:
# Training batches: reshuffled every epoch, incomplete last batch dropped.
train_loader = get_dataloader(
    dataset=[train_questions, train_targets],
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)

# Validation batches: fixed order, every sample kept.
val_loader = get_dataloader(
    dataset=[val_questions, val_targets],
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

## model

### embedding
 - 基础embedding
 - `<unk>` 可训练

In [None]:
class Embedding(nn.Module):
    """Frozen pretrained embedding plus a small trainable table.

    The pretrained matrix is kept fixed. A second, two-row table is added on top:
    its row 0 is a padding row and its row 1 is looked up only for the last index
    of the vocabulary.
    """
    def __init__(self, embedding):
        super().__init__()

        self.vocab_size, self.w2v_size = embedding.shape[0], embedding.shape[1]

        # pretrained vectors, excluded from training
        fixed = nn.Embedding(
            self.vocab_size,
            self.w2v_size,
            padding_idx=0,
            _weight=torch.Tensor(embedding)
        )
        fixed.weight.requires_grad = False
        self.embedding_fix = fixed

        # trainable table: row 0 = padding row, row 1 = vector for the last vocabulary index
        self.embedding_v = nn.Embedding(2, self.w2v_size, padding_idx=0)

        self.embedding_dim = self.embedding_fix.embedding_dim

    def forward(self, tensor):
        """
        :param tensor: (batch_size, c_len) word indices
        :return: (batch_size, c_len, w2v)
        """
        fixed_part = self.embedding_fix(tensor)

        # Shift the indices so that only the last vocabulary index maps to row 1 of the
        # trainable table; all smaller indices become <= 0 and are clipped to row 0.
        offset = self.vocab_size - self.embedding_v.num_embeddings
        trainable_ids = f.relu(tensor - offset)
        trainable_part = self.embedding_v(trainable_ids)

        return fixed_part + trainable_part

### encoder
 - LSTM、 GRU

In [None]:
class Rnn(nn.Module):
    """Bidirectional LSTM/GRU encoder for padded batches of variable-length sequences.

    Keys read from ``param``: 'mode' ('LSTM' or 'GRU'), 'input_size', 'hidden_size',
    'encoder_dropout_p', 'encoder_layer_num' and 'is_bn' (enables layer normalization
    of the inputs).
    """

    def __init__(self, param):
        """
        :param param: configuration dict, see the class docstring
        :raises ValueError: if param['mode'] is neither 'LSTM' nor 'GRU'
        """
        super(Rnn, self).__init__()

        self.mode = param['mode']
        self.input_size = param['input_size']
        self.hidden_size = param['hidden_size']
        self.dropout_p = param['encoder_dropout_p']
        self.directional = True
        self.layer_num = param['encoder_layer_num']
        self.is_bn = param['is_bn']

        # Fail at construction time instead of leaving self.rnn undefined.
        rnn_classes = {'LSTM': nn.LSTM, 'GRU': nn.GRU}
        if self.mode not in rnn_classes:
            raise ValueError("unsupported rnn mode: %r (expected 'LSTM' or 'GRU')" % (self.mode,))
        self.rnn = rnn_classes[self.mode](
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.layer_num,
            bidirectional=self.directional,
            # inter-layer dropout is only meaningful for stacked layers
            dropout=self.dropout_p if self.layer_num > 1 else 0
        )

        if self.is_bn:
            self.layer_norm = nn.LayerNorm(self.input_size)

        self.dropout = nn.Dropout(p=self.dropout_p)
        self.reset_parameters()

    def reset_parameters(self):
        """ xavier_uniform for input-hidden weights, orthogonal for hidden-hidden weights, zero biases """
        ih = (p for name, p in self.named_parameters() if 'weight_ih' in name)
        hh = (p for name, p in self.named_parameters() if 'weight_hh' in name)
        b = (p for name, p in self.named_parameters() if 'bias' in name)

        for t in ih:
            torch.nn.init.xavier_uniform_(t)
        for t in hh:
            torch.nn.init.orthogonal_(t)
        for t in b:
            torch.nn.init.constant_(t, 0)

    def forward(self, vec, mask):
        """
        :param vec: (seq_len, batch_size, input_size)
        :param mask: (batch_size, seq_len), non-zero at real (non-padding) positions
        :return: (max_len_in_batch, batch_size, hidden_size*2); the first dimension is the
                 longest real length in the batch and may be smaller than seq_len
        """

        # layer normalization over the feature dimension
        if self.is_bn:
            vec = self.layer_norm(vec)

        # dropout
        vec = self.dropout(vec)

        # Real length of every sequence. An all-padding row is treated as length 1
        # because pack_padded_sequence rejects zero-length sequences.
        lengths = mask.long().sum(1).clamp(min=1)

        # Sort by decreasing length for packing and remember how to undo the sort.
        length_sort, idx_sort = torch.sort(lengths, descending=True)
        _, idx_unsort = torch.sort(idx_sort)

        v_sort = vec.index_select(1, idx_sort)
        # pack_padded_sequence expects the lengths as a CPU tensor
        v_pack = nn.utils.rnn.pack_padded_sequence(v_sort, length_sort.cpu())
        outputs, _ = self.rnn(v_pack, None)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs.index_select(1, idx_unsort)

        # not re-padded to seq_len: the first dimension of outputs may be smaller than seq_len
        return outputs

### model：Bi-Rnn

In [None]:
class Model_Rnn(nn.Module):
    """Question classifier: embedding -> bidirectional RNN -> mean pooling -> two FC layers.

    Keys read from ``param``: 'hidden_size', 'dropout_p', 'embedding' (pretrained matrix)
    and the encoder settings consumed by Rnn.
    """
    def __init__(self, param):
        """
        :param param: configuration dict; it is not modified
        """
        super(Model_Rnn, self).__init__()

        self.hidden_size = param['hidden_size']
        self.dropout_p = param['dropout_p']

        # embedding
        self.embedding = Embedding(param['embedding'])

        # encoder: work on a copy so the caller's (shared) config dict is left untouched
        encoder_param = dict(param)
        encoder_param['input_size'] = self.embedding.embedding_dim
        self.encoder = Rnn(encoder_param)

        # outputs
        self.fc1 = nn.Sequential(
            nn.Linear(self.hidden_size*2, self.hidden_size),
            nn.ReLU()
        )

        self.fc2 = nn.Sequential(
            nn.Linear(self.hidden_size, 1),
            nn.Sigmoid()
        )

        # dropout
        self.dropout = nn.Dropout(param['dropout_p'])

    def forward(self, batch):
        """
        :param batch: sequence whose first element holds the word indices, (batch_size, seq_len)
        :return: (batch_size, 1) sigmoid outputs
        """

        questions = batch[0]

        # mask: True at real tokens, False at padding (index 0)
        question_mask = torch.ne(questions, 0)

        # embedding, switched to (seq_len, batch_size, w2v) for the encoder
        question_vec = self.embedding(questions)
        question_vec = question_vec.transpose(0, 1)

        # encoder (seq_len, batch_size, h*2)
        question_vec = self.encoder(question_vec, question_mask)

        # output: mean over the real time steps. The encoder returns zeros at padded
        # steps, so the sum over time only needs to be divided by the real length;
        # the clamp keeps an all-padding question from dividing by zero.
        question_vec = torch.sum(question_vec, dim=0)
        question_len = question_mask.long().sum(1).view(-1, 1).float().clamp(min=1)
        question_vec = question_vec / question_len  # (batch_size, h*2)

        question_vec = self.dropout(question_vec)
        output = self.fc1(question_vec)
        output = self.dropout(output)
        output = self.fc2(output)  # (batch_size, 1)

        return output

In [None]:
# Build the model and move it to the GPU (a CUDA device is required).
model = Model_Rnn(param)
model = model.cuda()
# Weights of the checkpoint with the lowest validation loss seen so far.
model_best_state = None
# Lowest validation loss so far; 999 acts as an "infinity" starting value.
loss_best = 999

### loss, optimizer

In [None]:
# Binary cross-entropy on probabilities; the model already ends in a Sigmoid.
criterion = torch.nn.BCELoss()

In [None]:
# Optimize only the trainable parameters; the frozen pretrained embedding is skipped.
optimizer_param = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adam(optimizer_param, lr=1e-4)

## train

In [None]:
# Count the trainable parameters only (frozen weights are ignored).
model_param_num = sum(p.nelement() for p in model.parameters() if p.requires_grad)
print('start training, param_num:%d' % model_param_num)

In [None]:
# Running totals over all epochs: the reported train_loss is the mean over every step so far.
train_loss = 0
train_c = 0
t_nums = len(train_questions) // batch_size
# Validate about ten times per epoch. The lower bound of 1 keeps the modulo below from
# dividing by zero when the training set has fewer than ten batches.
every_nums = max(1, t_nums // 10)
time0 = time.time()
for e in range(epochs):
    for i, batch in enumerate(train_loader):
        batch = [b.cuda() for b in batch]
        # switch back to training mode (validation below leaves the model in eval mode)
        model.train()
        optimizer.zero_grad()
        outputs = model(batch)
        loss_value = criterion(outputs, batch[1].view(-1, 1).float())
        loss_value.backward()
        optimizer.step()

        train_loss += loss_value.item()
        train_c += 1

        if train_c % every_nums == 0:
            # full pass over the validation set
            val_loss = 0
            val_c = 0
            correct_num = 0
            sum_num = 0
            with torch.no_grad():
                model.eval()
                for val_batch in val_loader:
                    val_batch = [b.cuda() for b in val_batch]
                    outputs = model(val_batch)
                    loss_value = criterion(outputs, val_batch[1].view(-1, 1).float())

                    # accuracy at the fixed 0.5 threshold
                    correct_num += ((outputs > 0.5).long() == val_batch[1].view(-1, 1)).sum().item()
                    sum_num += outputs.size(0)

                    val_loss += loss_value.item()
                    val_c += 1
            print('training, epochs:%2d, steps:%2d/%2d, train_loss:%.4f, val_loss:%.4f, accuracy:%.4f, time:%4ds' %
                      (e, (i+1), t_nums, train_loss/train_c, val_loss/val_c, correct_num/sum_num, time.time()-time0))

            # keep a copy of the weights with the lowest mean validation loss
            if loss_best > (val_loss/val_c):
                loss_best = val_loss / val_c
                model_best_state = copy.deepcopy(model.state_dict())
print('training, best_loss:%.4f' % loss_best)

### 阈值选择

In [None]:
# Rebuild the model from the best checkpoint and put it in inference mode.
model = Model_Rnn(param)
model = model.cuda()
model.load_state_dict(model_best_state)
model.eval()

# The model outputs do not depend on the threshold, so the validation set is scored
# once and every candidate threshold is evaluated on the cached outputs.
val_outputs = []
val_labels = []
with torch.no_grad():
    for val_batch in val_loader:
        val_batch = [b.cuda() for b in val_batch]
        val_outputs.append(model(val_batch))
        val_labels.append(val_batch[1].view(-1, 1))
val_outputs = torch.cat(val_outputs)
val_labels = torch.cat(val_labels)
sum_num = val_outputs.size(0)

# Pick the decision threshold with the highest validation accuracy.
scores = np.arange(0, 1, 0.05)
best_score = -1
best_accuracy = 0
for score in scores:
    correct_num = ((val_outputs > score).long() == val_labels).sum().item()
    print('score choosing, score:%.2f, accuracy:%.4f' % (score, correct_num/sum_num))
    if best_accuracy < correct_num / sum_num:
        best_score = score
        best_accuracy = correct_num / sum_num
print('valing, best_score:%.2f, best_accuracy:%.4f' % (best_score, best_accuracy))

## test预测

### test 数据处理

In [None]:
# Re-read the raw test file so this cell can be re-run on its own.
test_questions = deal_data(test_file, max_len=max_len, is_train=False)
print('test size:%d' % len(test_questions))
# The vocabulary built earlier already covers the test tokens (it was built from the
# train and test questions together), so it is reused here instead of parsing the
# whole GloVe file a second time. Test words keep the same pretrained vectors.
w2i_test, embedding_test = w2i, embedding
test_questions = word2indexs(test_questions, w2i_test)
test_questions = padding(test_questions, max_len)
# Fixed order and no dropped batch: predictions must line up with the rows of the test file.
test_loader = get_dataloader(
    dataset=[test_questions],
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

### 测试模型构建

In [None]:
# Restore the best checkpoint, then swap in the embedding matrix that matches the
# test vocabulary (the trainable table of the Embedding module is kept as loaded).
model = Model_Rnn(param)
model.load_state_dict(model_best_state)
model.embedding.embedding_fix = nn.Embedding(
    num_embeddings=embedding_test.shape[0],
    embedding_dim=embedding_test.shape[1],
    padding_idx=0,
    _weight=torch.Tensor(embedding_test)
)
# pretrained vectors stay frozen, as in the Embedding module
model.embedding.embedding_fix.weight.requires_grad = False
# the Embedding module derives the index of the trainable '<unk>' row from vocab_size
model.embedding.vocab_size = embedding_test.shape[0]
model = model.cuda()
# A freshly built module is in training mode; switch to inference mode so dropout
# is disabled while predicting.
model.eval()

### 结果生成

In [None]:
# Predict the test set batch by batch and binarize with the selected threshold.
result = []
with torch.no_grad():
    for test_batch in test_loader:
        test_batch = [b.cuda() for b in test_batch]
        outputs = (model(test_batch) > best_score).long()
        # flatten (batch_size, 1) to a plain list of 0/1 labels
        result.extend(outputs.view(-1).cpu().numpy().tolist())

### 文件输出

In [None]:
# Pair every test qid with its predicted label and write the submission file.
test_df = pd.read_csv(test_file)
submission = pd.DataFrame({'qid': test_df['qid'], 'prediction': result})
submission = submission[['qid', 'prediction']]
submission.to_csv('submission.csv', index=False)