# single1.1
    - 数据预处理

## 导入库

In [1]:
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import torch
from torch import optim
import time
import re
import copy
import pandas as pd
import nltk
import numpy as np
from torch.utils import data
from torch import nn
from torch.nn import functional as f
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
flag = True

In [3]:
time_start = time.time()

## 参数初始化

In [4]:
if flag:
    train_file = 'data/train.csv'
    embedding_file = 'data/glove.840B.300d.txt'
#     embedding_file = 'data/paragram_300_sl999.txt'
    test_file = 'data/test.csv'
else:
    train_file = '../input/train.csv'
    embedding_file = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
#     embedding_file = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    test_file = '../input/test.csv'

In [5]:
max_len = 72  # 有待进一步确定
max_features = 120000  # 有待进一步确定
batch_size = 512
test_batch_size = 2048

## 数据预处理
 - 分词
 - train：长度筛选

In [6]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

In [7]:
time0 = time.time()
df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

# lower
df_train['question_text'] = df_train['question_text'].str.lower()
df_test['question_text'] = df_test['question_text'].str.lower()

# add split 
df_train['question_text'] = df_train['question_text'].apply(lambda x: clean_text(x))
df_test['question_text'] = df_test['question_text'].apply(lambda x: clean_text(x))

# fill up missing values
train_questions = df_train['question_text'].fillna("_##_").values
test_questions = df_test['question_text'].fillna("_##_").values

# split word
train_questions = [q.split() for q in train_questions]
test_questions = [q.split() for q in test_questions]

train_targets = df_train['target'].values

print('train_len:%d' % (len(train_questions)))
print('test_len:%d' % (len(test_questions)))
print('time:%d' % (time.time() - time0))

train_len:1306122
test_len:56370
time:29


## 建立词表
 - glove

### 构建embedding

In [8]:
def build_word_embedding(questions, glove_file):

    # 初始化embedding字典
    def get_matrixs(word, *nums):
        return word, np.asarray(nums, dtype='float32')
    if glove_file in ['data/paragram_300_sl999.txt', '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt']:
        embedding_dict = dict(get_matrixs(*o.split(" ")) for o in open(glove_file, encoding="utf8", errors='ignore') if len(o)>100)
    else:
        embedding_dict = dict([get_matrixs(*o.split(' ')) for o in open(glove_file)])
    
    # 初始化词表
    vocab = {}
    for q in questions:
        for word in q:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1

    # 检查词表覆盖率、词表删选
    known_num = 0
    all_num = 0
    word_set = []
    for word in vocab.keys():
        if word in embedding_dict:
            known_num += vocab[word]
            word_set.append(word)
        all_num += vocab[word]
        
    print('words in pre-embedding, num:%d/%d, radio:%.4f' % (len(word_set), len(vocab), len(word_set)/len(vocab)))
    print('known words in all text:%.4f' % (known_num/all_num))


    # 构建词表、embedding矩阵
    w2i = {'<pad>': 0}
    count = 1
    embedding = np.zeros([len(word_set)+2, 300])
    for word in word_set:
        if word not in w2i:
            w2i[word] = count
            embedding[count] = embedding_dict[word]
            count += 1
    w2i['<unk>'] = count
    assert len(w2i) == len(embedding)

    print('build_word_embedding,  vocab size:%d' % len(w2i))

    return w2i, embedding

In [9]:
time0 = time.time()
w2i, embedding = build_word_embedding(train_questions+test_questions, embedding_file)
print('time:%d' % (time.time() - time0))

words in pre-embedding, num:126280/200487, radio:0.6299
known words in all text:0.9939
build_word_embedding,  vocab size:126282
time:89


## index

In [10]:
def word2indexs(words, lang):

    def word2index(word_list):
        return [lang[word] if word in lang else lang['<unk>'] for word in word_list]

    return [word2index(word_list) for word_list in words]

In [11]:
train_questions = word2indexs(train_questions, w2i)
test_questions = word2indexs(test_questions, w2i)

## padding

In [12]:
def padding(words, max_len, pad_index=0):

    def padd(word_list):
        if len(word_list) > max_len:
            tmp = word_list[: max_len]
        else:
            tmp = word_list + [pad_index] * (max_len - len(word_list))
        return tmp

    results = [padd(word_list) for word_list in words]
    return results

In [13]:
train_questions = padding(train_questions, max_len)
test_questions = padding(test_questions, max_len)

## numpy, split

In [14]:
train_questions = np.array(train_questions)
train_targets = np.array(train_targets)
test_questions = np.array(test_questions)

splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=333).split(train_questions, train_targets))

## 构建train、val dataloader

In [15]:
def get_dataloader(dataset, batch_size, shuffle, drop_last):
    dataset = [torch.LongTensor(d) for d in dataset]
    dataset = data.TensorDataset(*dataset)
    data_iter = data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last
    )
    return data_iter

## model building

### embedding
 - 基础embedding
 - <unk> 可训练

In [16]:
class Embedding(nn.Module):
    """ standard embedding """
    def __init__(self, embedding):
        super(Embedding, self).__init__()
        self.vocab_size = embedding.shape[0]
        self.w2v_size = embedding.shape[1]
        self.embedding_fix = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.w2v_size,
            padding_idx=0,
            _weight=torch.Tensor(embedding)
        )
        self.embedding_fix.weight.requires_grad = False
        self.embedding_v = nn.Embedding(
            num_embeddings=2,
            embedding_dim=self.w2v_size,
            padding_idx=0
        )
        self.embedding_dim = self.embedding_fix.embedding_dim
    def forward(self, tensor):
        """
        :param tensor: (batch_size, c_len)
        :return: (batch_size, c_len, w2v)
        """
        embedding_1 = self.embedding_fix(tensor)
        tensor = tensor - (self.vocab_size - self.embedding_v.num_embeddings)
        tensor = f.relu(tensor)
        embedding_2 = self.embedding_v(tensor)
        embedding = embedding_1 + embedding_2
        return embedding

### encoder
 - LSTM、 GRU

In [17]:
class Rnn(nn.Module):
    def __init__(self, param):
        super(Rnn, self).__init__()
        self.mode = param['mode']
        self.input_size = param['input_size']
        self.hidden_size = param['hidden_size']
        self.dropout_p = param['dropout_p']
        self.directional = True
        self.layer_num = param['encoder_layer_num']
        self.is_bn = param['is_bn']
        if self.mode == 'LSTM':
            self.rnn = nn.LSTM(
                input_size=self.input_size,
                hidden_size=self.hidden_size,
                num_layers=self.layer_num,
                bidirectional=self.directional,
                dropout=self.dropout_p if self.layer_num > 1 else 0
            )
        elif self.mode == 'GRU':
            self.rnn = nn.GRU(
                input_size=self.input_size,
                hidden_size=self.hidden_size,
                num_layers=self.layer_num,
                bidirectional=self.directional,
                dropout=self.dropout_p if self.layer_num > 1 else 0
            )
        if self.is_bn:
            self.layer_norm = nn.LayerNorm(self.input_size)
        self.dropout = nn.Dropout(p=self.dropout_p)
        self.reset_parameters()

    def reset_parameters(self):
        """ use xavier_uniform to initialize rnn weights """
        ih = (param for name, param in self.named_parameters() if 'weight_ih' in name)
        hh = (param for name, param in self.named_parameters() if 'weight_hh' in name)
        b = (param for name, param in self.named_parameters() if 'bias' in name)
        for t in ih:
            torch.nn.init.xavier_uniform_(t)
        for t in hh:
            torch.nn.init.orthogonal_(t)
        for t in b:
            torch.nn.init.constant_(t, 0)
            
    def forward(self, vec, mask):
        """
        :param vec: (seq_len, batch_size, input_size)
        :param mask: (batch_size, seq_len)
        :return: (seq_len, batch_size, hidden_size*directional_num)
        """
        # layer normalization
        if self.is_bn:
            seq_len, batch_size, input_size = vec.size
            vec = vec.contiguous().view(-1, input_size)
            vec = self.layer_norm(vec)
            vec = vec.view(seq_len, batch_size, input_size)

        # forward
        lengths = mask.long().sum(1)
        length_sort, idx_sort = torch.sort(lengths, descending=True)
        _, idx_unsort = torch.sort(idx_sort)

        v_sort = vec.index_select(1, idx_sort)
        v_pack = nn.utils.rnn.pack_padded_sequence(v_sort, length_sort)
        outputs, _ = self.rnn(v_pack, None)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs.index_select(1, idx_unsort)
        
        # 未填充， outputs的第一维可能小于seq_len
        return outputs

### self-attn

In [18]:
class SelfAttn(nn.Module):
    def __init__(self, input_size):
        super(SelfAttn, self).__init__()

        self.wq = nn.Linear(input_size, input_size//2)
        self.v = nn.Linear(input_size//2, 1)

    def forward(self, question_vec, question_mask):
        """
        :param question_vec: (seq_len, batch_size, input_size)
        :param question_mask: (batch_size, seq_len)
        :return: (batch_size, input_size)
        """
        wq = self.wq(question_vec)
        wq = torch.tanh(wq)
        s = self.v(wq).squeeze(2).transpose(0, 1)  # (batch_size, seq_len)

        mask = question_mask.eq(0)
        s.masked_fill_(mask, -float('inf'))
        s = f.softmax(s, dim=1)

        result = torch.bmm(s.unsqueeze(1), question_vec.transpose(0, 1)).squeeze(1)

        return result

## Model

In [19]:
class Model_xy(nn.Module):
    """ rnn """
    def __init__(self, param):
        super(Model_xy, self).__init__()
        
        # embedding
        self.embedding = Embedding(param['embedding'])
        
        # lstm
        param['input_size'] = self.embedding.embedding_dim
        param['mode'] = 'LSTM'
        self.lstm = Rnn(param)
        
        # gru
        param['mode'] = 'GRU'
        self.gru = Rnn(param)
        
        # attn
        self.lstm_attn = SelfAttn(param['hidden_size']*2)
        self.gru_attn = SelfAttn(param['hidden_size']*2)
        
        # outputs
        self.fc1 = nn.Linear(param['hidden_size']*12, param['hidden_size'])
        self.fc2 = nn.Linear(param['hidden_size'], 1)
        
        # dropout
        self.dropout = nn.Dropout(param['dropout_p'])
        self.dropout_emb = nn.Dropout(param['dropout_emb_p'])
        
        # init
        self.reset_parameters()
    
    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.fc1.weight)
        torch.nn.init.xavier_uniform_(self.fc2.weight)
        torch.nn.init.constant_(self.fc1.bias, 0.0)
        torch.nn.init.constant_(self.fc2.bias, 0.0) 

    def forward(self, batch):

        questions = batch[0]
        
        # mask
        def get_mask(tensor): return torch.ne(tensor, 0)
        question_mask = get_mask(questions)
        mask_len = question_mask.long().sum(1).view(-1, 1).float()
        
        # embedding
        question_vec = self.embedding(questions)
        question_vec = question_vec.transpose(0, 1)
        question_vec = self.dropout_emb(question_vec)
        
        # lstm
        lstm_vec = self.lstm(question_vec, question_mask)
        
        # lstm:avg
        lstm_avg = torch.sum(lstm_vec, dim=0)
        lstm_avg = lstm_avg / mask_len  # (batch_size, h*2)
        
        # lstm:max
        lstm_max = torch.max(lstm_vec, dim=0)[0]
        
        # lstm:attn
        lstm_attn = self.lstm_attn(lstm_vec, question_mask[:, :lstm_vec.size(0)])
        
        # gru
        gru_vec = self.gru(question_vec, question_mask)
        
        # gru:avg
        gru_avg = torch.sum(gru_vec, dim=0)
        gru_avg = gru_avg / mask_len
        
        # gru: max
        gru_max = torch.max(gru_vec, dim=0)[0]
        
        # gru:attn
        gru_attn = self.gru_attn(gru_vec, question_mask[:, :gru_vec.size(0)])
        
        vec = torch.cat([lstm_avg, gru_avg, lstm_max, gru_max, lstm_attn, gru_attn], dim=1)
        
        # output: 
        output = f.relu(self.fc1(vec))
        output = self.dropout(output)
        output = torch.sigmoid(self.fc2(output))  # (batch_size, 1)

        return output

## Training

In [20]:
def train(param):
    if param['name'] == 'model_xy':
        model = Model_xy(param)
    model = model.cuda()
    
    # train/val loader
    i = param['data_i']
    train_x = train_questions[splits[i][0]]
    train_y = train_targets[splits[i][0]]
    train_loader = get_dataloader(
        dataset=[train_x, train_y],
        batch_size=batch_size,
        shuffle=True,
        drop_last=False
    )
    
    val_x = train_questions[splits[i][1]]
    val_y = train_targets[splits[i][1]]
    val_loader = get_dataloader(
        dataset=[val_x, val_y],
        batch_size=test_batch_size,
        shuffle=False,
        drop_last=False
    )
    
    criterion = torch.nn.BCELoss()
    optimizer_param = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(optimizer_param, lr=param['lr'], weight_decay=param['l2_decay'])
    lr = param['lr']
    
    model_param_num = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            model_param_num += parameter.nelement()
    print('%s, param_num:%d' % (param['name'], model_param_num))
    
    # train
    model_best_state = None
    train_loss = 0
    train_c = 0
    t_nums = len(train_loader)
    every_nums = t_nums // param['every_print']
    time0 = time.time()
    loss_val_last = 99999.0
    loss_best = 999
    accuracy_best = 0
    e_best = 0
    
    for e in range(param['epoch']):
        train_loss = 0
        train_c = 0
        
#         if e == 0:
#             lr = param['lr']
#         elif e == 1:
#             lr = param['lr'] 
#         elif e == 2:
#             lr = param['lr'] 
            
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
                    
        for i, batch in enumerate(train_loader):
            batch = [b.cuda() for b in batch]
            model.train()
            optimizer.zero_grad()
            outputs = model(batch)
            loss_value = criterion(outputs, batch[1].view(-1, 1).float())
            loss_value.backward()
            optimizer.step()

            train_loss += loss_value.item()
            train_c += 1
            

#             if train_c % every_nums == 0 and (param['need_print'] or (param['need_print'] is False and e==param['epoch']-1)):
#             if ((i % every_nums == 0 and i != 0) or (i+1 == t_nums)) and e+1 == param['epoch']:    
            if (i % every_nums == 0 and i != 0) or (i+1 == t_nums):
                val_loss = 0
                val_c = 0
                correct_num = 0
                sum_num = 0
                with torch.no_grad():
                    model.eval()
                    for val_batch in val_loader:
                        val_batch = [b.cuda() for b in val_batch]
                        outputs = model(val_batch)
                        loss_value = criterion(outputs, val_batch[1].view(-1, 1).float())

                        correct_num += ((outputs > 0.5).long() == val_batch[1].view(-1, 1)).sum().item()
                        sum_num += outputs.size(0)

                        val_loss += loss_value.item()
                        val_c += 1
                print('training, epochs:%2d, steps:%2d/%2d, train_loss:%.4f, val_loss:%.4f, accuracy:%.4f, lr:%.4f, time:%4ds' %
                        (e, (i+1), t_nums, train_loss/train_c, val_loss/val_c, correct_num/sum_num, lr, time.time()-time0))

                train_loss = 0
                train_c = 0
                
#                 if loss_val_last < val_loss / val_c:
#                     lr = lr * 0.5
#                     if lr < 0.001:
#                         lr = 0.001

                if loss_best > (val_loss / val_c):
                    accuracy_best = correct_num/sum_num
                    loss_best = val_loss / val_c
                    e_best = e
                    model_best_state = copy.deepcopy(model.state_dict())

                loss_val_last = val_loss / val_c
                

    print('training, best_eposh:%d, best_loss:%.4f, best_accuracy:%.4f' % (e_best, loss_best, accuracy_best))

    model.load_state_dict(model_best_state)
    model.eval()  
    
    # eval
    if True:
        scores = np.arange(0.1, 0.501, 0.01)
        accuracy = []
        y_pred = []
        y_true = []
        with torch.no_grad():
            for val_batch in val_loader:
                val_batch = [b.cuda() for b in val_batch]
                outputs = model(val_batch)
                outputs = outputs.view(-1).cpu().numpy().tolist()
                y_pred += outputs
                y_true += val_batch[1].view(-1).cpu().numpy().tolist()
        
        y_pred = np.array(y_pred)
        
        for score in scores:
            y_pred_tmp = (y_pred > score).astype(int).tolist()
            acc_tmp = metrics.f1_score(y_true, y_pred_tmp)
            accuracy.append(acc_tmp)
            if False:
                print('score choosing, score:%.2f, accuracy:%.4f' % (score, acc_tmp))
        accuracy = np.array(accuracy)
        best_index = np.argmax(accuracy)
        best_score = scores[best_index]
        best_f1 = accuracy[best_index]
        print('valing, best_score:%.2f, best_accuracy:%.4f' % (best_score, best_f1))
    
    return model, best_f1

### train

In [None]:
model_group = []
model_f1 = []

for i in range(len(splits)):
    time0 = time.time()
    config_model_i = {
        'data_i':i,
        'epoch':5,
        'name':'model_xy',
        'hidden_size':100,
        'dropout_emb_p':0.1,
        'dropout_p':0.2,
        'embedding':embedding,
        'encoder_layer_num':1,
        'is_bn':False,
        'l2_decay':0,
        'need_print':True,
        'lr':1e-3,
        'every_print':5       
    }
    print('start %d model training...' % (i+1))
    model_i, f1_i = train(config_model_i)
    model_group.append(model_i)
    model_f1.append(f1_i)
    print('%d model training finish, time:%d\n' % (i+1, time.time()-time0))   
print(model_f1)
print('result_f1:%.4f' % (sum(model_f1)/len(model_f1)))

start 1 model training...
model_xy, param_num:724003
training, epochs: 0, steps:409/2041, train_loss:0.1311, val_loss:0.1146, accuracy:0.9533, lr:0.0010, time:  39s
training, epochs: 0, steps:817/2041, train_loss:0.1157, val_loss:0.1074, accuracy:0.9573, lr:0.0010, time:  77s
training, epochs: 0, steps:1225/2041, train_loss:0.1102, val_loss:0.1134, accuracy:0.9556, lr:0.0010, time: 115s
training, epochs: 0, steps:1633/2041, train_loss:0.1064, val_loss:0.1030, accuracy:0.9589, lr:0.0010, time: 154s
training, epochs: 0, steps:2041/2041, train_loss:0.1064, val_loss:0.1011, accuracy:0.9594, lr:0.0010, time: 192s
training, epochs: 1, steps:409/2041, train_loss:0.1007, val_loss:0.1006, accuracy:0.9599, lr:0.0010, time: 230s
training, epochs: 1, steps:817/2041, train_loss:0.1000, val_loss:0.0995, accuracy:0.9600, lr:0.0010, time: 268s
training, epochs: 1, steps:1225/2041, train_loss:0.1010, val_loss:0.0992, accuracy:0.9604, lr:0.0010, time: 306s
training, epochs: 1, steps:1633/2041, train_los

## test 

In [None]:
# 均值
def ensemble_mean(model_result):
    y_pred = np.zeros(shape=[len(model_result[0])])
    for r in model_result:
        y_pred += np.array(r)
    y_pred = y_pred / len(model_result)
    return y_pred

In [None]:
test_loader = get_dataloader(
    dataset=[test_questions],
    batch_size=test_batch_size,
    shuffle=False,
    drop_last=False
)

In [None]:
time0 = time.time()
model_result = [[] for _ in range(len(model_group))]
with torch.no_grad():
    for test_batch in test_loader:
        test_batch = [b.cuda() for b in test_batch]

        for index in range(len(model_group)):
            outputs = model_group[index](test_batch)
            outputs = outputs.view(-1).cpu().numpy().tolist()
            model_result[index] += outputs

    # 集成策略：均值
    print('jiaquan mean,', end='')
    y_pred = ensemble_mean(model_result)
    y_pred = (y_pred > 0.34 ).astype(int).tolist()
    result = y_pred

print('test, ensemble, time:%d' % (time.time()-time0))

In [None]:
test_df = pd.read_csv(test_file)
submission = pd.DataFrame(
    {'qid': test_df['qid'], 'prediction': result},
    columns=['qid', 'prediction']
)
submission.to_csv('submission.csv', index=False)

In [None]:
print('time:%d' % (time.time()-time_start))