In [1]:
from collections import Counter
from contextlib import contextmanager
import copy
from functools import partial
from itertools import chain
from multiprocessing import Pool
import os
import random
import re
import string
import time
import warnings

In [2]:
import joblib
import numpy as np
import pandas as pd

from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.utils import shuffle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
from torch.utils.data import Dataset, Sampler, DataLoader
from torch.optim.optimizer import Optimizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [43]:
# Input file locations, all relative to the notebook's working directory.
# Pre-trained word vectors in text form (presumably the file handed to
# load_embedding -- the call site is not in this part of the notebook).
PRETRAINED_PATH = "./input/crawl-300d-2M.vec"
# Training and test tables, read by load_process().
TRAIN_PATH = "./input/train.csv"
TEST_PATH = "./input/test.csv"
# Sample submission file; not referenced by any cell visible here.
SUBMISSION_PATH = "./input/sample_submission.csv"

In [112]:
# Text / vocabulary settings.
# embed_size: number of columns in the matrix built by load_embedding().
# max_len:    maximum number of tokens kept per text by tokenize().
# max_words:  default vocabulary size for build_vocab(); also bounds the
#             number of rows allocated by load_embedding().
embed_size = 300
max_len = 200
max_words = 10000

# Training-loop settings. None of these are used by the cells visible here;
# presumably they are consumed by the training code further down -- confirm.
batch_size = 512
train_epoch = 3
n_split = 5

# NOTE(review): the meaning of `miu` and `update_per_epoch` cannot be
# determined from this part of the notebook -- document them at their use site.
miu = 0.9
update_per_epoch = 10

# `seed` is passed to np.random.seed() in load_process() before shuffling.
seed = 201912
# All tensors are placed on the CPU.
device = torch.device('cpu')

In [5]:
# Shared NLTK stemmer instances. load_embedding() uses them, in this order
# (Porter, Lancaster, Snowball), as last-resort lookups for words that have no
# pre-trained vector under their surface or re-cased form.
ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer('english')

In [11]:
@contextmanager
def timer(msg):
    """Context manager that reports how long the wrapped block took.

    Prints ``'<msg> start.'`` on entry and ``'<msg> done in <minutes> min.'``
    once the block finishes normally. If the block raises, the closing line
    is not printed and the exception propagates.

    Args:
        msg: Label used in both printed lines.
    """
    started_at = time.time()
    print('%s start.'%msg)
    yield
    # Wall-clock seconds since entry, reported in minutes.
    minutes = (time.time() - started_at) / 60
    print('%s done in %s min.'%(msg, minutes))

In [14]:
# Contraction -> expansion table used by replace_typical_mis(). Keys are
# matched against text that load_process() has already lower-cased.
misspell_dict = {"aren't": "are not", "can't": "cannot"}

In [96]:
def replace_typical_mis(text, mapping=None):
    """Replace every occurrence of a known contraction/misspelling in ``text``.

    Args:
        text: The string to clean.
        mapping: Optional ``{wrong: right}`` dictionary. Defaults to the
            module-level ``misspell_dict``.

    Returns:
        ``text`` with each key of ``mapping`` replaced by its value. The input
        is returned unchanged when ``mapping`` is empty.
    """
    if mapping is None:
        mapping = misspell_dict
    if not mapping:
        # An empty alternation would compile to "()", which matches the empty
        # string everywhere and then fails the dictionary lookup.
        return text

    # Longest keys first so that a key which is a prefix of another key
    # (e.g. "can't" vs "can't've") cannot shadow the longer one.
    keys = sorted(mapping, key=len, reverse=True)
    # Keys are literal strings, so escape regex metacharacters; otherwise a key
    # such as "a.b" could match text that is not a key and raise KeyError.
    mis_re = re.compile("(%s)" % "|".join(re.escape(key) for key in keys))
    return mis_re.sub(lambda match: mapping[match.group(0)], text)

In [31]:
# Characters that add_space() surrounds with spaces so they become separate
# whitespace-delimited tokens.
# NOTE(review): besides punctuation and symbols the list contains a few accented
# letters (e.g. 'à', 'é', 'ï') -- presumably mojibake residue; confirm that
# splitting them out of words is intended.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^',
          '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
          '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
          '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼',
          '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

In [59]:
# Add a space before and after each punctuation character.
def add_space(x):
    """Pad every punctuation character in ``x`` with a space on each side.

    The characters come from the module-level ``puncts`` list followed by
    ``string.punctuation``. A character that appears in both is processed
    twice, so it ends up with two spaces on each side.

    Args:
        x: Value to process; converted with ``str()`` first.

    Returns:
        The padded string.
    """
    padded = str(x)
    for mark in chain(puncts, string.punctuation):
        # str.replace returns the string unchanged when `mark` is absent,
        # so no membership test is needed.
        padded = padded.replace(mark, ' ' + mark + ' ')
    return padded

In [60]:
def clean_numbers(x):
    """Replace each run of consecutive digits in ``x`` with a single space.

    Args:
        x: Input string.

    Returns:
        A copy of ``x`` with every digit run replaced by ``' '``.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', x)

In [41]:
def load_embedding(embedding_path, word_index):
    """Build an embedding matrix for ``word_index`` from a text vector file.

    Each line of the file is expected to be ``<token> <v1> <v2> ...`` separated
    by single spaces. An optional two-field first line (the ``<count> <dim>``
    header found in ``.vec`` files) is skipped.

    For every word the lookup tries, in order: the word itself, lower-case,
    upper-case, capitalised, then the Porter, Lancaster and Snowball stems
    (module-level ``ps``, ``lc``, ``sb``). The first hit is used; words with
    no hit keep an all-zero row.

    Args:
        embedding_path: Path to the pre-trained vector file.
        word_index: Mapping ``{word: row index}``.

    Returns:
        ``np.ndarray`` of shape ``(nb_words, embed_size)`` where
        ``nb_words = min(2 + max_words, len(word_index))``. Words whose index
        falls outside that range are skipped.
    """
    embedding_dict = {}
    # The context manager closes the file even if parsing fails part-way.
    with open(embedding_path, encoding='utf-8') as handle:
        for line_no, line in enumerate(handle):
            fields = line.strip().split(' ')
            if line_no == 0 and len(fields) == 2:
                # "<vocab size> <dimension>" header, not a word vector.
                continue
            # Parse once into float32 instead of keeping lists of strings.
            embedding_dict[fields[0]] = np.asarray(fields[1:], dtype=np.float32)

    nb_words = min(2 + max_words, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_size))

    # Candidate spellings, cheapest first. They are evaluated lazily, so the
    # stemmers only run for words that miss every earlier lookup.
    variants = (
        lambda w: w,
        lambda w: w.lower(),
        lambda w: w.upper(),
        lambda w: w.capitalize(),
        lambda w: ps.stem(w),
        lambda w: lc.stem(w),
        lambda w: sb.stem(w),
    )

    for word, index in word_index.items():
        if not 0 <= index < nb_words:
            # No row exists for this index; writing it would raise IndexError
            # (or wrap around to the wrong row for a negative index).
            continue
        for variant in variants:
            embedding_vec = embedding_dict.get(variant(word))
            if embedding_vec is not None:
                embedding_matrix[index] = embedding_vec
                break

    # Return only after every word has been processed.
    return embedding_matrix

In [109]:
def _preprocess_text(text):
    """Lower-case one string and run the cleaning steps in their fixed order."""
    return clean_numbers(add_space(replace_typical_mis(text.lower()))).strip()


def load_process():
    """Load the train/test CSVs, clean the text columns and shuffle the train rows.

    The text columns ``question_title``, ``question_body`` and ``answer`` are
    cast to ``str`` and passed through lower-casing, ``replace_typical_mis``,
    ``add_space``, ``clean_numbers`` and ``strip``.

    Target columns are selected by position: columns ``11:-9`` are treated as
    question targets and the last 9 as answer targets.
    NOTE(review): these offsets depend on the column order of train.csv --
    confirm against the file.

    Side effects:
        Seeds NumPy's global RNG with the module-level ``seed``.

    Returns:
        Tuple ``(train_x, train_question_y, train_answer_y)`` of DataFrames,
        all in the same shuffled row order. The original index labels are kept.
    """
    train = pd.read_csv(TRAIN_PATH)
    test = pd.read_csv(TEST_PATH)

    # Apply the same cleaning to both frames.
    # NOTE(review): `test` is cleaned but never returned -- confirm whether the
    # caller needs it.
    process_cols = ['question_title', 'question_body', 'answer']
    for frame in (train, test):
        for col in process_cols:
            frame[col] = frame[col].astype(str).apply(_preprocess_text)

    # The original author notes the data has no NA values.
    question_target_cols = train.columns[11:-9]
    answer_target_cols = train.columns[-9:]

    np.random.seed(seed)
    train_index = np.random.permutation(len(train))

    # `.iloc` selects rows by position. Plain `frame[int_array]` is column
    # selection and raises KeyError because the column labels are strings.
    train_x = train[process_cols].iloc[train_index]
    train_question_y = train[question_target_cols].iloc[train_index]
    train_answer_y = train[answer_target_cols].iloc[train_index]

    return train_x, train_question_y, train_answer_y

In [111]:
def build_vocab(texts, max_features = max_words):
    """Build token<->id lookup tables from whitespace-tokenised texts.

    Ids are assigned as follows:
        * ``'<PAD>'``   -> 0
        * the ``max_features`` most frequent tokens -> 1, 2, ... in
          descending frequency order
        * ``'<UNK>'``   -> ``max_features + 1``
        * ``'<START>'`` -> ``max_features + 2``

    Args:
        texts: Iterable of strings; each is split on whitespace.
        max_features: Maximum number of distinct tokens to keep.

    Returns:
        Dict with keys ``'token2id'`` and ``'id2token'`` (the inverse map).
    """
    frequencies = Counter()
    for text in texts:
        frequencies.update(text.split())

    # Special tokens go in first; frequent tokens are numbered from 1.
    token2id = {'<PAD>': 0, '<UNK>': max_features + 1, '<START>': max_features + 2}
    ranked = frequencies.most_common(max_features)
    for rank, (token, _count) in enumerate(ranked, start=1):
        token2id[token] = rank

    id2token = {token_id: token for token, token_id in token2id.items()}
    return {'token2id': token2id, 'id2token': id2token}

In [114]:
def tokenize(texts, vocab, max_length=None):
    """Convert texts to lists of token ids, truncated to a maximum length.

    Args:
        texts: Iterable of strings; each is split on whitespace.
        vocab: Dict with a ``'token2id'`` mapping, as built by ``build_vocab``.
        max_length: Maximum number of tokens kept per text. Defaults to the
            module-level ``max_len``.

    Returns:
        One list of ids per text. Tokens missing from the vocabulary map to
        the ``'<UNK>'`` id, or to ``None`` if the vocabulary has no ``'<UNK>'``
        entry. Sequences are not padded.
    """
    limit = max_len if max_length is None else max_length
    token2id = vocab['token2id']
    # Fall back to the <UNK> id so unknown tokens do not become None.
    unk_id = token2id.get('<UNK>')
    return [
        [token2id.get(token, unk_id) for token in text.split()[:limit]]
        for text in texts
    ]