# Preface

This kernel is a fork of [this](https://www.kaggle.com/bminixhofer/simple-lstm-pytorch-version) kernel, adapted to run on fast.ai. <br>
It uses the weighted BCE loss described in [this](https://www.kaggle.com/tanreinama/simple-lstm-using-identity-parameters-solution) kernel. <br>
Other than that, nothing has been changed. All improvements mentioned [here](https://www.kaggle.com/bminixhofer/simple-lstm-pytorch-version) could still apply.

# Imports & Utility functions

In [1]:
from fastai.train import Learner
from fastai.train import DataBunch
from fastai.callbacks import *
from fastai.basic_data import DatasetType

In [2]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

Using TensorFlow backend.


In [3]:
# disable progress bars when submitting
def is_interactive():
    """Report whether the notebook runs in an interactive session.

    The decision is based solely on the ``SHLVL`` environment variable:
    the session counts as interactive when that variable is not set.

    Returns:
        bool: ``True`` if ``SHLVL`` is absent from the environment.
    """
    return os.environ.get('SHLVL') is None

# In non-interactive runs, swap tqdm for a pass-through so that no progress
# bars are rendered.
if not is_interactive():
    def nop(it, *a, **k):
        # Same call shape as tqdm: hand the iterable back untouched and
        # ignore every other positional/keyword argument.
        return it

    tqdm = nop

In [4]:
def seed_everything(seed=1234):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every generator receives the very same seed value.
    for apply_seed in (random.seed, np.random.seed,
                       torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


seed_everything()

In [5]:
# Locations of the two pre-trained word-vector files.
CRAWL_EMBEDDING_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = '../input/glove840b300dtxt/glove.840B.300d.txt'
# Number of differently-seeded models; only referenced by the (currently
# commented-out) training loop further down.
NUM_MODELS = 2
# Hidden size of each LSTM direction in NeuralNet.
LSTM_UNITS = 128
# Width of NeuralNet's dense layers: max-pool + avg-pool over a bidirectional
# LSTM output gives 2 * (2 * LSTM_UNITS) features.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# Sequence length passed to pad_sequences as `maxlen`.
MAX_LEN = 220

In [6]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields, i.e. the vector components.

    Returns:
        tuple: ``(word, vector)`` where ``vector`` is a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

def load_embeddings(path):
    """Read a space-separated word-vector text file into a dictionary.

    Every line is stripped, split on single spaces and handed to
    ``get_coefs``: the first field becomes the key, the remaining fields
    become the float32 vector.

    Args:
        path: Location of the embedding text file.

    Returns:
        dict: token -> float32 NumPy vector.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (the vector files are assumed to be
    # UTF-8 encoded).
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

# def build_matrix(word_index, path):
#     embedding_index = load_embeddings(path)
#     embedding_matrix = np.zeros((len(word_index) + 1, 300))
#     unknown_words = []
    
#     for word, i in word_index.items():
#         try:
#             embedding_matrix[i] = embedding_index[word]
#         except KeyError:
#             unknown_words.append(word)
#     return embedding_matrix, unknown_words

def build_matrix(word_index, embedding_index, embed_size=300):
    """Build the embedding matrix for a tokenizer vocabulary.

    Args:
        word_index: Mapping ``word -> row index``; indices are expected to
            lie in ``[0, len(word_index)]``.
        embedding_index: Mapping ``word -> vector``.
        embed_size: Number of components per vector (defaults to 300, the
            width previously hard-coded here).

    Returns:
        tuple: ``(embedding_matrix, unknown_words)``. The matrix has shape
        ``(len(word_index) + 1, embed_size)``; rows of words without a usable
        vector stay all-zero and those words are listed in ``unknown_words``.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
    unknown_words = []

    for word, i in word_index.items():
        try:
            vector = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
            continue

        # A vector of the wrong length (e.g. the "<count> <dim>" header line
        # of a .vec file parsed as a one-element vector) would otherwise be
        # silently broadcast over the whole row; treat it as unknown instead.
        if np.size(vector) != embed_size:
            unknown_words.append(word)
            continue

        embedding_matrix[i] = vector
    return embedding_matrix, unknown_words

In [7]:
def sigmoid(x):
    """Apply the logistic function ``1 / (1 + e^-x)`` element-wise.

    Args:
        x: Scalar or NumPy array of logits.

    Returns:
        Value(s) in the same shape as ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def train_model(learn,test,output_dim,lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True):
    """Train a fastai ``Learner`` epoch by epoch and predict on ``test``.

    After every epoch the model predicts the whole test set; the per-epoch
    predictions are optionally blended into a checkpoint ensemble.

    Args:
        learn: fastai ``Learner`` wrapping the model and the training data.
        test: Dataset whose items carry the input tensor at position 0.
        output_dim: Number of output columns produced by the model.
        lr: Learning rate of the first epoch; epoch ``i`` is scheduled with
            ``lr * 0.6 ** i``.
        batch_size: Batch size used for test-time prediction.
        n_epochs: Number of training epochs.
        enable_checkpoint_ensemble: If true, return the average of all
            per-epoch predictions weighted by ``2 ** epoch``; otherwise
            return the predictions of the last epoch only.

    Returns:
        numpy.ndarray of shape ``(len(test), output_dim)`` holding sigmoid
        probabilities.
    """
    all_test_preds = []
    # Later checkpoints count exponentially more in the ensemble.
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    n = len(learn.data.train_dl)
    # One phase per epoch (n batches each) with a decayed learning rate.
    # NOTE(review): the scheduler is attached once while fit() is called once
    # per epoch -- confirm against the installed fastai version that the
    # schedule carries over between fit() calls.
    phases = [(TrainingPhase(n).schedule_hp('lr', lr * (0.6**(i)))) for i in range(n_epochs)]
    sched = GeneralScheduler(learn, phases)
    learn.callbacks.append(sched)
    for epoch in range(n_epochs):
        learn.fit(1)
        test_preds = np.zeros((len(test), output_dim))

        # Predict in evaluation mode (dropout off) and without building an
        # autograd graph; the next learn.fit() puts the model back in
        # training mode.
        learn.model.eval()
        with torch.no_grad():
            for i, x_batch in enumerate(test_loader):
                X = x_batch[0].cuda()
                y_pred = sigmoid(learn.model(X).detach().cpu().numpy())
                test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds

In [8]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to a sequence tensor of shape (N, T, K).

    The K axis is moved into Dropout2d's channel position, so a dropped
    feature is masked for every time step of a sample.
    """

    def forward(self, x):
        # (N, T, K) -> (N, K, T) -> (N, K, 1, T)
        channels_first = x.transpose(1, 2).unsqueeze(2)
        # Some of the K feature channels are masked here.
        dropped = super().forward(channels_first)
        # (N, K, 1, T) -> (N, K, T) -> (N, T, K)
        return dropped.squeeze(2).transpose(1, 2)
    
class NeuralNet(nn.Module):
    """Two stacked bidirectional LSTMs over frozen pre-trained embeddings.

    The LSTM output is max- and average-pooled over time, passed through two
    parallel dense layers that are added back onto the pooled features, and
    projected to one main logit plus ``num_aux_targets`` auxiliary logits.

    Args:
        embedding_matrix: 2-D array ``(vocab_size, embed_size)`` with the
            pre-trained vectors; it becomes the (non-trainable) embedding
            weight.
        num_aux_targets: Number of auxiliary output logits.
    """

    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNet, self).__init__()
        # Size the layer from the matrix itself instead of the notebook-level
        # `max_features` global, which is defined far below this class and
        # may still be None when the class is instantiated.
        vocab_size, embed_size = embedding_matrix.shape

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        # Keep the pre-trained vectors fixed during training.
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)

    def forward(self, x):
        """Return raw logits of shape ``(batch, 1 + num_aux_targets)``.

        Column 0 is the main output, the remaining columns are the auxiliary
        outputs. ``x`` holds the token indices fed to the embedding layer.
        """
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)
        # Both dense layers read the pooled features (they are parallel,
        # not stacked) and are summed with them as a residual connection.
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))

        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)

        return out

In [9]:
# def preprocess(data):
#     '''
#     Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
#     '''
#     punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
#     def clean_special_chars(text, punct):
#         for p in punct:
#             text = text.replace(p, ' ')
#         return text

#     data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
#     return data

# Preprocessing

In [10]:
# Load the competition's train and test tables from the Kaggle input folder.
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

In [11]:
# Stack train on top of test so the text cleaning below runs once over both;
# the two parts are separated again by position (first train.shape[0] rows).
df = pd.concat([train ,test],sort=False)

In [12]:
# clean website address
def clean_web(text):
    """Remove web addresses from a comment.

    The text is split on whitespace and each token is handled on its own:

    * tokens starting with ``://mobile``, ``ttps://`` or ``www.`` are emptied
      (``www.`` is only matched as a prefix, so words like ``Awww...I``
      survive);
    * tokens containing ``Http:``/``Https:`` or ``http:``/``https:`` are cut
      off at the first ``Http`` / ``http``.

    Tokens are re-joined with single spaces; emptied tokens are kept as empty
    strings, so they leave consecutive spaces behind.
    """
    def _strip_url(token):
        if token.startswith('://mobile') or token.startswith('ttps://'):
            return ''
        # Capitalised scheme is checked before the lowercase one.
        for scheme in ('Http', 'http'):
            if scheme + ':' in token or scheme + 's:' in token:
                return token[:token.find(scheme)]
        if token.startswith('www.'):
            return ''
        return token

    return ' '.join(_strip_url(token) for token in text.split())

# URL-free comments. `no_web_comment_text` keeps a handle on this
# case-preserving version while `temp` goes on through the cleaning steps.
no_web_comment_text = df['comment_text'].apply(clean_web)
temp = no_web_comment_text


In [13]:
# def build_vocab(texts):
#     sentences = texts.apply(lambda x: x.split()).values
#     vocab = {}
#     for sentence in tqdm(sentences):
#         for word in sentence:
#             try:
#                 vocab[word] += 1
#             except KeyError:
#                 vocab[word] = 1
#     return vocab
# first add lower based on no_web_comment_text
# Lowercase every comment (all entries are strings produced by clean_web).
temp = temp.apply(str.lower)

In [14]:
# Contraction -> expansion lookup used by clean_contractions, which matches
# whole space-separated tokens against the keys.
# NOTE: the comments are lowercased before this table is applied, so the
# capitalised keys ("I'd", "I'll", ...) cannot match at that point.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
                       "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                       "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", 
                       "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", 
                       "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                       "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                       "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", 
                       "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", 
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", 
                       "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                       "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", 
                       "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", 
                       "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                       "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                       "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
                       "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  
                       "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", 
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", 
                       "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                       "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                       "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                       "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [15]:
def clean_contractions(text, mapping):
    """Expand contractions in ``text``.

    Apostrophe look-alikes are first normalised to the ASCII apostrophe;
    then every token obtained by splitting on single spaces is replaced by
    ``mapping[token]`` when present and kept unchanged otherwise.

    Args:
        text: The comment to process.
        mapping: Dict of contraction -> expansion.

    Returns:
        str: The text with known contractions expanded.
    """
    for quote in ("’", "‘", "´", "`"):
        text = text.replace(quote, "'")
    expanded = (mapping.get(token, token) for token in text.split(" "))
    return ' '.join(expanded)

# Expand contractions in every comment.
temp = temp.apply(clean_contractions, args=(contraction_mapping,))

In [16]:
# clean all Chinese, Japanese and Korean Characters
def remove_Asia_letters(check_str):
    """Replace Chinese, Japanese and Korean characters with spaces.

    A character is replaced when its code point lies in one of the ranges
    U+4E00-U+9FFF, U+AC00-U+D7FF or U+3040-U+30FF; everything else is kept.
    """
    def _in_target_range(ch):
        return (u'\u4e00' <= ch <= u'\u9fff'
                or u'\uac00' <= ch <= u'\ud7ff'
                or u'\u3040' <= ch <= u'\u30ff')

    return ''.join(' ' if _in_target_range(ch) else ch for ch in check_str)

# Blank out CJK characters in every comment.
temp = temp.apply(remove_Asia_letters)

In [17]:
import emoji
def demojize_text(check_str):
    """Run ``emoji.demojize`` on strings that contain an emoji character.

    The string is handed to ``emoji.demojize`` as soon as one of its
    characters is a key of ``emoji.UNICODE_EMOJI``; strings without such a
    character are returned unchanged.
    """
    has_emoji = any(ch in emoji.UNICODE_EMOJI for ch in check_str)
    return emoji.demojize(check_str) if has_emoji else check_str

# Convert emoji in every comment via demojize_text.
temp = temp.apply(demojize_text)

In [18]:
# Characters that clean_special_chars replaces with a single space
# (the string is iterated character by character; repeats are harmless).
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
# Substitutions that clean_special_chars applies BEFORE the punctuation pass,
# one str.replace per entry in dict order. Keys may be whole words, so longer
# keys listed earlier win over their single-character parts listed later.
# NOTE: some keys occur more than once in this literal; the last occurrence
# of a repeated key is the one that counts.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", 
                 "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', 
                 "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', 
                 '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', 'ᴀɴᴅ':'and','ᴀ':'a','naïve':'naive',
                '―':'-','ʜᴏᴍᴇ':'home','ᴜᴘ':'up','ʙʏ':'by','yᴏᴜ':'you','ᴀᴛ':'at','ᴄᴏᴍᴘᴜᴛᴇʀ':'computer','ᴛʜɪs':'this',
                 'ᴍᴏɴᴛʜ':'month','ᴡᴏʀᴋɪɴɢ':'working','chrétien':'chretien','ᴊᴏʙ':'job','ᴏғ':'of','ʜᴏᴜʀʟʏ':'hourly',
                 'ᴡᴇᴇᴋ':'week','ʟɪɴᴋ':'link','ᴛᴏ':'to','ʜᴀᴠᴇ':'have','ᴄᴀɴ':'can','ᴇɴᴅ':'end','ғɪʀsᴛ':'first',
                 'ʏᴏᴜʀ':'your','sɪɢɴɪɴɢ':'signing','ʙᴏᴛᴛᴏᴍ':'bottom','ғᴏʟʟᴏᴡɪɴɢ':'following','mᴀᴋᴇ':'make',
                 'ᴄᴏɴɴᴇᴄᴛɪᴏɴ':'connection','ɪɴᴛᴇʀɴᴇᴛ':'internet','ʀᴇʟɪᴀʙʟᴇ':'reliable','ɴᴇᴇᴅ':'need','ᴏɴʟʏ':'only',
                 'ɪɴᴄᴏᴍᴇ':'income','ᴇxᴛʀᴀ':'extra','ᴀɴ':'an','ɴᴇᴇᴅɪɴɢ':'needing','ᴀɴʏᴏɴᴇ':'anyone','ᴏʀ':'or',
                 'ᴍᴏᴍs':'moms','sᴛᴀʏ':'stay','sᴛᴜᴅᴇɴᴛs':'students','gʀᴇᴀᴛ':'great','ғʀᴏᴍ':'from','sᴛᴀʀᴛ':'start',
                 'québec':'quebec','𝒂𝒏𝒅':'and','brexit':'british exit','»':'>>','«':'<<','·':'.','co₂':'co2',
                'cliché':'cliche','½':'1/2','›':'>','♡':'love','✬':'star','ᴛʜᴇ':'the','aᴛ':'at','ʜaᴠᴇ':'have',
                 'ᴄaɴ':'can','ʙᴏᴛtoᴍ':'bottom','maᴋᴇ':'make','ʀᴇʟɪaʙʟᴇ':'reliable','ᴇxᴛʀa':'extra','aɴ':'an',
                 'needɪɴɢ':'needing','aɴʏᴏɴᴇ':'anyone','sᴛaʏ':'stay','gʀᴇaᴛ':'great','sᴛaʀᴛ':'start','ά':'a', '𝓴':'k',
                 '𝘢':'a','ã':'a', 'ﬂ':'fl', 'ĥ':'h','𝗲':'e', '𝒐':'o', '🇳':'n', '𝒗':'v','⒊':'3.','ï':'i', '𝙜':'g', 
                 'λ':'lambda', '𝑹':'r','ｎ':'n', '¡':'i', '𝖗':'r', '𝑾':'w', '𝒖':'u', '𝘆':'y', '!':'!', 
                 '🇼':'w', 'й':'n', '𝘧':'f', 'ᴘ':'p','𝓉':'t', '𝟐':'2','ﬃ':'ffi','ĉ':'c', 'ᑭ':'rho', '𝖌':'g', 
                 'п':'pi', '౦':'o', '𝑮':'g','ξ':'xi','ἰ':'i', 'ᑯ':'d','🇧':'b', '𝒑':'p', '𝓊':'u', 'н':'h', '𝒸':'c',
                 '𝘼':'a','𝘲':'q', 'ｕ':'u', 'ĕ':'e', '𝙛':'f','ν':'v', 'מ':'n', 'ĭ':'i', 'ǐ':'i','å':'a', '𝓽':'t',
                 'ἴ':'i', 'ύ':'u', 'ć':'c', 'ä':'a', 'ř':'r', 'ġ':'g', '𝓵':'l','🇻':'v', '𝒕':'t','𝘬':'k','𝗸':'k', 'ａ':'a',
                 'ⲏ':'h', 'ᴅ':'d', 'ү':'gamma', '𝖉':'d', 'е':'e','🇫':'f', '𝓻':'r', '𝙪':'u', 'ᴄ':'c','χ':'x','𝑻':'t', 
                 '𝑥':'x', '𝒇':'f', 'ā':'a', '𝘵':'t','ｄ':'d', 'ɴ':'n','𝑯':'h', 'ｃ':'c','ш':'w','𝒁':'z', '𝐫':'r', 
                 '𝓲':'i', '𝖈':'c', 'ｐ':'p', 'ᴦ':'r','𝖕':'p', 'ê':'e','𝖂':'b', '𝐯':'v','𝓀':'k',  '𝓮':'e', '𝗞':'k', 
                 '𝒃':'b', 'ᴍ':'m', 'ρ':'rho', '𝒶':'a', 'ℴ':'o', 'ῖ':'i', '𝗵':'h', '𝙯':'z', 'ô':'o', 'τ':'tau', '𝐩':'p', 'ꭻ':'j',
                '\xad':' ','┈':'....','ʻ':"'",'ü':'u','é':'e','ᴇ':'e','è':'e','ғ':'f','𝒊':'i','ʏ':'y','𝒂':'a','ᴋ':'k','𝒏':'n',
                 'а':'a','ś':'s', 'í':'i','о':'o','\x7f':' ','𝒔':'s','𝙚':'e','ö':'o','в':'b','s':'s','▀':' ','▄':' ','➤':' ',
                 '═':'=','☻':' smile ','❥':' love ','★':' star ','м':'m','𝙣':'n','𝒅':'d','𝙧':'r','𝒄':'c','ō':'o','𝙨':'s','ᴛ':'t',
                 'ē':'e','ᴜ':'u','𝒎':'m','𝙞':'i','υ':'u','ɪ':'i','к':'k','▰':' ','▔':' ','▬':' ','̶':'-','╲':"\\",'╱':'/'}

In [19]:
def clean_special_chars(text, punct, mapping):
    """Normalise special characters in ``text`` in three ordered passes.

    Args:
        text: The comment to clean.
        punct: Iterable of characters that are each replaced by one space.
        mapping: Dict of substring -> replacement, applied first and in the
            dict's iteration order.

    Returns:
        str: The cleaned text.
    """
    # Pass 1: explicit substitutions.
    for source, target in mapping.items():
        text = text.replace(source, target)

    # Pass 2: punctuation marks simply become spaces.
    for mark in punct:
        text = text.replace(mark, ' ')

    # Pass 3: remaining special characters that are dealt with last.
    leftovers = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': '','✰':' ','§':' ','○':'','❧':'','ι':''}
    for source, target in leftovers.items():
        text = text.replace(source, target)

    return text

# Apply the special-character clean-up to every comment.
temp = temp.apply(clean_special_chars, args=(punct, punct_mapping))

In [20]:
# Split the cleaned text back into its train / test parts by position.
x_train = temp[:train.shape[0]]
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = temp[train.shape[0]:]

identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Per-sample loss weights: every sample starts at 1/4 and gains another 1/4
# for each of the three rules below that it satisfies.
# The builtin `int` is used for the casts: `np.int` was only an alias for it
# and is no longer available in current NumPy releases.
weights = np.ones((len(x_train),)) / 4
# Subgroup: at least one identity column is >= 0.5
weights += (train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(int) / 4
# Background Positive, Subgroup Negative:
# target >= 0.5 and at least one identity column is < 0.5
weights += (( (train['target'].values>=0.5).astype(bool).astype(int) +
   (train[identity_columns].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(int) ) > 1 ).astype(bool).astype(int) / 4
# Background Negative, Subgroup Positive:
# target < 0.5 and at least one identity column is >= 0.5
weights += (( (train['target'].values<0.5).astype(bool).astype(int) +
   (train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(int) ) > 1 ).astype(bool).astype(int) / 4
# Scale factor that brings the mean sample weight back to 1.
loss_weight = 1.0 / weights.mean()

# Two columns per sample: binarised target and its loss weight.
y_train = np.vstack([(train['target'].values>=0.5).astype(int),weights]).T

In [21]:

import pickle
# Persist the preprocessed objects, one '<name>.pickle' file each, in the
# working directory. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for _stem, _obj in (
        ('x_train', x_train),
        ('y_aux_train', y_aux_train),
        ('x_test', x_test),
        ('weights', weights),
        ('loss_weight', loss_weight),
        ('y_train', y_train)):
    with open(_stem + '.pickle', 'wb') as pickle_out:
        pickle.dump(_obj, pickle_out)
# Drop the loop variables so they do not keep an extra reference alive.
del _stem, _obj

In [22]:
# The raw frames are not used by any active code below; drop the names and
# force a garbage collection before the embeddings are loaded.
del train
del test
del df
gc.collect()

21

In [23]:
# x_train = preprocess(train['comment_text'])
# y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
# x_test = preprocess(test['comment_text'])

# identity_columns = [
#     'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
#     'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# # Overall
# weights = np.ones((len(x_train),)) / 4
# # Subgroup
# weights += (train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# # Background Positive, Subgroup Negative
# weights += (( (train['target'].values>=0.5).astype(bool).astype(np.int) +
#    (train[identity_columns].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# # Background Negative, Subgroup Positive
# weights += (( (train['target'].values<0.5).astype(bool).astype(np.int) +
#    (train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# loss_weight = 1.0 / weights.mean()

# y_train = np.vstack([(train['target'].values>=0.5).astype(np.int),weights]).T

In [24]:
# identity_columns = [
#     'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
#     'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# # Overall
# weights = np.ones((len(x_train),)) / 4
# # Subgroup
# weights += (train_identity_columns.fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# # Background Positive, Subgroup Negative
# weights += (( (train_target.values>=0.5).astype(bool).astype(np.int) +
#    (train[identity_columns].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# # Background Negative, Subgroup Positive
# weights += (( (train_target.values<0.5).astype(bool).astype(np.int) +
#    (train_identity_columns.fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# loss_weight = 1.0 / weights.mean()

# y_train = np.vstack([(train_target.values>=0.5).astype(np.int),weights]).T

In [25]:
# Vocabulary size; None means "derive it from the fitted tokenizer"
# (resolved right after tokenization).
max_features = None

In [26]:
# Fit a single tokenizer on train + test text so both share one word index.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))

# Text -> integer sequences -> fixed-length (MAX_LEN) arrays.
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_LEN)

In [27]:
# Keep an explicitly configured value; otherwise use the tokenizer's
# vocabulary size plus one extra row (index 0 is not assigned to any word in
# build_matrix's output).
max_features = max_features or len(tokenizer.word_index) + 1
# Displayed as the cell's output.
max_features

282482

In [28]:
def build_vocab(texts):
    """Count how often each whitespace-separated token occurs.

    Args:
        texts: pandas Series of strings.

    Returns:
        dict: token -> number of occurrences over all texts.
    """
    vocab = {}
    for tokens in tqdm(texts.apply(lambda x: x.split()).values):
        for token in tokens:
            vocab[token] = vocab.get(token, 0) + 1
    return vocab

# Token counts of the URL-free but still case-preserving comments; used by
# add_lower to find cased words whose lowercase form lacks a vector.
vocab_no_web = build_vocab(no_web_comment_text)

def add_lower(embedding, vocab):
    """Give lowercase word forms the vector of their cased counterpart.

    For every word of ``vocab`` that has a vector while its lowercase form
    has none, the lowercase form is added to ``embedding`` (in place) with
    the same vector. The number of additions is printed.

    Args:
        embedding: Dict of word -> vector; modified in place.
        vocab: Iterable of words (e.g. the dict returned by build_vocab).
    """
    added = 0
    for word in vocab:
        if word not in embedding:
            continue
        lowered = word.lower()
        if lowered in embedding:
            continue
        embedding[lowered] = embedding[word]
        added += 1
    print(f"Added {added} words to embedding")

In [29]:
# Load the crawl vectors, then copy vectors of cased corpus words to their
# missing lowercase forms (the comments themselves were lowercased).
C_embedding = load_embeddings(CRAWL_EMBEDDING_PATH)
add_lower(C_embedding, vocab_no_web)


Added 31155 words to embedding


In [30]:
# Crawl half of the embedding matrix, aligned with the tokenizer's word index.
crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, C_embedding)
print('n unknown words (crawl): ', len(unknown_words_crawl))

# Release the full embedding dict before the next one is loaded.
del C_embedding
del unknown_words_crawl
gc.collect()

n unknown words (crawl):  110238


0

In [31]:
# Same procedure for the GloVe vectors: load, add lowercase forms, build the
# matrix aligned with the tokenizer's word index.
G_embedding = load_embeddings(GLOVE_EMBEDDING_PATH)
add_lower(G_embedding, vocab_no_web)
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, G_embedding)
print('n unknown words (glove): ', len(unknown_words_glove))

# Release the full embedding dict once the matrix has been built.
del G_embedding
del unknown_words_glove
gc.collect()

Added 25381 words to embedding
n unknown words (glove):  111000


0

In [32]:
del vocab_no_web

# Concatenate along the feature axis: every row is the word's crawl vector
# followed by its GloVe vector.
embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
# NOTE(review): bare expression in the middle of the cell -- it appears to
# have no effect here, since only a cell's last expression is displayed.
embedding_matrix.shape

# The two halves are no longer needed once they have been concatenated.
del crawl_matrix
del glove_matrix
gc.collect()


0

In [33]:

# Persist the combined embedding matrix; the `with` block closes the file
# even if pickling raises.
with open('embedding_matrix.pickle','wb') as pickle_out:
    pickle.dump(embedding_matrix,pickle_out)



In [34]:
# print (os.listdir('./'))
# pickle_in = open('embedding_matrix.pickle','rb')
# test_matrix = pickle.load(pickle_in)
# pickle_in.close()

In [35]:
# print(embedding_matrix.shape)
# print(test_matrix.shape)
# # 
# np.array_equal(embedding_matrix,test_matrix)

In [36]:
# type(x_train)
# pickle_out = open('x_train.pickle','wb')
# pickle.dump(x_train,pickle_out)
# pickle_out.close()

In [37]:
# print (os.listdir('./'))
# pickle_in = open('x_train.pickle','rb')
# test_matrix2 = pickle.load(pickle_in)
# pickle_in.close()

In [38]:
# np.array_equal(x_train,test_matrix2)

In [39]:
# x_train_torch = torch.tensor(x_train, dtype=torch.long)
# y_train_torch = torch.tensor(np.hstack([y_train, y_aux_train]), dtype=torch.float32)

In [40]:
# x_test_torch = torch.tensor(x_test, dtype=torch.long)

# Training

In [41]:
# batch_size = 512

# train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
# valid_dataset = data.TensorDataset(x_train_torch[:batch_size], y_train_torch[:batch_size])
# test_dataset = data.TensorDataset(x_test_torch)

# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# databunch = DataBunch(train_dl=train_loader,valid_dl=valid_loader)

In [42]:
# def custom_loss(data, targets):
#     ''' Define custom loss function for weighted BCE on 'target' column '''
#     bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:,1:2])(data[:,:1],targets[:,:1])
#     bce_loss_2 = nn.BCEWithLogitsLoss()(data[:,1:],targets[:,2:])
#     return (bce_loss_1 * loss_weight) + bce_loss_2

In [43]:
# all_test_preds = []

# for model_idx in range(NUM_MODELS):
#     print('Model ', model_idx)
#     seed_everything(1234 + model_idx)
#     model = NeuralNet(embedding_matrix, y_aux_train.shape[-1])
#     learn = Learner(databunch,model,loss_func=custom_loss)
#     test_preds = train_model(learn,test_dataset,output_dim=7)    
#     all_test_preds.append(test_preds)

In [44]:
# submission = pd.DataFrame.from_dict({
#     'id': test['id'],
#     'prediction': np.mean(all_test_preds, axis=0)[:, 0]
# })

# submission.to_csv('submission.csv', index=False)

Note that the solution is not validated in this kernel. So, before tuning anything, you should build a validation framework using e.g. KFold CV. If you only check what works best by submitting, you are very likely to overfit to the public LB.