In [19]:
#!/usr/bin/env python
# coding: utf-8

# # Improved LSTM baseline
# 
# This kernel is a somewhat improved version of [Keras - Bidirectional LSTM baseline](https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-051) along with some additional documentation of the steps. (NB: this notebook has been re-run on the new test set.)

# In[1]:


import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


# We include the GloVe word vectors in our input files. To include these in your kernel, simply click 'input files' at the top of the notebook, and search for 'glove' in the 'datasets' section.

# In[2]:


# Relative directory prefixes (each ends with '/') that the competition CSV paths are built from.
path = 'input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
# GloVe embedding file path; currently disabled.
#EMBEDDING_FILE=f'{path}glove6b50d/glove.6B.50d.txt'
# Training and test CSVs, read with pd.read_csv in the data-loading cell.
TRAIN_DATA_FILE=f'{path}{comp}train.csv'
TEST_DATA_FILE=f'{path}{comp}test.csv'


# Set some basic config parameters:

# In[3]:


# embed_size and max_features are passed to Embedding(max_features, embed_size,
# weights=[embedding_matrix]) further down, so they presumably have to equal the
# shape of that matrix (rows, columns) -- TODO confirm against the loaded model.
#embed_size = 50 # how big is each word vector
embed_size = 768 # how big is each word vector
max_features = 30522 # how many unique words to use (i.e num rows in embedding vector)
# maxlen is the length of the Keras Input layer; MAX_SEQ_LENGTH (set separately, to
# the same value) is what the feature conversion uses -- keep the two in sync.
maxlen = 128 # max number of words in a comment to use


# Read in our data and replace missing values:

# In[4]:


# Read the competition CSVs into DataFrames.
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)


# NOTE: this message is printed after the CSVs have already been read above.
print('Loading train and test set')
# Comment texts as arrays; missing comments are replaced by the placeholder "_na_".
list_sentences_train = train_df["comment_text"].fillna("_na_").values
# The six target columns; their order defines the order of the label vector.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# One row of six values per training comment (this name is reassigned in a later cell).
y_train = train_df[list_classes].values
list_sentences_test = test_df["comment_text"].fillna("_na_").values
print('Datasets succ. loaded')
# Disabled hold-out split, kept as a bare string literal so it is never executed.
'''
from sklearn.model_selection import train_test_split

train, test = train_test_split(train_df, test_size = 0.1, random_state=42)

train_lines, train_labels = train.comment_text.values, train[list_classes].values
test_lines, test_labels = test.comment_text.values, test[list_classes].values
'''

# Aliases consumed by create_examples: the full training set and the competition test set.
train_lines = list_sentences_train
test_lines = list_sentences_test
train_labels = y_train


# In[5]:


#list_sentences_test.shape


# In[13]:


#wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py
#wget https://raw.githubusercontent.com/google-research/bert/master/optimization.py
#wget https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py
#wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py 
#wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip


# In[6]:


# Name of the pretrained BERT release; DO_LOWER_CASE is derived from this name later on.
BERT_MODEL = 'uncased_L-12_H-768_A-12'
# Derive the directory from BERT_MODEL instead of repeating the literal, so the files
# that get loaded and the casing flag can never refer to two different models.
BERT_PRETRAINED_DIR = f'models/{BERT_MODEL}'
print('BERT_MODEL dir',BERT_MODEL)
print(BERT_PRETRAINED_DIR)

# In[7]:


import modeling
import optimization
import run_classifier
import tokenization
import tensorflow as tf

print('Loaded auxiliary scripts')

def create_examples(lines, set_type, labels=None):
    """Wrap raw texts in ``run_classifier.InputExample`` objects.

    Args:
        lines: iterable of texts; each one becomes the ``text_a`` of an example.
        set_type: name of the split. Its string form is used as the ``guid`` of
            every example, and only the value ``'train'`` makes use of ``labels``.
        labels: iterable of labels aligned with ``lines``; required when
            ``set_type`` is ``'train'`` and ignored otherwise. Each label is
            stored as ``str(label)``.

    Returns:
        A list with one ``InputExample`` per line, in input order. For any split
        other than ``'train'`` every example carries the placeholder label
        ``'[0 0 0 0 0 0]'``.

    Raises:
        ValueError: if ``set_type`` is ``'train'`` but ``labels`` is None.
    """
    guid = str(set_type)
    if guid == 'train':
        if labels is None:
            # Previously this surfaced as an opaque TypeError raised from zip().
            raise ValueError("labels must be provided when set_type is 'train'")
        # The label is kept as text; it is later matched against a list of label strings.
        text_label_pairs = zip(lines, (str(label) for label in labels))
    else:
        # No ground truth is used here: attach an all-zero placeholder label.
        placeholder = '[0 0 0 0 0 0]'
        text_label_pairs = ((line, placeholder) for line in lines)
    return [
        run_classifier.InputExample(guid=guid, text_a=text, text_b=None, label=label)
        for text, label in text_label_pairs
    ]

# Model Hyper Parameters
# Seed TensorFlow's random number generator.
tf.random.set_random_seed(49)

# NOTE(review): EVAL_BATCH_SIZE, LEARNING_RATE, SAVE_CHECKPOINTS_STEPS, ITERATIONS_PER_LOOP,
# NUM_TPU_CORES, CONFIG_FILE and INIT_CHECKPOINT are not referenced again in the visible
# part of this notebook; the Keras model below is compiled and fitted with its own settings.
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
WARMUP_PROPORTION = 0.1
# Sequence length handed to run_classifier.convert_examples_to_features.
# NOTE(review): the Keras Input layer uses `maxlen`, which is set separately to the
# same value -- change both together.
MAX_SEQ_LENGTH = 128
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000 #if you wish to finetune a model on a larger dataset, use larger interval
# each checkpoint weighs about 1.5 GB
ITERATIONS_PER_LOOP = 1000
NUM_TPU_CORES = 8
# Files expected inside the pretrained model directory.
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
# Lower-casing is switched on when the model name starts with 'uncased'.
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

#label_list = ['0', '1']
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)
# Training examples carry the stringified label vector; test examples get a placeholder label.
train_examples = create_examples(train_lines, 'train', labels=train_labels)
test_examples = create_examples(test_lines, 'test')

print('train_examples object loaded')


# In[9]:


# Step count = (number of training examples / batch size) * epochs, truncated to an int.
num_train_steps = int(
    len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Share of those steps given by WARMUP_PROPORTION, truncated to an int.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
# NOTE(review): neither value is read again in the visible part of this notebook.

print('Steps calculated')

# In[10]:


l = []  # unused module-level placeholder, kept only for backward compatibility


def return_l(n, l_r):
    """Extend every list in ``l_r`` with all 0/1 continuations, ``n - 1`` times.

    Each round replaces every list ``t`` by the two lists ``t + [0]`` and
    ``t + [1]`` (in that order), so the result has ``len(l_r) * 2 ** (n - 1)``
    entries, each ``n - 1`` elements longer than its seed.

    Args:
        n: number of rounds plus one; must be at least 1.
        l_r: list of seed lists. The seeds are never modified.

    Returns:
        The list of extended lists. When ``n`` is 1 this is ``l_r`` itself
        (the same object, not a copy).

    Raises:
        ValueError: if ``n`` is smaller than 1. The recursive version this
            replaces never reached its base case for such values.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    combos = l_r
    remaining = n
    while remaining > 1:
        # Order matters: the 0-branch comes before the 1-branch for every prefix.
        combos = [prefix + [bit] for prefix in combos for bit in (0, 1)]
        remaining -= 1
    return combos

def return_str(n, l_r):
    """Build bracketed 0/1 label strings from string prefixes.

    Each of the ``n - 1`` rounds replaces every prefix ``p`` by ``p + ' 0'`` and
    ``p + ' 1'`` (in that order); finally ``']'`` is appended to every string.
    With the seeds ``['[1', '[0']`` and ``n = 6`` this yields the 64 strings
    ``'[1 0 0 0 0 0]'`` ... ``'[0 1 1 1 1 1]'``.

    Args:
        n: number of rounds plus one; must be at least 1.
        l_r: list of string prefixes. It is not modified.

    Returns:
        A new list with ``len(l_r) * 2 ** (n - 1)`` strings.

    Raises:
        ValueError: if ``n`` is smaller than 1. The recursive version this
            replaces never reached its base case for such values.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    prefixes = l_r
    remaining = n
    while remaining > 1:
        # Order matters: the ' 0' continuation comes before ' 1' for every prefix.
        prefixes = [prefix + suffix for prefix in prefixes for suffix in (' 0', ' 1')]
        remaining -= 1
    return [prefix + ']' for prefix in prefixes]


# All 64 strings of the form '[a b c d e f]' with a..f in {0, 1}: one position per entry
# of list_classes. Presumably a label's index in this list becomes its label_id in the
# converted features -- TODO confirm against run_classifier.
label_list_mult = return_str(6, ['[1','[0'])


# In[12]:


# Convert the examples into BERT input features (nothing is trained in this cell).
# Both calls receive the same label list, sequence length and tokenizer.
print('Please wait..., loading train words in model')
train_features = run_classifier.convert_examples_to_features(train_examples, label_list_mult,  MAX_SEQ_LENGTH, tokenizer)
print('train_features loaded')
print('Please wait..., loading test words in model')
test_features = run_classifier.convert_examples_to_features(test_examples, label_list_mult,  MAX_SEQ_LENGTH, tokenizer)
print('test_features loaded')

# In[13]:


# The same 64 combinations as label_list_mult, generated in the same order, but as lists
# of ints; indexed by label_id below to recover the 0/1 target vector.
formOfList_label_list_mult = return_l(6, [[1], [0]])


# In[16]:

print('loading X_t and y_train')
# One row of token ids per training feature.
X_t = np.asarray([feature.input_ids for feature in train_features])
# Turn every label_id back into its list of 0/1 targets via the lookup table.
y_train = np.asarray([formOfList_label_list_mult[feature.label_id] for feature in train_features])
print('loaded')

print('loading X_te')
# One row of token ids per test feature; the placeholder test labels are not used.
X_te = np.asarray([feature.input_ids for feature in test_features])
print('X_te loaded')

# Fail fast, before the long model.fit run, if the feature length and the Keras input
# length (maxlen) have drifted apart or the rows are not of uniform length.
assert X_t.shape[1:] == (maxlen,), f'X_t has shape {X_t.shape}, expected (*, {maxlen})'
assert X_te.shape[1:] == (maxlen,), f'X_te has shape {X_te.shape}, expected (*, {maxlen})'


Loading train and test set
Datasets succ. loaded
BERT_MODEL dir uncased_L-12_H-768_A-12
models/uncased_L-12_H-768_A-12
Loaded auxiliary scripts
train_examples object loaded
Steps calculated
Please wait..., loading train words in model
INFO:tensorflow:Writing example 0 of 159571
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train
INFO:tensorflow:tokens: [CLS] explanation why the edit ##s made under my user ##name hardcore metallic ##a fan were reverted ? they weren ' t van ##dal ##isms , just closure on some gas after i voted at new york dolls fa ##c . and please don ' t remove the template from the talk page since i ' m retired now . 89 . 205 . 38 . 27 [SEP]
INFO:tensorflow:input_ids: 101 7526 2339 1996 10086 2015 2081 2104 2026 5310 18442 13076 12392 2050 5470 2020 16407 1029 2027 4694 1005 1056 3158 9305 22556 1010 2074 8503 2006 2070 3806 2044 1045 5444 2012 2047 2259 14421 6904 2278 1012 1998 3531 2123 1005 1056 6366 1996 23561 2013 1996 2831 3931 2144 1045 1005 1049 3394 2

train_features loaded
Please wait..., loading test words in model
INFO:tensorflow:Writing example 0 of 153164
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test
INFO:tensorflow:tokens: [CLS] yo bitch ja rule is more su ##cc ##es ##ful then you ' ll ever be what ##s up with you and hating you sad mo ##fu ##ck ##as . . . i should bitch slap ur pet ##hed ##ic white faces and get you to kiss my ass you guys sick ##en me . ja rule is about pride in da music man . don ##t di ##ss that shit on him . and nothin is wrong bei ##n like tu ##pac he was a brother too . . . fuck ##in white boys get things right next time . , [SEP]
INFO:tensorflow:input_ids: 101 10930 7743 14855 3627 2003 2062 10514 9468 2229 3993 2059 2017 1005 2222 2412 2022 2054 2015 2039 2007 2017 1998 22650 2017 6517 9587 11263 3600 3022 1012 1012 1012 1045 2323 7743 14308 24471 9004 9072 2594 2317 5344 1998 2131 2017 2000 3610 2026 4632 2017 4364 5305 2368 2033 1012 14855 3627 2003 2055 6620 1999 4830 2189 2158 1012 212

In [None]:

# In[19]:

print('importing torch bert  model...')
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertForSequenceClassification
bert_model = BertModel.from_pretrained("bert-base-uncased",cache_dir="model")

# Read the word-embedding table straight from the layer's weight tensor.
# detach() drops autograd tracking (required before .numpy() on a parameter), cpu()
# makes the export device-independent, and copy() decouples the array from the model.
# This replaces a tolist()/np.asarray round trip that materialised the whole table as
# nested Python lists of floats first.
embedding_matrix = bert_model.embeddings.word_embeddings.weight.detach().cpu().numpy().copy()
# Kept as an alias because earlier versions of this cell exposed the array under this name.
array_emb_ma = embedding_matrix

# The Embedding layer is declared with (max_features, embed_size); catch a mismatch here
# instead of while the Keras model is being built.
assert embedding_matrix.shape == (max_features, embed_size), (
    f'embedding matrix has shape {embedding_matrix.shape}, '
    f'expected {(max_features, embed_size)}')

print('embedding matrix defined')


# Simple bidirectional LSTM with two fully connected layers. We add some dropout to the LSTM since even 2 epochs is enough to overfit.

# In[21]


# In[22]:


# Input: one row of `maxlen` token ids per comment.
inp = Input(shape=(maxlen,))

# Embedding layer initialised from `embedding_matrix`.
# NOTE(review): `trainable` is not set here, so the layer keeps the library default --
# confirm whether the pretrained vectors are meant to be frozen.
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

# Bidirectional LSTM that returns the whole sequence, then a global max-pool over it.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
# Six sigmoid outputs, matching the six-element target rows in y_train.
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('model defined and compiled ')
# Now we're ready to fit our model! Use `validation_split` when not submitting.


# The trailing `;` suppresses the notebook's display of the value returned by fit().
model.fit(X_t, y_train, batch_size=32, epochs=2, validation_split=0.1);


# And finally, get predictions for the test set and prepare a submission CSV:

# In[ ]:






importing torch bert  model...
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
embedding matrix defined
model defined and compiled 
Train on 143613 samples, validate on 15958 samples
Epoch 1/2
  6368/143613 [>.............................] - ETA: 1:04:10 - loss: 0.1772 - acc: 0.9570

In [None]:

# Predict the six class outputs for every row of X_te.
y_test = model.predict([X_te], batch_size=1024, verbose=1)
# Write the predictions into the class columns of the sample submission and save it.
# NOTE(review): assumes sample_submission.csv lists its rows in the same order as
# test.csv -- TODO confirm.
sample_submission = pd.read_csv(f'{path}{comp}sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv', index=False)

# serialize model to YAML
model_yaml = model.to_yaml()
with open("model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")


# In[12]:
