In [1]:
%%bash

# Report whether the Kaggle credentials directory (~/.kaggle/) is present.
# This only tests for the directory; it does not validate its contents.

if [ -d ~/.kaggle/ ]; then
    echo "Kaggle API is already configured"
else
    echo "Kaggle credentials are not configured"
fi

Kaggle API is already configured


In [2]:
%%bash

# Download and unpack the competition data, unless data/ already exists.
# Each step aborts the cell with a non-zero status on failure, so a broken
# download is reported instead of being masked by the final clean-up.

if [ ! -d data/ ]; then

    echo "Downloading dataset..."
    kaggle competitions download nlp-getting-started || exit 1

    echo "Unzipping datasets"
    # If extraction fails, remove the partial data/ directory; otherwise the
    # next run would see data/ and wrongly report the dataset as present.
    unzip -qq nlp-getting-started.zip -d data/ || { rm -rf data/; exit 1; }

    # The archive is only deleted once it has been extracted successfully.
    rm -f nlp-getting-started.zip

else

    echo "Dataset already downloaded."

fi

Dataset already dataset downloaded.


In [3]:
import numpy as np
import pandas as pd
 
import tensorflow as tf

!pip -q install transformers --user

import re
import transformers

# --- Model / tokenizer configuration ---
MODEL_TYPE = 'bert-base-uncased'   # pretrained checkpoint name passed to from_pretrained()
MAX_SEQUENCE_LENGTH = 160          # fixed token length of every encoded example

# --- Training configuration for the final run ---
LEARNING_RATE = 2e-5   # optimizer learning rate
EPOCHS = 2             # number of passes over the training split
BATCH_SIZE = 32        # examples per gradient step
DROPOUT = 0.1          # dropout rate used in the non-vanilla classifier head

In [4]:
# Directory the competition archive was extracted into (trailing slash required,
# because file names are appended by plain string concatenation).
ROOT_DIR = 'data/'

# Read both splits in one go: train first, then test.
train, test = (pd.read_csv(ROOT_DIR + name) for name in ('train.csv', 'test.csv'))

In [5]:
def remove_urls(text):
    """Return `text` with every http(s):// or www. URL deleted.

    A URL is taken to run up to the next whitespace character; the
    whitespace around it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html(text):
    """Return `text` with every `<...>` span deleted.

    Matching is non-greedy, so each tag is removed individually and the
    content between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)

def remove_emoji(text):
    """Return `text` with every run of characters in the ranges below deleted.

    NOTE(review): the final range spans U+24C2..U+1F251, which is far wider
    than emoji (it also covers e.g. CJK ideographs), so such characters are
    stripped as well. Kept as-is to preserve existing behaviour.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # One character class containing all ranges; "+" collapses consecutive hits.
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

In [6]:
%%time

# Light text clean-up, applied to train and test together so both splits
# receive exactly the same treatment, then split back apart by row position.

df = pd.concat([train, test], sort=False)

df['text'] = (
    df['text']
    .apply(remove_urls)
    .apply(remove_html)
    .apply(remove_emoji)
    # collapse every run of characters other than ASCII letters and '#' into one space
    .apply(lambda raw: re.sub(r'[^a-zA-Z#]+', ' ', raw))
)

# The first len(train) rows are the training split; the remainder is the test split.
train_cleaned = df.iloc[:len(train)]
# The test rows only carry a 'target' column because of the concat; drop it again.
test_cleaned = df.iloc[len(train):].drop(columns='target')

CPU times: user 140 ms, sys: 0 ns, total: 140 ms
Wall time: 140 ms


In [7]:
# from https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub

def bert_encode(texts, tokenizer, max_len):
    """Turn raw strings into fixed-length BERT inputs.

    Each text is tokenized, truncated to `max_len - 2` tokens, wrapped in
    "[CLS]" ... "[SEP]", converted to ids and right-padded with 0 up to
    `max_len`.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.
    tokenizer : object
        Must provide `tokenize(text)` and `convert_tokens_to_ids(tokens)`.
    max_len : int
        Length of every encoded row, special tokens included.

    Returns
    -------
    tuple of three np.ndarray
        (token ids, attention masks, segment ids). Masks are 1 over real
        tokens and 0 over padding; segment ids are all 0.
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # Leave two slots free for the [CLS] and [SEP] markers.
        word_pieces = tokenizer.tokenize(raw_text)[:max_len-2]
        sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids.extend([0] * n_pad)

        id_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [8]:
# Tokenizer matching the pretrained checkpoint named by MODEL_TYPE.
tokenizer = transformers.BertTokenizer.from_pretrained(MODEL_TYPE)

# Encode both splits identically: same tokenizer, same fixed sequence length.
# Each result is the (token ids, masks, segment ids) triple from bert_encode.
train_input, test_input = (
    bert_encode(frame['text'].values, tokenizer, max_len=MAX_SEQUENCE_LENGTH)
    for frame in (train_cleaned, test_cleaned)
)

# Labels for the training split.
train_labels = train_cleaned['target'].values

In [9]:
def create_model(learning_rate, is_vanilla=True):
    """Build and compile a BERT-based binary classifier.

    Parameters
    ----------
    learning_rate : float
        Learning rate given to the Adam optimizer.
    is_vanilla : bool, default True
        If True, the first position of BERT's sequence output feeds the
        sigmoid output layer directly. If False, the sequence output is
        average-pooled and passed through Dense(100, relu) and
        Dropout(DROPOUT) before the output layer.

    Returns
    -------
    tf.keras.Model
        Compiled model (binary cross-entropy loss, accuracy metric) taking
        three int32 inputs of length MAX_SEQUENCE_LENGTH, in the order
        [word ids, masks, segment ids], and producing one sigmoid output.
    """
    # All three inputs share the same 1-D shape, written as a proper tuple.
    input_shape = (MAX_SEQUENCE_LENGTH,)
    token_inputs = tf.keras.layers.Input(input_shape, dtype=tf.int32, name='input_word_ids')
    mask_inputs = tf.keras.layers.Input(input_shape, dtype=tf.int32, name='input_masks')
    seg_inputs = tf.keras.layers.Input(input_shape, dtype=tf.int32, name='input_segments')

    bert_model = transformers.TFBertModel.from_pretrained(MODEL_TYPE)
    # NOTE(review): tuple-unpacking assumes the model call returns a
    # (sequence_output, pooled_output) pair -- confirm against the installed
    # transformers version.
    seq_output, _ = bert_model([token_inputs, mask_inputs, seg_inputs])

    if is_vanilla:
        # just feeding the first position's vector into the final dense layer
        X = seq_output[:, 0, :]
    else:
        # pool -> dense -> dropout -> final dense
        X = tf.keras.layers.GlobalAveragePooling1D()(seq_output)
        X = tf.keras.layers.Dense(100, activation='relu')(X)
        X = tf.keras.layers.Dropout(DROPOUT)(X)

    output_ = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(X)

    model = tf.keras.models.Model([token_inputs, mask_inputs, seg_inputs], output_)

    # Use the caller's learning rate. Previously the global LEARNING_RATE was
    # used here, so the `learning_rate` argument had no effect.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [10]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the sweep (the source of the recommendation
# was not recorded in the original comment).
learning_rate=[2e-5, 3e-5, 5e-5]
batch_size=[8, 16]
# Not swept: every run below trains for one epoch, and the dropout rate is
# taken from the global DROPOUT inside create_model.
epochs=[1]
dropout=[0.1]

# Read the submission template once; every run fills in its own copy.
submission_template = pd.read_csv(ROOT_DIR + 'sample_submission.csv')

# Trying two different architectures per setting: vanilla BERT, or
# BERT + pool -> dense -> dropout.
# could use GridSearchCV and KerasClassifier but cannot handle multiple inputs
for rate in learning_rate:
    for batch in batch_size:
        for is_vanilla in [True, False]:
            print("\n===>Rate: {}, batch size: {}, is_vanilla: {}".format(rate,batch, is_vanilla))

            # Release the global Keras state left by the previous run, so that
            # building many BERT models in one loop does not keep growing memory.
            tf.keras.backend.clear_session()

            model=create_model(rate, is_vanilla=is_vanilla)
            history = model.fit(train_input,
                train_labels,
                validation_split=0.2,
                epochs=1,
                batch_size = batch,
                verbose=1)

            # One submission file per (rate, batch size, architecture) combination.
            submission = submission_template.copy()
            yhat = model.predict(test_input)
            submission['target'] = yhat.round().astype(int)
            submission.to_csv('submission_bert_{}_{}_{}.csv'.format(rate, batch, is_vanilla), index=False)


===>Rate: 2e-05, batch size: 8, is_vanilla: True
Train on 6090 samples, validate on 1523 samples

===>Rate: 2e-05, batch size: 8, is_vanilla: False
Train on 6090 samples, validate on 1523 samples

===>Rate: 2e-05, batch size: 16, is_vanilla: True
Train on 6090 samples, validate on 1523 samples

===>Rate: 2e-05, batch size: 16, is_vanilla: False
Train on 6090 samples, validate on 1523 samples

===>Rate: 3e-05, batch size: 8, is_vanilla: True
Train on 6090 samples, validate on 1523 samples

===>Rate: 3e-05, batch size: 8, is_vanilla: False
Train on 6090 samples, validate on 1523 samples

===>Rate: 3e-05, batch size: 16, is_vanilla: True
Train on 6090 samples, validate on 1523 samples

===>Rate: 3e-05, batch size: 16, is_vanilla: False
Train on 6090 samples, validate on 1523 samples

===>Rate: 5e-05, batch size: 8, is_vanilla: True
Train on 6090 samples, validate on 1523 samples

===>Rate: 5e-05, batch size: 8, is_vanilla: False
Train on 6090 samples, validate on 1523 samples

===>Rate: 

In [11]:
%%time

# Final run: pooled (non-vanilla) head, trained with the constants from the
# configuration cell; 20% of the training data is held out for validation.
model = create_model(LEARNING_RATE, is_vanilla=False)
model.fit(
    x=train_input,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=0.2,
)

Train on 6090 samples, validate on 1523 samples
Epoch 1/2
Epoch 2/2
CPU times: user 1min 32s, sys: 27.9 s, total: 2min
Wall time: 4min 27s


<tensorflow.python.keras.callbacks.History at 0x7f92e45c9400>

In [12]:
# Predict on the test split, round the sigmoid outputs to 0/1 labels and
# write them into the sample submission layout.
yhat = model.predict(test_input)
submission = (
    pd.read_csv(ROOT_DIR + 'sample_submission.csv')
    .assign(target=yhat.round().astype(int))
)
submission.to_csv('submission_bert.csv', index=False)

In [13]:
#!kaggle competitions submit -c nlp-getting-started -f submission_bert.csv -m "testing API submission"