In [1]:
import os
# Set CUDA_VISIBLE_DEVICES to "-1" before tensorflow is imported in a later cell.
# NOTE(review): presumably this hides every GPU so the notebook runs on CPU only -
# confirm that CPU-only execution is intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from wordcloud import STOPWORDS

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [4]:
# ENABLE_GPU = False
# if ENABLE_GPU:
#     from tensorflow.python.client import device_lib
#     tf.config.list_physical_devices('GPU')

## Import dataset

In [5]:
# Load the train and test splits from CSV files under ../dataset
# (the paths are relative to the notebook's working directory).
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

## Preprocessing

In [6]:
# Keep the id columns aside before they are removed from the frames.
train_id, test_id = train['id'], test['id']

# Drop the id and location columns from both frames.
c = ['id', 'location']
train, test = (frame.drop(columns=c) for frame in (train, test))

#### Missing Values

In [7]:
for df in (train, test):
    # Replace missing keywords with the placeholder 'unknown'.
    df['keyword'] = df['keyword'].fillna('unknown')
    # Append the keyword to the end of each tweet, separated by one space.
    df['text'] = df['text'] + ' ' + df['keyword']

#### Truncations 

In [8]:
# Load the contraction lookup table and index it by its 'index' column,
# then preview the first rows.
df_truncation = pd.read_csv('../dataset/Truncations.csv').set_index('index')
df_truncation.head()

Unnamed: 0_level_0,Truncation
index,Unnamed: 1_level_1
ain't,am not / are not / is not / has not / have not
aren't,are not / am not
can't,cannot
can't've,cannot have
'cause,because


In [9]:
# Turn the 'Truncation' column into a plain dict keyed by the frame's index
# (contraction -> expansion, as the displayed output shows).
truncation = df_truncation.Truncation.to_dict()
truncation

{"ain't": 'am not / are not / is not / has not / have not',
 "aren't": 'are not / am not',
 "can't": 'cannot',
 "can't've": 'cannot have',
 "'cause": 'because',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he had / he would',
 "he'd've": 'he would have',
 "he'll": 'he shall / he will',
 "he'll've": 'he shall have / he will have',
 "he's": 'he has / he is',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how has / how is / how does',
 "I'd": 'I had / I would',
 "I'd've": 'I would have',
 "I'll": 'I shall / I will',
 "I'll've": 'I shall have / I will have',
 "I'm": 'I am',
 "I've": 'I have',
 "isn't": 'is not',
 "it'd": 'it had / it would',
 "it'd've": 'it would have',
 "it'll": 'it shall / it will',
 "it'll've": 'it shall have / it will have'

In [10]:
#compile all contraction words
# - keys are escaped so they are matched literally;
# - keys are tried longest-first, otherwise "can't" would win over "can't've"
#   and leave a dangling "'ve" behind;
# - the lookarounds keep a key from matching inside a longer word
#   (e.g. "he's" inside "she's").
truncations_re = re.compile(
    r"(?<!\w)(%s)(?!\w)" % '|'.join(
        re.escape(key) for key in sorted(truncation.keys(), key=len, reverse=True)
    )
)


#define function to expand contractions and showcase
def expand_contractions(s, contractions=None):
    """Replace every contraction found in ``s`` with its expansion.

    Parameters
    ----------
    s : str
        Text to process.
    contractions : dict, optional
        Mapping of contraction -> expansion. Defaults to the module-level
        ``truncation`` table (looked up when the function is called).

    Returns
    -------
    str
        ``s`` with each whole-word contraction replaced by
        ``contractions[contraction]``.
    """
    if contractions is None:
        contractions = truncation
    if not contractions:
        # An empty alternation would match the empty string everywhere.
        return s

    # Build the pattern from the table actually in use, so a caller-supplied
    # table cannot get out of sync with the regex. Keys are escaped, tried
    # longest-first (so "can't've" is not shadowed by "can't") and must not be
    # embedded in a longer word.
    alternatives = '|'.join(
        re.escape(key) for key in sorted(contractions, key=len, reverse=True)
    )
    pattern = re.compile(r"(?<!\w)(%s)(?!\w)" % alternatives)

    def replace(match):
        return contractions[match.group(0)]

    return pattern.sub(replace, s)

# Quick check on a single contraction; the cell displays the returned string.
expand_contractions("ain't")

'am not / are not / is not / has not / have not'

#### Strip Sentence
Remove URLs, @mentions, symbols, and emojis.

In [11]:
def strip(x):
    """Remove @mentions, URLs and non-alphanumeric characters from a string.

    Each match is replaced by a space and runs of whitespace are then
    collapsed, so the result has single spaces and no leading/trailing space.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The text reduced to ASCII letters and digits separated by single spaces.
    """
    # Raw string: `\w` and `\S` are regex escapes, not Python string escapes.
    # The mention pattern includes `_` so a handle such as @user_name is
    # removed as a whole instead of leaving "name" behind.
    pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", x).split())

# Only the last expression of a cell is displayed, so the result of this first call is not shown.
strip('@shawn Titanic #tragedy could have been prevented Economic Times: Telegraph.co.uk Titanic tragedy could have been prevented... http://bet.ly/tuN2wx')
strip('Barbados #Bridgetown JAMAICA Â‰Ã›Ã’ Two cars set ablaze: SANTA CRUZ Â‰Ã›Ã“ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J')

'Barbados Bridgetown JAMAICA Two cars set ablaze SANTA CRUZ Head of the St Elizabeth Police Superintende'

#### Remove Stopwords

'hello a world' -> 'hello world'

In [12]:
def remove_stopwords(x, stopwords=None):
    """Drop stopwords from a whitespace-separated string.

    Parameters
    ----------
    x : str
        Text to filter.
    stopwords : collection of str, optional
        Words to remove, compared against the lower-cased token. Defaults to
        the ``STOPWORDS`` collection imported from ``wordcloud``.

    Returns
    -------
    str
        The remaining words, in their original casing, joined by single spaces.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # Compare case-insensitively so capitalised stopwords ("The", "A") are
    # removed as well.
    return ' '.join(word for word in x.split() if word.lower() not in stopwords)

In [13]:
#Uncomment to see STOPWORDS
# STOPWORDS

#### Preprocess Tweets

In [14]:
# Toggle for the text-cleaning pass below.
CLEAN_TWEETS = True

if CLEAN_TWEETS:
    for df in (train, test):
        # Order matters: expand contractions while apostrophes are still
        # present, then strip symbols/URLs, then drop stopwords.
        df['text'] = (
            df['text']
            .apply(expand_contractions)
            .apply(strip)
            .apply(remove_stopwords)
        )

## Meta Features

In [15]:
def average_word_length(x):
    """Return the mean length of the whitespace-separated words in ``x``.

    Parameters
    ----------
    x : str
        Text to measure.

    Returns
    -------
    float
        Mean number of characters per word, or 0.0 when ``x`` contains no
        words (empty or whitespace-only), instead of the nan and
        RuntimeWarning that ``np.mean([])`` would produce.
    """
    words = x.split()
    if not words:
        return 0.0
    return np.mean([len(word) for word in words])

In [16]:
# for df in [train, test]:
#     #Word Count
#     #'hello hello a world' -> 4
#     df['word count'] = df['text'].apply(lambda x: len(x.split()))
    
#     #Character Count
#     #'hello hello a world' -> 19 (including space) 
#     df['character count'] = df['text'].apply(lambda x: len(x))
    
#     #Average word length
#     #'hello hello a world' -> 4
#     df['average word length'] = df['text'].apply(average_word_length)
    
#     #Unique word count
#     #'hello hello a world' -> 3 unique words
#     df['unique word count'] = df['text'].apply(lambda x: len(set(x.split())))
    
#     #Stopword count
#     #'hello hello a world' -> 1 stopword
#     df['stopword count'] = df['text'].apply(lambda x: len([i for i in x.lower().split() if i in STOPWORDS]))
    
#     #Stopword ratio
#     #'hello hello a world' -> 1/4 = 0.25
#     df['stopword ratio'] = df['stopword count'] / df['word count']
    
#     #URL count
#     #'hello hello a world' -> 0 URL count
#     df['url count'] = df['text'].apply(lambda x: len([i for i in x.lower().split() if 'http' in i or 'https' in i]))

In [17]:
# Separate the label column from the features: y holds the targets,
# train keeps the remaining columns.
y = train.target
train = train.drop(columns='target')

#### Standard scaler

In [18]:
# stdsc = StandardScaler()
# train.iloc[:, 2:] = stdsc.fit_transform(train.iloc[:, 2:])
# test.iloc[:, 2:] = stdsc.transform(test.iloc[:, 2:])

#### Train Test Split

In [19]:
# Split into 70% training / 30% validation rows with a fixed random_state.
# NOTE(review): x_train / x_val / y_train / y_val are not referenced by any later
# cell shown here; the model is fit on the full `train` encodings with
# `validation_split` instead - confirm whether this split is still needed.
x_train, x_val, y_train, y_val = train_test_split(train, y, 
                                                  train_size=0.7, 
                                                  random_state=2)

## Tokenizer

In [20]:
from transformers import BertTokenizer
# Load the tokenizer for the "bert-base-uncased" checkpoint, the same checkpoint
# name that is used when the BERT model is loaded further down.
TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")

In [21]:
# Round-trip a sample sentence through the tokenizer.
# In the BERT tokenizer, [CLS] is the reserved token that marks the start of a
# sequence, while [SEP] separates segments or sentences.
enc = TOKENIZER.encode("Hello World!")
dec = TOKENIZER.decode(enc)
print(f"Encode: {enc}")
print(f"Decode: {dec}")

Encode: [101, 7592, 2088, 999, 102]
Decode: [CLS] hello world! [SEP]


## BERT Model

In [22]:
from transformers import TFBertModel, BertModel

In [23]:
# Load the pretrained TensorFlow BERT model from the 'bert-base-uncased' checkpoint;
# it is later passed to build_model as the transformer layer.
bert_base = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [24]:
def bert_encode(data, maximum_len, tokenizer=None):
    """Tokenize every entry of ``data.text`` into fixed-length BERT inputs.

    Parameters
    ----------
    data : object with a ``text`` attribute
        Typically a DataFrame; ``data.text`` is iterated in order.
    maximum_len : int
        Length every sequence is padded or truncated to.
    tokenizer : optional
        Object exposing ``encode_plus``. Defaults to the module-level
        ``TOKENIZER``.

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, attention_masks)``, one row per text.
    """
    if tokenizer is None:
        tokenizer = TOKENIZER

    input_ids = []
    attention_masks = []

    # Iterate over the values themselves: `data.text[i]` is a label lookup and
    # fails (or returns the wrong row) when the index is not 0..n-1, e.g. on a
    # frame produced by train_test_split.
    for text in data.text:
        encoded = tokenizer.encode_plus(text,
                                        add_special_tokens=True,
                                        max_length=maximum_len,
                                        # Explicit padding/truncation replace the
                                        # deprecated `pad_to_max_length=True` and
                                        # silence the "Truncation was not
                                        # explicitly activated" warning.
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True)

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

## Hyperparameters

In [25]:
# Batch size passed to fit().
BATCH_SIZE = 16

# Number of training epochs passed to fit().
EPOCHS = 2

# We will not be using metadata. This is the default for build_model's `use_meta`
# argument, which build_model does not read.
USE_META = False

# Whether build_model inserts an extra ReLU Dense layer, and its width.
ADD_DENSE = False
DENSE_DIM = 64

# Whether build_model inserts a Dropout layer before the output, and its rate.
ADD_DROPOUT = True
DROPOUT = .2

# NOTE(review): not referenced by any cell shown in this notebook.
TRAIN_BASE = True

In [26]:
def build_model(model_layer, learning_rate, use_meta = USE_META, add_dense = ADD_DENSE,
               dense_dim = DENSE_DIM, add_dropout = ADD_DROPOUT, dropout = DROPOUT,
               max_len = 60):
    """Build and compile a binary classifier on top of a transformer layer.

    Parameters
    ----------
    model_layer : callable
        Called with ``[input_ids, attention_masks]``; element ``[1]`` of its
        output feeds the classification head.
        NOTE(review): for TFBertModel this is presumably the pooled output -
        confirm.
    learning_rate : float
        Learning rate for the Adam optimizer.
    use_meta : bool
        Currently ignored; the model is always built without meta-data inputs.
    add_dense : bool
        Insert a ReLU Dense layer of ``dense_dim`` units before the output.
    dense_dim : int
        Width of the optional Dense layer.
    add_dropout : bool
        Insert a Dropout layer with rate ``dropout`` before the output.
    dropout : float
        Dropout rate.
    max_len : int
        Sequence length of both inputs; must equal the length the inputs were
        encoded to. Defaults to 60, the value previously hard-coded here.

    Returns
    -------
    tf.keras.Model
        Model compiled with binary cross-entropy loss and an accuracy metric.
    """
    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')

    transformer_layer = model_layer([input_ids,attention_masks])

    output = transformer_layer[1]

    if add_dense:
        print("Training with additional dense layer...")
        output = tf.keras.layers.Dense(dense_dim,activation='relu')(output)

    if add_dropout:
        print("Training with dropout...")
        output = tf.keras.layers.Dropout(dropout)(output)

    # Single sigmoid unit for the binary target.
    output = tf.keras.layers.Dense(1,activation='sigmoid')(output)

    print("Training without meta-data...")
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)

    # `learning_rate` is the supported keyword; the `lr` alias is deprecated.
    model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [27]:
print('Encoding Tweets...')
# Encode both frames to a fixed length of 60 tokens; this must match the sequence
# length of the model's Input layers (60 in build_model).
train_input_ids,train_attention_masks = bert_encode(train,60)
test_input_ids,test_attention_masks = bert_encode(test,60)
print('Tweets encoded')
print('')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Encoding Tweets...
Tweets encoded



In [28]:
# Build the classifier on top of the loaded BERT model with a learning rate of 1e-5
# and print its layer summary.
BERT_base = build_model(bert_base, learning_rate = 1e-5)
BERT_base.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Training with dropout...
Training without meta-data...
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 60)]         0                                            
_______________________________________________________________________________



In [29]:
# Save weights only to 'base_model.h5', keeping the best result according to val_loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint('base_model.h5', monitor='val_loss', save_best_only = True, save_weights_only = True)

In [30]:
# Train on the encodings of the full `train` frame, holding out 20% for validation
# via validation_split; the x_train / x_val split created earlier is not used here.
history = BERT_base.fit([train_input_ids,train_attention_masks], y, validation_split = .2, epochs = EPOCHS, callbacks = [checkpoint], batch_size = BATCH_SIZE)

Epoch 1/2
Epoch 2/2


## Submission

In [31]:
# Predict on the encoded test tweets with the model as it stands after fit().
# NOTE(review): the weights saved to 'base_model.h5' by the checkpoint callback are
# not reloaded in any cell shown here - confirm whether the best weights should be
# restored before predicting.
prediction = BERT_base.predict([test_input_ids,test_attention_masks])



In [36]:
# Preview the predictions rounded to the nearest integer (the sigmoid output becomes 0 or 1).
np.round(prediction)

(3263, 1)

In [44]:
# Start from an empty frame with the two submission columns; they are filled in the next cell.
submission = pd.DataFrame(columns=['id', 'target'])

In [49]:
# Use the ids captured from the test frame before the id column was dropped.
submission['id'] = test_id
# Round the sigmoid outputs to 0/1 and store them as integers.
# NOTE(review): `prediction` appears to be 2-D with a single column (a recorded
# output shows (3263, 1)); confirm the installed pandas accepts that shape for a
# single-column assignment.
submission['target'] = np.round(prediction).astype('int64')

In [50]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [52]:
# Write the submission without the index column; the path is relative to the
# notebook's working directory, so ../submission must already exist.
submission.to_csv('../submission/submission_v1.csv', index=False)