In [1]:
import pandas as pd
import preprocessor as prep_t
import spacy
import re
import string
import numpy as np

from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dropout, LeakyReLU, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

import time
import psutil

# Data set

In [2]:
df = pd.read_csv('../data/train.csv')

# Preprocessing

## Mislabeled data

In [3]:
df_mislabeled = df.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']

In [4]:
df['target_relabeled'] = df['target'].copy() 
df.loc[df['text'] == 'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 'target_relabeled'] = 0
df.loc[df['text'] == 'Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife', 'target_relabeled'] = 0
df.loc[df['text'] == 'To fight bioterrorism sir.', 'target_relabeled'] = 0
df.loc[df['text'] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 'target_relabeled'] = 1
df.loc[df['text'] == 'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 'target_relabeled'] = 1
df.loc[df['text'] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 'target_relabeled'] = 0
df.loc[df['text'] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 'target_relabeled'] = 0
df.loc[df['text'] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 'target_relabeled'] = 1
df.loc[df['text'] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 'target_relabeled'] = 1
df.loc[df['text'] == "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 'target_relabeled'] = 0
df.loc[df['text'] == "wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 'target_relabeled'] = 0
df.loc[df['text'] == "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam", 'target_relabeled'] = 0
df.loc[df['text'] == "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!", 'target_relabeled'] = 0
df.loc[df['text'] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 'target_relabeled'] = 0
df.loc[df['text'] == "Caution: breathing may be hazardous to your health.", 'target_relabeled'] = 1
df.loc[df['text'] == "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????", 'target_relabeled'] = 0
df.loc[df['text'] == "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect", 'target_relabeled'] = 0
df.loc[df['text'] == "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time", 'target_relabeled'] = 0

In [5]:
df = df.drop(columns=['target'])
df = df.rename(columns={'target_relabeled':'target'})

## Clean location

Keep only useful location values (countries, cities)  using spaCy's pre-trained NER model

In [8]:
nlp = spacy.load("en_core_web_sm")

In [9]:
def extract_gpe(text):
    doc = nlp(text)
    
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            return ent.text
    

In [10]:
df.loc[df.location.notnull(), 'gpe_extracted'] = df.loc[df.location.notnull(), 'location'].map(extract_gpe)
df.loc[df.location.isin(['USA', 'Wordwide']), 'gpe_extracted'] =  df.loc[df.location.isin(['USA', 'Wordwide']), 'location']
df.loc[df.location =='M!$$!$$!PP!', 'location'] = "MISSISIPPI"

## Clean text

Clean text by removing \x89, removing urls, mention and emojis

In [14]:
prep_t.set_options(prep_t.OPT.URL, prep_t.OPT.EMOJI, prep_t.OPT.MENTION)

In [15]:
df['cleaned_text']=df['text'].apply(lambda x:re.sub(r"(.*[a-zA-Z]?)\x89[^\W]*([a-zA-Z]?.*)", r"\1, \2", x))
df['cleaned_text'] = df['cleaned_text'].map(prep_t.clean)

Replace consecutive dots by "..."

In [16]:
consequitivedots = re.compile(r'\.{2,}')
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: consequitivedots.sub(' ... ', x))

Remove no-ASCII charcters

In [17]:
def remove_non_ascii(text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r'', text)

In [18]:
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: remove_non_ascii(x))
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.replace('#', ' ').replace('@', " "))

df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join(x.split()))
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.strip())

# Feature engineering

This part aims to create some meta data varaible about tweets

## Text meta data

Create several variables:
* Create a list of frequent keywords for true disaster tweets. Assign 1 if the keyword is in the list else 0
* numbre words in the tweet
* mean length of words
* number of characters
* punctuation count
* hashtag count
* number of urls
* url length
* length of cleaned text
* Create a list of frequent keywords for true disaster tweets. Assign 1 if the keyword is in the list else 0


In [19]:
tmp = df.groupby('keyword')['target'].mean().reset_index()
tmp = tmp.sort_values('target', ascending = False)

true_keywords = tmp[tmp.target > 0.5].keyword.values

df['is_true_keyword'] =  df['keyword'].isin(true_keywords)

In [23]:
#word_count
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))

In [24]:
#mean_word_length
df['mean_word_length'] = df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [25]:
#number of characters
df['char_count'] = df['text'].apply(lambda x: len(str(x)))

In [26]:
df['punctuation_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

df['hashtag_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

df['mention_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

df['url_count'] = df['text'].apply(lambda x: len(prep_t.parse(x).urls) if np.all(pd.notnull(prep_t.parse(x).urls)) else 0)

In [30]:
def length_url(text):
    parsed_tweet = prep_t.parse(text)
    c= 0
    if np.any(pd.notnull(parsed_tweet.urls)):
        for i in range(len(parsed_tweet.urls)):
            c = c+ len(parsed_tweet.urls[i].match)
    else:
        pass
    
    return c

In [31]:
df['len_url'] = df['text'].apply(lambda x: length_url(x))

In [32]:
df['len_cleaned_text'] = df['cleaned_text'].map(len)

In [33]:
tmp = df.groupby('gpe_extracted')['target'].mean().reset_index()

tmp = tmp.sort_values('target', ascending = False)

true_gpe_extracted = tmp[tmp.target > 0.5].gpe_extracted.values

df['true_gpe_extracted'] = df['gpe_extracted'].isin(true_gpe_extracted)

# BERT + META DATA

## Select columns

Keep only useful columns

In [34]:
df = df[['target', 'is_true_keyword', 'word_count', 'mean_word_length', 'char_count',
       'punctuation_count', 'hashtag_count', 'mention_count', 'url_count',
       'len_url', 'cleaned_text', 'len_cleaned_text', 'true_gpe_extracted']].copy()
df['is_true_keyword'] = df['is_true_keyword'].map(int) #transform from boolean to int
df['true_gpe_extracted'] = df['true_gpe_extracted'].map(int)

## Params

In [35]:
meta_data_cols = ['is_true_keyword','word_count', 'mean_word_length', 'char_count',
                  'punctuation_count', 'hashtag_count', 'mention_count', 'url_count',
                  'len_url', 'len_cleaned_text', 'true_gpe_extracted']

Set parameters

In [39]:
ntargets = 2
seq_max=84
nb_meta = len(meta_data_cols)

loss = "binary_crossentropy"
activation = "softmax"

We select subsamples here (the training on overall dataset was done on colab to boost the training speed with GPU

In [40]:
df_train = df[:300].copy()
df_val = df[300:400].copy()

## Train set

Define train set (X,y)

In [41]:
X = df_train.drop(columns = ['target'])
y = df_train['target'].values

In [46]:
# transform target column to a matrix
y_categorical = to_categorical(y)
nb_labels = len(np.unique(y))

# with BERT tokenizer's batch_encode_plus batch of both the sentences areencoded together and separated by [SEP] token.
sequence = X["cleaned_text"].values.tolist()
tokenizer = BertTokenizer.from_pretrained('bert-base-cased') 
seqs = tokenizer.batch_encode_plus(sequence,
                                   max_length=seq_max, 
                                   pad_to_max_length=True)

# convert batch of encoded features to numpy array.
X_seq, X_attention = (np.asarray(seqs["input_ids"]), np.asarray(seqs["attention_mask"]))
# tuple of metadata dataframe and number of metadata
X_meta, nb_meta_features = (X[meta_data_cols], len(meta_data_cols))

# create X_input and y_input 
X_input = [X_seq, X_attention, X_meta]
y_input = y_categorical

## Validation set

In [48]:
X_val = df_val.drop(columns=['target'])
y_val = df_val['target'].values
y_categorical_val = to_categorical(y_val)

sequence_val = X_val["cleaned_text"].values.tolist()
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
seqs_val = tokenizer.batch_encode_plus(sequence_val,
                                   max_length=seq_max, 
                                   pad_to_max_length=True)

X_seq_val, X_attention_val = (np.asarray(seqs_val["input_ids"]), np.asarray(seqs_val["attention_mask"]))
X_meta_val, nb_meta_features_val = (X_val[meta_data_cols], len(meta_data_cols))

X_input_val = [X_seq_val, X_attention_val, X_meta_val]
y_input_val = y_categorical_val

## Model

In [49]:
# instantiate a Keras tensor for text and attention_mask
text_input = Input(shape=(seq_max,), dtype="int32")
attention_input = Input(shape=(seq_max,), dtype="int32")

# Create BERT architecture
x = TFBertModel.from_pretrained("bert-base-cased")(inputs = text_input, attention_mask = attention_input)[0][:, 0, :]

# instantiate a Keras tensor for metadata
Meta_input = Input(shape=(nb_meta,), dtype="float32")

# list of inputs
inputs = [text_input, attention_input, Meta_input]

# first NN model for metadata
concatenate_1 = Meta_input

y = Dense(150, activation="linear")(concatenate_1)
y = Dropout(0.2)(y)
y = LeakyReLU(alpha=0.05)(y)
y = Dense(100, activation="linear")(y)
y = Dropout(0.2)(y)
y = LeakyReLU(alpha=0.05)(y)
y = Dense(80, activation="linear")(y)
y = Dropout(0.2)(y)
y = LeakyReLU(alpha=0.05)(y)

# concatenate meta data embeddings with encoded text by BERT
concatenate_2 = Concatenate(axis=1)([x, y])

# NN for output
z = Dense(200, activation="linear")(concatenate_2)
z = Dropout(0.2)(z)
z = LeakyReLU(alpha=0.05)(z)
z = Dense(100, activation="linear")(z)
z = Dropout(0.2)(z)
z = LeakyReLU(alpha=0.05)(z)
output = Dense(ntargets, activation=activation)(z)

model = Model(inputs=inputs, outputs=output)
model.compile(optimizer=Adam(learning_rate=5e-5), loss=loss, metrics=["accuracy"])

## Fit & estimate

Fit and estimate

In [34]:
def get_available_memory():
    return psutil.virtual_memory()._asdict()['available']

In [35]:
memory_start = get_available_memory()
training_start = time.time()
model.fit(X_input, y_input, batch_size=16, epochs = 1, validation_data=(X_input_val, y_input_val))
training_end = time.time()
model_memory = memory_start - get_available_memory()
model_memory = round(model_memory / 1e9 * 1024 , 1)
print('Our model is using {} Mb memory (RAM).'.format(str(model_memory)))

Our model is using -376.1 Mb memory (RAM).


## Fit all train.csv data

See colab notebook

# Prediction

## Test data set

In [57]:
df_test = pd.read_csv('../data/test.csv')

In [58]:
df_test.loc[df_test.location.notnull(), 'gpe_extracted'] = df_test.loc[df_test.location.notnull(), 'location'].map(extract_gpe)
df_test.loc[df_test.location.isin(['USA', 'Wordwide']), 'gpe_extracted'] =  df_test.loc[df_test.location.isin(['USA', 'Wordwide']), 'location']
df_test.loc[df_test.location =='M!$$!$$!PP!', 'location'] = "MISSISIPPI"

df_test['cleaned_text']=df_test['text'].apply(lambda x:re.sub(r"(.*[a-zA-Z]?)\x89[^\W]*([a-zA-Z]?.*)", r"\1, \2", x))
df_test['cleaned_text'] = df_test['cleaned_text'].map(prep_t.clean)
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: consequitivedots.sub(' ... ', x))
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: remove_non_ascii(x))
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: x.replace('#', ' ').replace('@', " "))
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: ' '.join(x.split()))

df_test['is_true_keyword'] =  df_test['keyword'].isin(true_keywords)

df_test['word_count'] = df_test['text'].apply(lambda x: len(str(x).split()))

df_test['mean_word_length'] = df_test['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

df_test['char_count'] = df_test['text'].apply(lambda x: len(str(x)))

df_test['punctuation_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

df_test['hashtag_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

df_test['mention_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

df_test['url_count'] = df_test['text'].apply(lambda x: len(prep_t.parse(x).urls) if np.all(pd.notnull(prep_t.parse(x).urls)) else 0)

df_test['len_url'] = df_test['text'].apply(lambda x: length_url(x))

df_test['len_cleaned_text'] = df_test['cleaned_text'].map(len)

df_test['true_gpe_extracted'] = df_test['gpe_extracted'].isin(true_gpe_extracted)

In [59]:
df_test = df_test[['is_true_keyword', 'word_count', 'mean_word_length', 'char_count',
       'punctuation_count', 'hashtag_count', 'mention_count', 'url_count',
       'len_url', 'cleaned_text', 'len_cleaned_text', 'true_gpe_extracted']].copy()

df_test['is_true_keyword'] = df_test['is_true_keyword'].map(int)
df_test['true_gpe_extracted'] = df_test['true_gpe_extracted'].map(int)

In [60]:
df_test = df_test[:10]

Predictions

In [75]:
y_res = []
for i in range(df_test.shape[0]):
    df_test_copy = deepcopy(df_test.loc[i:i, :])
    
    
    sequence_test = df_test_copy["cleaned_text"].values.tolist()
    #tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    seqs_test = tokenizer.batch_encode_plus(sequence_test,
                                       max_length=seq_max, 
                                       pad_to_max_length=True)
   
    X_seq_test, X_attention_test = (np.asarray(seqs_test["input_ids"]), np.asarray(seqs_test["attention_mask"]))
    
    X_meta_test, nb_meta_features_test = (df_test_copy[meta_data_cols], len(meta_data_cols))
    
    X_input_test = [X_seq_test, X_attention_test, X_meta_test]
    
    y_res.append(np.argmax(model.predict(X_input_test), axis =1)[0])

In [76]:
y_res

[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

Save

In [228]:
output = pd.read_csv('./../data/test.csv')

output = output[['id']]

output['target'] = y_res
#output[['id', 'target']].to_csv('submissions_bert_meta.csv', sep=",", index = False)