# Data Preparation and Text Cleaning

The starter notebook for this competition uses DistilBERT model from Keras NLP. So that will be used here with some basic cleaning up of the tweets and of situations where the targets differ for multiple version of the same tweet. 

In [1]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

Collecting keras-core
  Downloading keras_core-0.1.5-py3-none-any.whl (924 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m924.6/924.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting namex (from keras-core)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, keras-core
Successfully installed keras-core-0.1.5 namex-0.0.7


In [2]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import keras_core as keras
import keras_nlp

import random as python_random
import re
import string
import emoji
import nltk

print("TensorFlow version:", tf.__version__)
print("KerasNLP version:", keras_nlp.__version__)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Using TensorFlow backend
TensorFlow version: 2.12.0
KerasNLP version: 0.6.1


In [3]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

print('Training Set Shape = {}'.format(train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(test.memory_usage().sum() / 1024**2))

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


In [4]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


A lot of the tweets in the dataset need to be cleaned up. Doing so should improve the results. In researching a way to clean up this text, the following Stack Overflow post was extremely helpful: https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python

In [6]:
train_clean_tweets = []
for tweet in train['text']:
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    emojis = emoji.distinct_emoji_list(tweet)
    tweet = ''.join(c for c in tweet if c not in emojis) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    #tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         #if w.lower() in tweet or not w.isalpha())
    train_clean_tweets.append(tweet)
    
train['clean_text'] = train_clean_tweets

In [7]:
train

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,Two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,ahrary The out of control wild fires in Calif...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.
7611,10872,,,Police investigating after an e-bike collided ...,1,Police investigating after an e-bike collided ...


In [8]:
test_clean_tweets = []
for tweet in test['text']:
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    emojis = emoji.distinct_emoji_list(tweet)
    tweet = ''.join(c for c in tweet if c not in emojis) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    #tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         #if w.lower() in tweet or not w.isalpha())
    test_clean_tweets.append(tweet)
    
test['clean_text'] = test_clean_tweets

# Varying Target Values for the Same Tweets

Looking at the number of unique values in each column of the train dataset it shows that there are 7613 total columns, but only 6922 of the input columns are unique, which is a total of 791 rows. That is a lot. The question whether a unique input value with many occurances are all labeled with the same target values.

In [9]:
train.nunique()

id            7613
keyword        221
location      3341
text          7503
target           2
clean_text    6922
dtype: int64

To explore this potential labeling issue, a new column called 'unique_input' is created to be able to look at some of the larger occurances of unique values.

In [10]:
train['unique_text'] = pd.factorize(train['clean_text'])[0] + 1

In [11]:
train

Unnamed: 0,id,keyword,location,text,target,clean_text,unique_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,2
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,3
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...",4
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,5
...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,Two giant cranes holding a bridge collapse int...,1080
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,ahrary The out of control wild fires in Calif...,6667
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,6497
7611,10872,,,Police investigating after an e-bike collided ...,1,Police investigating after an e-bike collided ...,1573


Looking at the top five unique occurances, only the 4th one, 4061, had variations in the target values. It doesn't appear to be a disaster, but 5 out of 17 occurances were coded as a disaster.

In [12]:
train['unique_text'].value_counts().nlargest(5)

5351    24
6848    20
4862    19
4061    17
447     15
Name: unique_text, dtype: int64

In [13]:
print(train.loc[train['unique_text'] == 4061])

        id    keyword             location  \
4391  6243  hijacking    perth, australia    
4392  6244  hijacking             Mongolia   
4393  6245  hijacking  brisbane, australia   
4394  6246  hijacking                China   
4396  6248  hijacking  Chiyoda Ward, Tokyo   
4397  6253  hijacking                 rome   
4399  6255  hijacking         EastCarolina   
4400  6256  hijacking               Brazil   
4403  6259  hijacking                  NaN   
4404  6261  hijacking               France   
4405  6262  hijacking                  NaN   
4407  6265  hijacking                tokyo   
4408  6267  hijacking                china   
4412  6272  hijacking               Brazil   
4414  6274  hijacking                  NaN   
4415  6276  hijacking                Japan   
4420  6283  hijacking                  NaN   

                                                   text  target  \
4391  #hot  Funtenna: hijacking computers to send da...       0   
4392  #hot  Funtenna: hijacking compu

There are 314 tweets that that are repeated more than once. There is a pretty good chance that some more of these may have different target codes for the same text. 

In [14]:
train['unique_text'].value_counts().ne(1).sum()

314

One way to correct this potential problem is to use the target mode for a set of duplicate tweets and change any targets that don't match to this mode value. For instance, in the example above for number 4061, the mode would be 0 and the 5 values that are not 0 would be changed to 0. 

To start this process a new dataframe is created to capture the mode for each unique tweet. 

In [15]:
train_unique_mode = train.groupby('unique_text').agg({'target': lambda x: x.value_counts().index[0]}).reset_index()

In [16]:
train_unique_mode

Unnamed: 0,unique_text,target
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
6917,6918,1
6918,6919,1
6919,6920,1
6920,6921,1


These mode values are then added as a new column called 'new_target' in the train dataset. 

In [17]:
train['new_target'] = train['unique_text'].map(train_unique_mode.set_index('unique_text')['target'])

In [18]:
train.head(25)

Unnamed: 0,id,keyword,location,text,target,clean_text,unique_text,new_target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,1,1
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,2,1
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,3,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...",4,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,5,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,RockyFire Update => California Hwy. 20 closed ...,6,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,flood disaster Heavy rain causes flash floodin...,7,1
7,13,,,I'm on top of the hill and I can see a fire in...,1,I'm on top of the hill and I can see a fire in...,8,1
8,14,,,There's an emergency evacuation happening now ...,1,There's an emergency evacuation happening now ...,9,1
9,15,,,I'm afraid that the tornado is coming to our a...,1,I'm afraid that the tornado is coming to our a...,10,1


It looks like there are 89 rows where the new target is not equal to the original target, which means 89 rows were changed based on looking at the mode of unique tweets with more than one occurance. 

In [19]:
len(train.query('new_target != target'))

89

# Preparing to Use the Model

The parameters from the starter notebook are used here and an 80/20 validation split is performed below. 

In [20]:
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES)*TRAIN_SPLIT // BATCH_SIZE

EPOCHS = 2
AUTO = tf.data.experimental.AUTOTUNE

In [21]:
from sklearn.model_selection import train_test_split

X = train["clean_text"]
y = train["new_target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SPLIT, random_state=42)

X_test = test["clean_text"]

To ensure the results are the same for multiple iterations

In [22]:
def reset_seeds():
   np.random.seed(42) 
   python_random.seed(42)
   tf.random.set_seed(42)

reset_seeds() 

# Running the Model

Text inputs need to be transformed to numeric token ids and arranged in several Tensors before being input to BERT.

The BertClassifier model can be configured with a preprocessor layer, in which case it will automatically apply preprocessing to raw inputs during fit(), predict(), and evaluate(). This is done by default when creating the model with from_preset().

The DistilBERT model that is chosen learns a distilled (approximate) version of BERT, retaining 97% performance but using only half the number of parameters ([paper](https://arxiv.org/abs/1910.01108)). 

It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT’s performances as measured on the GLUE language understanding benchmark.

Specifically, it doesn't have token-type embeddings, pooler and retains only half of the layers from Google's BERT.

In [23]:
# Load a DistilBERT model.
preset= "distil_bert_base_en_uncased"

# Use a shorter sequence length.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(preset,
                                                                   sequence_length=160,
                                                                   name="preprocessor_4_tweets"
                                                                  )

# Pretrained classifier.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset,
                                                               preprocessor = preprocessor, 
                                                               num_classes=2)

classifier.summary()

Downloading data from https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/vocab.txt
[1m231508/231508[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step       
Downloading data from https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/model.h5
[1m265570304/265570304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [24]:
# Compile
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), #'binary_crossentropy',
    optimizer=keras.optimizers.Adam(1e-5),
    metrics= ["accuracy"]  
)

# Fit
history = classifier.fit(x=X_train,
                         y=y_train,
                         batch_size=BATCH_SIZE,
                         epochs=EPOCHS, 
                         validation_data=(X_val, y_val)
                        )

Epoch 1/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 606ms/step - accuracy: 0.7374 - loss: 0.5479 - val_accuracy: 0.8562 - val_loss: 0.3668
Epoch 2/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 629ms/step - accuracy: 0.8535 - loss: 0.3619 - val_accuracy: 0.8628 - val_loss: 0.3530


# Submission 

In [25]:
def reset_seeds():
   np.random.seed(42) 
   python_random.seed(42)
   tf.random.set_seed(42)

reset_seeds() 

In [26]:
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [27]:
sample_submission["target"] = np.argmax(classifier.predict(X_test), axis=1)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 181ms/step


In [28]:
sample_submission.to_csv("submission.csv", index=False)