# 0. Motivation

This notebook's goal is to explore a set of tweets and build an algorithm that determines whether a tweet is about a real disaster or not. It does so with Natural Language Processing (NLP), which combines machine learning techniques with textual and statistical approaches to transform text into a format that machine learning algorithms can understand.

# 0. Setup

In [2]:
import tokenization
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import tensorflow_hub as hub
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Input, Dropout
import tensorflow.compat.v1 as tf
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant
from keras.models import Sequential
from keras import layers, metrics, optimizers, losses
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib
import re
import string
import unicodedata
import numpy as np
import pandas as pd
import seaborn as sns

from collections import defaultdict


# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old
# names were removed afterwards, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
              else 'seaborn-bright')

# Tunable settings used by the cells below.
random_state_split = 42  # seed passed to train_test_split
Dropout_num = 0  # dropout rate of the BERT classification head; 0 means no Dropout layer
learning_rate = 5.95e-6  # Adam learning rate used when compiling the BERT model
valid = 0.15  # validation_split used when fitting the BERT model
epochs_num = 3  # number of epochs for the BERT model
batch_size_num = 16  # batch size for the BERT model
target_corrected = False  # not referenced by any cell visible in this notebook
target_big_corrected = False  # not referenced by any cell visible in this notebook


# 1. Exploring the data

We will use `pandas` to explore the datasets.

## 1.1 Training Set Data Preview

In [14]:
# Load the training and test splits from CSV files in the working directory.
training = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [None]:
# Training set preview: display the first rows of the training frame.
training.head()

## 1.2 Test Set Data Preview

In [None]:
# Test set preview: display the first rows of the test frame.
test.head()

## 1.3 Training Set Format

In [None]:
# Column names, dtypes and non-null counts of the training frame.
training.info()

### Breakdown of the columns in the training dataset:

- id: A unique identifier for each tweet.
- keyword: A particular keyword from the tweet (can be blank).
- location: The location the tweet was sent from (can be blank).
- text: The text of the tweet.
- target: `1` indicates that the tweet is about a real disaster, `0` that it is not.

## 1.4 Exploring the dimensions of the datasets

In [None]:
# Report the number of rows and columns of each dataset (test first, then training).
print('there are {} rows and {} columns in test.csv'.format(test.shape[0], test.shape[1]))
print('there are {} rows and {} columns in training.csv'.format(training.shape[0], training.shape[1]))

## 1.5 Visualizing the target classes

In [None]:
# Horizontal count plot: one bar per value of the `target` column.
plt.figure(figsize=(12, 6))
plt.title("Count of the target classes")
sns.countplot(y=training["target"], linewidth=1, palette='Set2')
plt.show()

*There are around 3200 samples of tweets about real disasters and about 4500 non-disaster tweets.*

## 1.6 Analyzing the length of the tweets

In [None]:
# Keep the `.str` accessors of each class's text: the following cells reuse
# `disaster_tweets` / `non_disaster_tweets` for word-level statistics.
disaster_tweets = training[training['target'] == 1]['text'].str
non_disaster_tweets = training[training['target'] == 0]['text'].str

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 7))

# Number of characters in every disaster tweet.
char_len_dis = disaster_tweets.len()
ax1.hist(char_len_dis, color='orange', edgecolor='black', linewidth=0.7)
ax1.set_title('disaster tweets')

# Number of characters in every non-disaster tweet.
char_len_ndis = non_disaster_tweets.len()
ax2.hist(char_len_ndis, color='green', edgecolor='black', linewidth=0.7)
ax2.set_title('non-disaster tweets')

# The histograms count characters per tweet; the previous title read "per training".
plt.suptitle("Characters per tweet")
plt.show()

*The character count of both disaster and non-disaster tweets is between 120 and 140.*

## 1.7 Analyzing the number of words in the tweets

In [None]:
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Words per tweet = number of whitespace-separated tokens in its text.
char_len_dis = disaster_tweets.split().map(len)
char_len_ndis = non_disaster_tweets.split().map(len)

# One histogram per class, left = disaster, right = non-disaster.
for axis, word_counts, bar_color, heading in (
        (ax1, char_len_dis, 'orange', 'disaster tweets'),
        (ax2, char_len_ndis, 'green', 'non-disaster tweets')):
    axis.hist(word_counts, color=bar_color, edgecolor='black', linewidth=0.7)
    axis.set_title(heading)

plt.suptitle("length of words in text")
plt.tight_layout()
plt.show()

*The number of words in disaster and non-disaster tweets are in the range 15 - 20.*

## 1.8 Analysis of the average length of words in each tweet

In [None]:
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# For every disaster tweet: the list of character lengths of its whitespace-separated words.
char_len_dis = disaster_tweets.split().apply(lambda x: [len(i) for i in x])
# Plot the distribution of the per-tweet mean word length.
sns.distplot(char_len_dis.map(lambda x: np.mean(x)), ax=ax1, color='orange')
ax1.set_title('disaster tweets')

# Same computation for the non-disaster tweets.
char_len_ndis = non_disaster_tweets.split().apply(lambda x: [len(i) for i in x])
sns.distplot(char_len_ndis.map(lambda x: np.mean(x)), ax=ax2, color='green')
ax2.set_title('non-disaster tweets')

plt.suptitle("Average length of words in tweets")
plt.tight_layout()
plt.show()

*The average word length for disaster tweets is in the range 7 - 7.5 characters, while for non-disaster tweets it is in the range 4.5 - 5.*

In [None]:
# Creating a corpus for further analysis.
def create_corpus(target):
    """Collect every whitespace-separated token from the tweets of one class.

    Reads the module-level `training` frame, keeps the rows whose `target`
    column equals ``target`` and returns the tokens of their `text` column
    as a single flat list, in row order.
    """
    class_texts = training.loc[training['target'] == target, 'text']
    return [token for tokens in class_texts.str.split() for token in tokens]

## 1.9 Analysing the top stop words in text.

In [None]:
def analyse_stopwords(func, target):
    """Plot the ten most frequent English stop words for two classes of tweets.

    Parameters
    ----------
    func : callable
        Called as ``func(label)``; must return an iterable of tokens for that label.
    target : sequence
        Two class labels. The first one is drawn in the left panel
        ("non-disaster tweets"), the second in the right panel ("disaster tweets").
    """
    # Only tokens that are NLTK English stop words are tallied. Previously every
    # token was counted, so the "top stop words" chart could show arbitrary tokens.
    english_stop_words = set(stopwords.words('english'))

    values_list = []
    for label in target:
        dic = defaultdict(int)
        for word in func(label):
            if word in english_stop_words:
                dic[word] += 1
        top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]
        # zip(*[]) cannot be unpacked into two names, so fall back to empty series.
        x_items, y_values = zip(*top) if top else ((), ())
        values_list.append(x_items)
        values_list.append(y_values)

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 7))
    ax1.barh(values_list[0], values_list[1], color="lightblue", edgecolor='black', linewidth=1.2)
    ax1.set_title("non-disaster tweets")

    ax2.barh(values_list[2], values_list[3], color="lightgreen", edgecolor='black', linewidth=1.2)
    ax2.set_title("disaster tweets")

    plt.suptitle("Top stop words in the dataset")
    plt.show()


analyse_stopwords(create_corpus, [0, 1])

*From the bar chart we can observe that the most frequently occurring stopwords in both disaster/non-disaster tweets is "the" (1000+ occurrences) while the least occurring for non-disaster is "for" (400+ occurrences) and for disaster tweets is "is" (300+ occurrences).*

## 1.10 Analysing punctuation

In [None]:
def analyse_punctuation(func, target):
    """Plot how often punctuation tokens occur in two classes of tweets.

    ``func(label)`` must return an iterable of tokens for each label in
    ``target``. The first label is drawn in the left panel ("non-disaster
    tweets"), the second in the right panel ("disaster tweets").
    """
    punctuation = string.punctuation
    series = []
    for label in target:
        counts = defaultdict(int)
        for token in func(label):
            # `in` on a str is a substring test: a whole token is counted when it
            # appears contiguously inside string.punctuation (e.g. "-" or "=>").
            if token in punctuation:
                counts[token] += 1
        symbols, totals = zip(*counts.items())
        series.extend([symbols, totals])

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    panels = (
        (ax1, 0, "lightblue", "non-disaster tweets"),
        (ax2, 2, "lightgreen", "disaster tweets"),
    )
    for axis, offset, bar_color, heading in panels:
        axis.bar(series[offset], series[offset + 1],
                 color=bar_color, edgecolor='black', linewidth=0.7)
        axis.set_title(heading)

    plt.suptitle("punctuation in tweets")
    plt.show()


analyse_punctuation(create_corpus, [0, 1])


*From the bar chart we can see that the punctuation with the most occurrences in both disaster/non-disaster tweets is "-" (350+) while the ones with the least occurrences for non-disaster are "%", ":", "$", "_" and for disaster tweets they are "=>" and ")".*

## 1.11 Missing values analysis

Both the training and test datasets have missing values in the same two columns, `location` and `keyword`. Of all the missing values:

- **2.3% - 2.4%** are missing keywords in both sets
- **97.6% - 97.7%** are missing locations in both sets

In [None]:
# Count of null values per column in each dataset.
missing_train = training.isnull().sum()
missing_test = test.isnull().sum()
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# Keep only the columns that have nulls, smallest count first.
missing_train = missing_train[missing_train > 0].sort_values()
# The pie shows each column's share of all nulls. The labels are hard-coded, which
# assumes exactly two columns have nulls and that `keyword` sorts before `location`.
ax1.pie(missing_train, autopct='%1.1f%%', startangle=30, explode=[0.9, 0], labels=["keyword", "location"],
        colors=['yellow', 'cyan'])
ax1.set_title("null values present in the training dataset")

# Same chart for the test set (same labelling assumption as above).
missing_test = missing_test[missing_test > 0].sort_values()
ax2.pie(missing_test, autopct='%1.1f%%', startangle=30, explode=[0.9, 0], labels=["keyword", "location"],
        colors=['yellow', '#66ff00'])
ax2.set_title("null values present in the test dataset")

plt.suptitle("Distribution of null values in the dataset")
plt.tight_layout()
plt.show()


*The pie charts above show how the missing values in each dataset are distributed across columns. Only the `keyword` and `location` columns contain missing values. For the training data, `location` accounts for 97.6% of the missing values and `keyword` for 2.4%, while for the test dataset it is 97.7% for `location` and 2.3% for `keyword`. In both datasets, `location` is the column with the most missing values and `keyword` the one with the fewest.*

## 1.11 Analysing the top 20 disastrous keywords

In [None]:
# Horizontal bar chart of the 20 most frequent `keyword` values among tweets with target == 1.
plt.figure(figsize=(10, 7))
training[training['target'] == 1]['keyword'].value_counts()[:20].plot(kind='barh', fontsize=12, color='#0096FF',
                                                                      linewidth=0.7,
                                                                      title='Top 20 keywords in disastrous tweets')
plt.show()


*From the above bar chart it is evident that `outbreak`, `wreckage` and `derailment` are the most frequent keywords in disastrous tweets, with close to 40 occurrences each.*

## 1.12 Analysing the top 20 disastrous locations

In [None]:
# Horizontal bar chart of the 20 most frequent `location` values among tweets with target == 1.
plt.figure(figsize=(10, 7))
training[training["target"] == 1]["location"].value_counts()[:20].plot(kind='barh', fontsize=12, color='#0096FF',
                                                                       linewidth=0.7,
                                                                       title='Top 20 most frequent locations for disastrous tweets')
plt.show()


*From the above bar chart we can see that, from the tweets that do have a location, the United States is the most frequent.*

# 2. Cleaning up the data

There are some words/characters/strings that need to be removed or reformatted. We will be using the Natural Language Toolkit's stopwords set and WordNetLemmatizer to group together different forms of a word so that they can be analysed as one.

Here are the operations we will execute on each entry:

- Remove all URLs.
- Remove all emojis.
- Remove all HTML tags.
- Lowercase all text.
- Remove words of 2 characters or fewer.


In [None]:
# Shared helpers used by cleanup_text below.
lemmatizer = WordNetLemmatizer()
# NLTK's English stop words, extended with every single punctuation character.
stop_words = set(stopwords.words('english'))
stop_words.update(string.punctuation)


def remove_emoji(text):
    """Return `text` with every run of characters from the listed code-point ranges removed."""
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, "", text, flags=re.UNICODE)


def remove_punct(text):
    """Return `text` without any of the characters in string.punctuation."""
    return "".join(char for char in text if char not in string.punctuation)


def cleanup_text(texts):
    """Normalise raw tweets into space-joined, lemmatized keyword strings.

    Every text goes through, in order: ASCII folding, HTML-tag removal, URL
    removal, emoji removal, punctuation removal, replacement of all remaining
    non-letters by spaces, lower-casing, dropping tokens of two characters or
    fewer, stop-word removal, and noun- followed by verb-lemmatization.

    Relies on the module-level `stop_words`, `lemmatizer`, `remove_emoji` and
    `remove_punct`.

    Returns a list holding one cleaned string per input text, in input order.
    """

    def _clean(raw):
        # NFKD-normalise, then drop every character that has no ASCII encoding
        folded = unicodedata.normalize('NFKD', raw)
        cleaned = folded.encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # html tags and urls become a single space
        cleaned = re.sub(r'<.*?>', ' ', cleaned)
        cleaned = re.sub(r'https?://\S+|www\.\S+|http?://\S+', ' ', cleaned)

        # emojis and punctuation are deleted outright
        cleaned = remove_punct(remove_emoji(cleaned))

        # anything that is not a letter becomes a space, then lower-case
        cleaned = re.sub(r'[^a-zA-Z]', ' ', cleaned).lower()

        # keep tokens longer than 2 characters that are not stop words
        kept = [token for token in cleaned.split()
                if len(token) > 2 and token not in stop_words]

        # lemmatize each token as a noun first, then as a verb
        lemmas = [lemmatizer.lemmatize(lemmatizer.lemmatize(token), pos='v')
                  for token in kept]
        return " ".join(lemmas)

    return [_clean(text) for text in texts]


# Store the cleaned version of every tweet next to the raw text, for both datasets.
training['clean_text'] = cleanup_text(training['text'])
test['clean_text'] = cleanup_text(test['text'])
training.head()


*We can observe the effect of the cleanup and how the algorithm leaves only the important words from each tweet.*

In [None]:
# Draw 10 random tweets (no seed is set, so the sample changes between runs)
# and print the raw and cleaned text of the first three.
sample_df = training.sample(n=10).reset_index(drop=True)
for i in range(3):
    print("-" * 100)
    print(f"BEFORE: {sample_df.loc[i, 'text']}\n")
    print(f"AFTER: {sample_df.loc[i, 'clean_text']}")

# 3. Visualizing the data


## 3.1 Analyse the top 20 words in the training data

In [None]:
disaster_tweet_clean = training[training.target == 1]["clean_text"]
non_disaster_tweet_clean = training[training.target == 0]["clean_text"]

color = ['Paired', 'Accent']
dataSplit = [disaster_tweet_clean, non_disaster_tweet_clean]
title = ["disaster tweets", "non-disaster tweets"]

# One horizontal bar chart per class showing its 20 most frequent cleaned tokens.
for tweets, heading, cmap_name in zip(dataSplit, title, color):
    token_counts = pd.Series(' '.join(tweets).split()).value_counts()
    plt.figure(figsize=(20, 8))
    plt.title(heading, fontsize=14)
    token_counts.head(20).plot(kind='barh', fontsize=14, colormap=cmap_name,
                               edgecolor='black', linewidth=0.7)
    plt.show()


*`fire` seems to be the most frequent word among the disaster tweets (+270), while `like` is the most frequent for the non-disaster tweets (+260). Other frequent words in the disaster tweets are `news`, `amp`, and `disaster`. For the non-disaster tweets `amp`, `get`, and `new` are also frequent.*

## 3.2 Further cleanup

From the above graphs we see that, although we cleaned the data, there are still some unnecessary words left, such as `like`, `amp`, `get`. We will now remove them.

In [None]:
# Frequent but uninformative tokens that are dropped from the cleaned tweets.
common_words = [
    'via', 'like', 'build', 'get', 'would', 'one', 'two', 'feel', 'lol', 'fuck',
    'take', 'way', 'may', 'first', 'latest', 'want', 'make', 'back', 'see', 'know',
    'let', 'look', 'come', 'got', 'still', 'say', 'think', 'great', 'pleas', 'amp',
]


def text_cleaning(data):
    """Return `data` with every whitespace-separated token listed in `common_words` removed.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in data.split():
        if token in common_words:
            continue
        kept.append(token)
    return ' '.join(kept)


# Overwrite `clean_text` in place with the common words removed (both datasets).
training["clean_text"] = training["clean_text"].apply(text_cleaning)
test["clean_text"] = test["clean_text"].apply(text_cleaning)

Let's review the data now

## 3.3 Top 20 words in the training data (after a thorough cleanup)

In [None]:
disaster_tweet_clean = training[training.target == 1]["clean_text"]
non_disaster_tweet_clean = training[training.target == 0]["clean_text"]

color = ['Paired', 'Accent']
dataSplit = [disaster_tweet_clean, non_disaster_tweet_clean]
title = ["disaster tweets", "non-disaster tweets"]

# Re-draw the top-20 token charts now that the common words have been removed.
for position, class_tweets in enumerate(dataSplit):
    plt.figure(figsize=(20, 8))
    plt.title(title[position], fontsize=14)
    all_tokens = ' '.join(class_tweets).split()
    top_twenty = pd.Series(all_tokens).value_counts().head(20)
    top_twenty.plot(kind='barh', fontsize=14, colormap=color[position],
                    edgecolor='black', linewidth=0.7)
    plt.show()

*After the cleanup, some of the most frequent words in the disaster tweets dataset are `fire`, `news`, `disaster`, `california`, `suicide`. In the non-disaster tweets, the most frequent are `new`, `body`, `time`, `day`.*

## 3.4 Plotting Common N-grams

In [None]:
def top_n_grams(data, top_grams_num, grams):
    """Return the most frequent n-grams of a collection of texts.

    Parameters
    ----------
    data : iterable of str
        The documents to analyse.
    top_grams_num : int
        Maximum number of n-grams to return.
    grams : int
        Size of the n-grams (2 for bigrams, 3 for trigrams, ...).

    Returns
    -------
    list of (str, count)
        ``(n-gram, total occurrences)`` pairs sorted by descending count. An empty
        list is returned when ``grams`` is smaller than 1.
    """
    if grams < 1:
        return []

    # A single code path serves every n: only the n-gram range differs.
    count_vec = CountVectorizer(ngram_range=(grams, grams))
    bow = count_vec.fit_transform(data)
    # Column sums of the document-term matrix = corpus-wide frequency of each n-gram.
    add_words = bow.sum(axis=0)
    word_freq = [(word, add_words[0, idx]) for word, idx in count_vec.vocabulary_.items()]
    word_freq.sort(key=lambda pair: pair[1], reverse=True)

    return word_freq[:top_grams_num]

### 3.4.1 Bigrams

In [None]:
# Ten most frequent bigrams of each class.
disaster_bigrams = top_n_grams(training[training['target'] == 1]["clean_text"], 10, 2)
non_disaster_bigrams = top_n_grams(training[training['target'] == 0]["clean_text"], 10, 2)

disaster_bigrams_df = pd.DataFrame(disaster_bigrams, columns=['word', 'freq'])
non_disaster_bigrams_df = pd.DataFrame(non_disaster_bigrams, columns=['word', 'freq'])
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))

# Top panel: disaster tweets; bottom panel: non-disaster tweets.
panels = [
    (ax1, disaster_bigrams_df, "lightblue", "Top 10 disaster bigrams in the dataset"),
    (ax2, non_disaster_bigrams_df, "lightgreen", "Top 10 non-disaster bigrams in the dataset."),
]
for axis, frame, bar_color, heading in panels:
    axis.bar(frame["word"], frame["freq"], color=bar_color, edgecolor='black', linewidth=0.7)
    axis.set_title(heading)
    axis.set_xlabel("Words")
    axis.set_ylabel("Frequency")
    axis.set_xticklabels(rotation=90, labels=frame["word"], fontsize=14)

plt.tight_layout(pad=1.85)
plt.show()

*`suicide bomber`, `northern california`, and `oil spill` are among the most frequent bigrams in the disaster tweets dataset, while `liked youtube`, `cross body`, and `youtube video` are among the most common bigrams in the non-disaster dataset.*

### 3.4.2 Trigrams

In [None]:
# Ten most frequent trigrams of each class.
disaster_trigrams = top_n_grams(training[training['target'] == 1]["clean_text"], 10, 3)
non_disaster_trigrams = top_n_grams(training[training['target'] == 0]["clean_text"], 10, 3)

disaster_trigrams_df = pd.DataFrame(disaster_trigrams, columns=['word', 'freq'])
non_disaster_trigrams_df = pd.DataFrame(non_disaster_trigrams, columns=['word', 'freq'])
_, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))


def _draw_trigram_panel(axis, frame, bar_color, heading):
    """Draw one labelled frequency bar chart of `frame` on `axis`."""
    axis.bar(frame["word"], frame["freq"], color=bar_color, edgecolor='black', linewidth=0.7)
    axis.set_title(heading)
    axis.set_xlabel("Words")
    axis.set_ylabel("Frequency")
    axis.set_xticklabels(rotation=90, labels=frame["word"], fontsize=14)


_draw_trigram_panel(ax1, disaster_trigrams_df, "lightblue",
                    "Top 10 disaster trigrams in the dataset")
_draw_trigram_panel(ax2, non_disaster_trigrams_df, "lightgreen",
                    "Top 10 non-disaster trigrams in the dataset.")
plt.tight_layout(pad=1.85)
plt.show()

*When it comes to trigrams, `suicide bomber detonated`, and `northern california wildfire` are among the most frequent ones in the disaster tweets dataset, while `liked youtube video`, `cross body bag`, and `reddit quarantine offensive` are among the most common in the non-disaster dataset.*

# 4. Creating the models

A natural way to represent text for computers is to encode each character individually, but this seems quite inadequate for representing and understanding language. Our goal is to first create a useful embedding for each sentence (or tweet) in our dataset, and then use these embeddings to accurately predict the relevant category.

The simplest approach we can start with is to use a bag of words model, and apply a logistic regression on top. A bag of words just associates an index to each word in our vocabulary, and embeds each sentence as a list of 0s, with a 1 at each index corresponding to a word present in the sentence.

*Credit: [NLP Tutorial](https://github.com/hundredblocks/concrete_NLP_tutorial/blob/master/NLP_notebook.ipynb)*

## 4.1 Bag of Words Counts

In [None]:
def cv(data):
    """Fit a bag-of-words CountVectorizer on `data`.

    Returns a tuple ``(count matrix of data, fitted vectorizer)`` so the same
    vocabulary can later be applied to other texts with ``transform``.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(data)
    return counts, vectorizer

# Cleaned tweets and their labels as plain Python lists.
list_corpus = training["clean_text"].tolist()
list_labels = training["target"].tolist()

# Hold out 20% of the labelled tweets; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, 
                                                                                random_state=random_state_split)

# The vocabulary is learned on the training part only and then applied to the held-out part.
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

### Visualizing the embeddings

In [None]:
def plot_LSA(test_data, test_labels, plot=True):
    """Project feature vectors onto two LSA components and scatter-plot them by class.

    Parameters
    ----------
    test_data : array-like or sparse matrix
        Feature matrix, one row per document, passed to ``TruncatedSVD``.
    test_labels : sequence
        Class label of each row; used directly as the colour value of each point
        (orange is labelled 'Not' and blue 'Real' in the legend).
    plot : bool, default True
        When False the decomposition is still computed but nothing is drawn.

    Draws on the current matplotlib figure; the caller creates and shows it.
    """
    # fit_transform fits the 2-component decomposition and returns the projected rows
    # in one step, replacing the earlier separate fit() and transform() calls.
    lsa_scores = TruncatedSVD(n_components=2).fit_transform(test_data)
    if plot:
        colors = ['orange', 'blue']
        plt.scatter(lsa_scores[:, 0], lsa_scores[:, 1], s=8, alpha=.8,
                    c=test_labels, cmap=matplotlib.colors.ListedColormap(colors))
        orange_patch = mpatches.Patch(color='orange', label='Not')
        blue_patch = mpatches.Patch(color='blue', label='Real')
        plt.legend(handles=[orange_patch, blue_patch], prop={'size': 30})


# 2-D LSA view of the bag-of-words vectors of the training split, coloured by label.
fig = plt.figure(figsize=(16, 16))
plot_LSA(X_train_counts, y_train)
plt.show()


The embeddings do not look very clearly separated. Let's see if we can do something about it.

## 4.2 Term Frequency — Inverse Document Frequency

In [None]:
def tfidf(data):
    """Fit a TfidfVectorizer on `data`.

    Returns a tuple ``(TF-IDF matrix of data, fitted vectorizer)``.
    """
    vectorizer = TfidfVectorizer()
    weighted = vectorizer.fit_transform(data)
    return weighted, vectorizer

# Learn the TF-IDF vocabulary and weights on the training split, then apply them to the held-out split.
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# 2-D LSA view of the TF-IDF vectors of the training split, coloured by label.
fig = plt.figure(figsize=(16, 16))          
plot_LSA(X_train_tfidf, y_train)
plt.show()

## 4.3 GloVe

We will use GloVe pretrained corpus model to represent our words.

[Credit](https://www.kaggle.com/shahules/basic-eda-cleaning-and-glove#GloVe-for-Vectorization)

In [None]:
def create_corpus(df):
    """Tokenize the `clean_text` column of `df` into lists of lower-cased alphabetic tokens.

    Returns one token list per row, in row order.

    NOTE: this rebinds the name `create_corpus`, which an earlier cell defines with
    a different signature (class label -> flat token list). Cells that call the
    earlier version fail if they are re-run after this one.
    """
    corpus = []
    for tweet in tqdm(df['clean_text']):
        tokens = word_tokenize(tweet)
        corpus.append([token.lower() for token in tokens if token.isalpha()])
    return corpus

In [None]:
# Token lists of the training tweets only (the test tweets are not included).
corpus=create_corpus(training)

In [None]:
# Map every word in the GloVe file to its vector. Each line holds the word
# followed by the vector components, separated by whitespace.
embedding_dict = {}
# The `with` block closes the file on exit, so no explicit close() call is needed.
with open('./glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [None]:
# Every tweet is represented by exactly MAX_LEN token ids.
MAX_LEN=50
# Build the word -> integer id vocabulary from the training corpus.
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

# Longer sequences are cut at the end, shorter ones are padded at the end.
tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [None]:
# Vocabulary learned by the tokenizer (word -> integer id).
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

In [None]:
# One row per tokenizer id plus one extra row; rows start out as zeros.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

# Copy the GloVe vector of every vocabulary word that has one; words missing
# from GloVe keep their all-zero row.
for word, row in tqdm(word_index.items()):
    glove_vector = embedding_dict.get(word)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# Display the padded id sequence of the first tweet.
tweet_pad[0][0:]

### Baseline model with GloVe

In [None]:
# Frozen embedding layer initialised with the GloVe matrix built above.
embedding = Embedding(num_words, 100, embeddings_initializer=Constant(embedding_matrix),
                      input_length=MAX_LEN, trainable=False)

# Embedding -> spatial dropout -> single LSTM -> sigmoid output for the binary target.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=3e-4), metrics=['accuracy'])

model.summary()


In [None]:
train_v = tweet_pad[:training.shape[0]]
# `tweet_pad` is built from the training corpus only, so slicing past its end
# always gave an empty array. Encode the test tweets with the tokenizer fitted
# on the training corpus and pad them the same way instead.
test_v = pad_sequences(tokenizer_obj.texts_to_sequences(create_corpus(test)),
                       maxlen=MAX_LEN, truncating='post', padding='post')


In [None]:
# Hold out 15% of the padded training sequences for validation. The split is
# seeded like the earlier bag-of-words split so re-runs give the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    train_v, training['target'].values, test_size=0.15,
    random_state=random_state_split)
print('Shape of train', X_train.shape)
print("Shape of Validation ", X_test.shape)


In [None]:
# 2-D LSA view of the padded token-id sequences, coloured by label.
fig = plt.figure(figsize=(16, 16))
plot_LSA(train_v, training['target'])
plt.show()


In [None]:
# Train the LSTM model and evaluate on the held-out 15% after every epoch.
history = model.fit(X_train, y_train, batch_size=4, epochs=10,
                    validation_data=(X_test, y_test), verbose=2)


In [None]:
# Predict on all padded training sequences (this includes the rows used for fitting)
# and round the sigmoid outputs to 0/1 labels.
train_pred_GloVe = model.predict(train_v)
train_pred_GloVe_int = train_pred_GloVe.round().astype('int')

## 4.4 BERT using TFHub

[Credits](https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub)

In [9]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode texts into fixed-length token-id, mask and segment-id arrays.

    Each text is tokenized with ``tokenizer.tokenize``, cut to ``max_len - 2``
    tokens, wrapped in "[CLS]" ... "[SEP]", converted to ids with
    ``tokenizer.convert_tokens_to_ids`` and right-padded with zeros.

    Returns a tuple of three numpy arrays with one row per text:
    token ids, padding mask (1 for real tokens, 0 for padding) and
    segment ids (all zeros).
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        token_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)


In [10]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT layer.

    Parameters
    ----------
    bert_layer : callable
        Layer called with ``[input_word_ids, input_mask, segment_ids]``; its second
        output is used as the per-token sequence output.
    max_len : int, default 512
        Length of each of the three integer input sequences.

    Returns
    -------
    A compiled Keras ``Model`` with a single sigmoid output. It uses the
    module-level ``Dropout_num`` (dropout before the output layer, skipped when 0)
    and ``learning_rate`` (Adam learning rate).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Classify from the vector at position 0 of the sequence output.
    clf_output = sequence_output[:, 0, :]

    # Dropout is only inserted when a non-zero rate is configured.
    head_input = clf_output if Dropout_num == 0 else Dropout(Dropout_num)(clf_output)
    out = Dense(1, activation='sigmoid')(head_input)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `lr` is deprecated in favour of `learning_rate` (see the warning in the recorded output).
    model.compile(Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [5]:
# Load BERT from the Tensorflow Hub.
# trainable=True is passed so the BERT weights are updated during training.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

### Build and train BERT model

In [7]:
# Load tokenizer from the bert layer: the vocabulary file and the lower-casing
# flag are read from the loaded module so the tokenizer matches the model.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [15]:
# Encode the text into tokens, masks, and segment flags.
# The raw `text` column is used here, not `clean_text`.
train_input = bert_encode(training.text.values, tokenizer, max_len=160)
test_input = bert_encode(test.text.values, tokenizer, max_len=160)
train_labels = training.target.values


In [16]:
# Build BERT model; max_len must match the length used in bert_encode above.
model_BERT = build_model(bert_layer, max_len=160)
model_BERT.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [17]:
# Save the model to 'model_BERT.h5' only when the validation loss improves.
checkpoint = ModelCheckpoint(
    'model_BERT.h5', monitor='val_loss', save_best_only=True)

# Fine-tune on the training tweets, holding out the `valid` fraction for validation.
train_history = model_BERT.fit(
    train_input, train_labels,
    validation_split=valid,
    epochs=epochs_num,  # recommended 3-5 epochs
    callbacks=[checkpoint],
    batch_size=batch_size_num
)


Epoch 1/3


In [None]:
# Prediction by BERT model: restore the checkpoint written during training,
# predict on the test inputs and round the sigmoid outputs to 0/1 labels.
model_BERT.load_weights('model_BERT.h5')
test_pred_BERT = model_BERT.predict(test_input)
test_pred_BERT_int = test_pred_BERT.round().astype('int')

In [None]:
# Prediction by BERT model for the training data - for the Confusion Matrix.
# These inputs include the rows the model was trained on.
train_pred_BERT = model_BERT.predict(train_input)
train_pred_BERT_int = train_pred_BERT.round().astype('int')

In [None]:
# Histogram of the raw (unrounded) BERT predictions on the test set.
pred = pd.DataFrame(test_pred_BERT, columns=['preds'])
pred.plot.hist()


## 4.5 Showing confusion matrix

In [None]:
def plot_cm(y_true, y_pred, title, figsize=(5, 5)):
    """Draw a confusion-matrix heat map annotated with row percentages and counts.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; their unique values define the rows and columns.
    y_pred : array-like
        Predicted labels.
    title : str
        Title of the figure.
    figsize : tuple, default (5, 5)
        Size passed to ``plt.subplots``.

    Diagonal cells show ``percent`` and ``count/row total``; non-empty off-diagonal
    cells show ``percent`` and ``count``; empty off-diagonal cells are left blank.
    Requires ``confusion_matrix`` from ``sklearn.metrics``.
    """
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    # An object array holds annotation strings of any length; a fixed-width
    # string array would silently truncate long ones.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # cm_sum has shape (n, 1): take the scalar row total, not a 1-element array.
                s = cm_sum[i, 0]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    plt.title(title)
    sns.heatmap(cm, cmap="YlGnBu", annot=annot, fmt='', ax=ax)


In [None]:
# Showing Confusion Matrix for the GloVe model.
# plot_cm expects (y_true, y_pred). The labels live in the `training` frame; no
# frame named `train` exists. ravel() flattens the model output to one dimension.
plot_cm(training['target'].values, train_pred_GloVe_int.ravel(),
        'Confusion matrix for GloVe model', figsize=(7, 7))


In [None]:
# Showing Confusion Matrix for the BERT model.
# plot_cm expects (y_true, y_pred). The labels live in the `training` frame; no
# frame named `train` exists. ravel() flattens the model output to one dimension.
plot_cm(training['target'].values, train_pred_BERT_int.ravel(),
        'Confusion matrix for BERT model', figsize=(7, 7))
