# NLP text classification analysis

In [2]:
!pip install confuse

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting confuse
  Downloading confuse-2.0.0-py3-none-any.whl (24 kB)
Installing collected packages: confuse
Successfully installed confuse-2.0.0


Packages

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import zipfile

import logging
import confuse
import os
from datetime import datetime, timedelta
#from pyspark.sql.functions import *
#import pyspark.sql.functions as f
#import sparknlp
#from sparknlp.base import *

#from pyspark.sql.types import *
#from pyspark.sql.window import Window
#from pyspark.sql.functions import substring_index
#from sparknlp.base import *
#from sparknlp.annotator import *
import re
import string
#from sparknlp.pretrained import PretrainedPipeline
#import spacy
#import en_core_web_md

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from statistics import mean
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

Constants

In [20]:
# Mini-batch size.
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it should be passed to model.fit.
BATCH_SIZE = 32
# Upper bound on the vocabulary size; passed as max_tokens to the TextVectorization layer.
VOCAB_LENGTH = 15000

## Text data loading & ETL

In [9]:
# Load the labelled training set (text + target) and the unlabelled test set (text only).
# Both paths are relative, so the CSV files must sit in the notebook's working directory.
train_df = pd.read_csv("train_dataset.csv")[["text", "target"]]
test_df = pd.read_csv("test_dataset.csv")[["text"]]
print(train_df.head())

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [12]:
# Preview the first rows of the unlabelled test set.
print(test_df.head())

                                                text
0                 Just happened a terrible car crash
1  Heard about #earthquake is different cities, s...
2  there is a forest fire at spot pond, geese are...
3           Apocalypse lighting. #Spokane #wildfires
4      Typhoon Soudelor kills 28 in China and Taiwan


In [13]:
# Class balance of the training labels (the test frame has no "target" column).
print(f"Value counts of train dataset: {train_df.target.value_counts()}")

Value counts of test dataset: 0    4342
1    3271
Name: target, dtype: int64


In [15]:
# Hold out 10% of the labelled data for validation; random_state is fixed so the split is repeatable.
train_text, val_text, train_label, val_label = train_test_split(train_df["text"].to_numpy(), 
                                                                train_df["target"].to_numpy(), 
                                                                test_size=0.1, 
                                                                random_state=42)

# Sanity check: texts and labels must have matching lengths within each split.
print(train_text.shape, val_text.shape, train_label.shape, val_label.shape)

(6851,) (762,) (6851,) (762,)


In [16]:
# Peek at the first five raw training texts.
print(train_text[:5])

["'McFadden Reportedly to Test Hamstring Thursday' via @TeamStream http://t.co/jWq4KvJH2j"
 'w--=-=-=-[ NEMA warns Nigerians to prepare for drought http://t.co/5uoOPhSqU3'
 "When I was cooking earlier I got electrocuted some crucial ?????? now I'm psychic lol"
 "I'm On Fire.  http://t.co/WATsmxYTVa"
 "More than 40 families affected by the fatal outbreak of Legionnaires' disease in Edinburgh are to sue two comp... http://t.co/vsoXioOy78"]


In [17]:
# Labels aligned with the five texts printed above.
print(train_label[:5])

[0 1 0 0 1]


## Model Training

### Tokenization

In [22]:
# Average number of whitespace-separated tokens per training text, truncated to an int.
# Despite the name this is a per-text token count, not a character length; it is used
# below as the tokenizer's output_sequence_length and as the embedding input_length.
mean_word_length = int(mean([len(w.split()) for w in train_text]))

In [23]:
# Text -> integer token ids. Every text is padded or truncated to mean_word_length tokens.
tokenizer = layers.TextVectorization(
    max_tokens=VOCAB_LENGTH,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=mean_word_length,
)

# Learn the vocabulary from the training split only.
tokenizer.adapt(train_text)

In [24]:
# Vocabulary learned by the tokenizer; its length sizes the embedding tables of the models below.
vocab = tokenizer.get_vocabulary()
# The head of the list starts with the '' and '[UNK]' entries (see the printed output),
# followed by the most common tokens; the tail holds the rarest retained tokens.
frequent_words = vocab[:10]
unfrequent_words = vocab[-10:]
print(f"Frequent words: {frequent_words}\n Unfrequent words: {unfrequent_words}")

Frequent words: ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is']
 Unfrequent words: ['httptcoeejrmktj0r', 'httptcoeeewo207mx', 'httptcoee8rpoahpk', 'httptcoedznx6goud', 'httptcoedekrmqtpq', 'httptcoed32pbvio7', 'httptcoed1vpitswy', 'httptcoecxmoinzgu', 'httptcoecmj18azai', 'httptcoecd7hizja1']


### Dense model

In [25]:
class DenseModel():
    """Factory for a baseline text classifier.

    Pipeline built by ``get_model``: raw string -> tokenizer -> embedding ->
    global average pooling -> dense output layer.

    Parameters
    ----------
    tokenizer: Adapted text vectorization layer applied to the raw string input.
    n_units: Number of units in the output Dense layer.
    activation: Activation of the output Dense layer.
    vocab: Optional vocabulary sequence used to size the embedding table.
        Defaults to ``tokenizer.get_vocabulary()`` so the model no longer
        depends on a notebook-level ``vocab`` variable.
    mean_word_length: Optional value forwarded to the embedding layer as
        ``input_length``. When None the argument is omitted and the sequence
        length is taken from the tokenizer output.
    embedding_output_dim: Size of each embedding vector.
    """
    def __init__(self, tokenizer, n_units=1, activation="sigmoid", vocab=None, mean_word_length=None, embedding_output_dim=128):
        self.tokenizer = tokenizer
        self.n_units = n_units
        self.activation = activation

        # Fall back to the tokenizer's own vocabulary instead of reading a global.
        if vocab is None:
            vocab = tokenizer.get_vocabulary()
        self.vocab = vocab
        self.mean_word_length = mean_word_length
        self.embedding_output_dim = embedding_output_dim

        self.embedding_input_dim = len(vocab)
    
    def get_model(self, model_name="dense_model"):
        """Build and return the (uncompiled) Keras model named ``model_name``."""
        dense_inputs = layers.Input(shape=(1, ), dtype="string")
        
        x = self.tokenizer(dense_inputs)

        embedding_kwargs = {
            "input_dim": self.embedding_input_dim,
            "output_dim": self.embedding_output_dim,
            "embeddings_initializer": "uniform",
            "name": "embedding",
        }
        # Only forward input_length when the caller supplied one explicitly.
        if self.mean_word_length is not None:
            embedding_kwargs["input_length"] = self.mean_word_length
        dense_embedding = layers.Embedding(**embedding_kwargs)
        x = dense_embedding(x)
        
        # Collapse the token axis so the dense head sees one vector per text.
        x = layers.GlobalAveragePooling1D()(x)
        dense_outputs = layers.Dense(units=self.n_units, activation=self.activation)(x)
        dense_model = tf.keras.Model(dense_inputs, dense_outputs, name=model_name)
        
        return dense_model

### LSTM model

In [26]:
class LSTMModel():
    """Factory for a single-layer LSTM text classifier.

    Pipeline built by ``get_model``: raw string -> tokenizer -> embedding ->
    LSTM -> dense output layer.

    Parameters
    ----------
    tokenizer: Layer applied to the string input before the embedding.
    vocab: Vocabulary sequence; its length sizes the embedding table.
    mean_word_length: Forwarded to the embedding layer as ``input_length``.
    embedding_output_dim: Size of each embedding vector.
    n_lstm_units: Number of units in the LSTM layer.
    n_dense_units: Number of units in the output Dense layer.
    activation: Activation of the output Dense layer.
    """
    def __init__(self, tokenizer, vocab, mean_word_length, embedding_output_dim=128, n_lstm_units=64, n_dense_units=1, activation="sigmoid"):
        # One embedding row per vocabulary entry.
        self.embedding_input_dim = len(vocab)

        self.tokenizer = tokenizer
        self.vocab = vocab
        self.mean_word_length = mean_word_length

        self.embedding_output_dim = embedding_output_dim
        self.n_lstm_units = n_lstm_units
        self.n_dense_units = n_dense_units
        self.activation = activation

    def get_model(self, model_name="lstm_model"):
        """Build and return the (uncompiled) Keras model named ``model_name``."""
        # Fix TensorFlow's global seed before any layer is created.
        tf.random.set_seed(42)

        embedding_config = {
            "input_dim": self.embedding_input_dim,
            "output_dim": self.embedding_output_dim,
            "embeddings_initializer": "uniform",
            "input_length": self.mean_word_length,
            "name": "embedding-lstm",
        }
        token_embedding = layers.Embedding(**embedding_config)

        text_input = layers.Input(shape=(1, ), dtype="string")
        token_ids = self.tokenizer(text_input)
        embedded = token_embedding(token_ids)
        sequence_state = layers.LSTM(units=self.n_lstm_units)(embedded)
        head = layers.Dense(units=self.n_dense_units, activation=self.activation)

        return tf.keras.Model(text_input, head(sequence_state), name=model_name)

### Bidirectional LSTM model

In [27]:
class BidirectionalLSTMModel():
    """Factory for a bidirectional-LSTM text classifier.

    Pipeline built by ``get_model``: raw string -> tokenizer -> embedding ->
    Bidirectional(LSTM) -> dense output layer.

    Parameters
    ----------
    tokenizer: Layer applied to the string input before the embedding.
    vocab: Vocabulary sequence; its length sizes the embedding table.
    mean_word_length: Forwarded to the embedding layer as ``input_length``.
    embedding_output_dim: Size of each embedding vector.
    n_lstm_units: Number of units in the wrapped LSTM layer.
    n_dense_units: Number of units in the output Dense layer.
    activation: Activation of the output Dense layer.
    """
    def __init__(self, tokenizer, vocab, mean_word_length, embedding_output_dim=128, n_lstm_units=64, n_dense_units=1, activation="sigmoid"):
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.mean_word_length = mean_word_length
        self.embedding_output_dim = embedding_output_dim
        self.n_lstm_units = n_lstm_units
        self.n_dense_units = n_dense_units
        self.activation = activation
        
        self.embedding_input_dim = len(vocab)
        
    def get_model(self, model_name="bidirectional_lstm_model"):
        """Build and return the (uncompiled) Keras model named ``model_name``."""
        tf.random.set_seed(42)
        
        bidir_lstm_embedding = layers.Embedding(input_dim=self.embedding_input_dim,
                                               output_dim=self.embedding_output_dim,
                                               embeddings_initializer="uniform",
                                               input_length=self.mean_word_length,
                                               name="embedding-bidirectional-lstm")
        bidir_lstm_inputs = layers.Input(shape=(1, ), dtype="string")
        x = self.tokenizer(bidir_lstm_inputs)
        x = bidir_lstm_embedding(x)
        x = layers.Bidirectional(layers.LSTM(units=self.n_lstm_units))(x)
        # Honour the configured head instead of a hard-coded 1-unit sigmoid;
        # the constructor defaults reproduce the previous behaviour.
        bidir_lstm_outputs = layers.Dense(units=self.n_dense_units, activation=self.activation)(x)
        bidir_lstm_model = tf.keras.Model(bidir_lstm_inputs, bidir_lstm_outputs, name=model_name)
        
        return bidir_lstm_model

### Ensemble modeling

In [28]:
def get_ensemble_models(models, train_data, train_label, val_data, num_iter=10, num_epochs=100, loss_funcs=("binary_crossentropy",)):
    """
    Trains and returns an ensemble of independently trained models.

    One member is trained for every (iteration, model, loss function) combination, so
    the returned list has num_iter * len(models) * len(loss_funcs) entries.
    For instance, 3 models, num_iter = 10 and the default single loss give 30 members.

    The first time a given model is used, the passed-in instance itself is trained
    (so callers holding a reference to it still see it trained). Every later member
    built from that model is a freshly initialised clone, so members do not share
    weights and the list does not contain the same object more than once.

    Parameters
    ----------
    models: NLP models passed.
    train_data: Training text dataset before tokenization and embedding.
    train_label: Training label dataset.
    val_data: Validation data forwarded to ``model.fit(validation_data=...)``.
    num_iter: Number of training rounds over ``models``.
    num_epochs: Maximum number of epochs per member.
    loss_funcs: Iterable of loss identifiers; one member is trained per loss.

    Returns
    -------
    List of trained models, ordered by iteration, then model, then loss function.
    """
    def _fresh_copy(model):
        # clone_model rebuilds every layer from its config, which gives new weights.
        # The TextVectorization layer is reused as-is so its adapted vocabulary is kept.
        return tf.keras.models.clone_model(
            model,
            clone_function=lambda layer: layer
            if isinstance(layer, tf.keras.layers.TextVectorization)
            else layer.__class__.from_config(layer.get_config()),
        )

    ensemble_models = []
    already_trained = set()  # ids of passed-in models that have been fitted once
    
    for n_iter in range(num_iter):
        for model in models:
            for loss_func in loss_funcs:
                print(f"Reducing: {loss_func} for epochs: {num_epochs}, num_iter: {n_iter}, model: {model.name}")

                if id(model) in already_trained:
                    member = _fresh_copy(model)
                else:
                    already_trained.add(id(model))
                    member = model
                
                member.compile(loss=loss_func, optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
                
                # NOTE(review): with the default num_epochs=100 neither patience value
                # (200 / 100) can elapse, so these callbacks do not trigger in a default
                # run; confirm the intended values.
                member.fit(train_data, train_label, epochs=num_epochs, verbose=2, validation_data=val_data,
                           callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=200, restore_best_weights=True),
                                      tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", patience=100, verbose=1)])
                
                ensemble_models.append(member)
    
    return ensemble_models

In [29]:
def make_ensemble_preds(ensemble_models, input_data):
    """
    Collects the predictions of every ensemble member on the same input.

    Parameters
    ----------
    ensemble_models: Trained ensemble of models.
    input_data: Data to be predicted with.

    Returns
    -------
    Tensor built from the per-model predictions (one entry per model, in
    ensemble order) after all size-1 dimensions are squeezed out.
    """
    # One predict() call per member, kept in ensemble order.
    per_model_preds = [member.predict(input_data) for member in ensemble_models]

    squeezed = tf.squeeze(per_model_preds)
    return tf.constant(squeezed)

In [30]:
def get_ensemble_models_summary(models):
    """
    Prints the Keras summary of every model in the ensemble.

    Parameters
    ----------
    models: Ensemble of models.
    """
    for model in models:
        # summary() prints the table itself and returns None, so wrapping it in
        # print() would add a stray "None" line after each summary.
        model.summary()

### Training ensemble models

In [31]:
# Build one untrained instance of each architecture; all three share the same adapted tokenizer.
dense_model_obj = DenseModel(tokenizer=tokenizer)
dense_model = dense_model_obj.get_model()

lstm_model_obj = LSTMModel(tokenizer=tokenizer, vocab=vocab, mean_word_length=mean_word_length)
lstm_model = lstm_model_obj.get_model()

bidir_lstm_model_obj = BidirectionalLSTMModel(tokenizer=tokenizer, vocab=vocab, mean_word_length=mean_word_length)
bidir_lstm_model = bidir_lstm_model_obj.get_model()

# Order matters: later cells index this list as [0]=dense, [1]=LSTM, [2]=bidirectional LSTM.
ensemble_models = [dense_model, lstm_model, bidir_lstm_model]

In [None]:
# Train the ensemble. This rebinds ensemble_models from the list of untrained models
# to the list returned by get_ensemble_models, so re-running this cell feeds the
# previous result back in as input.
ensemble_models = get_ensemble_models(models=ensemble_models, 
                                      train_data=train_text, 
                                      train_label=train_label, 
                                      val_data=(val_text, val_label), 
                                      num_iter=10, num_epochs=100)

Reducing: binary_crossentropy for epochs: 100, num_iter: 0, model: dense_model
Epoch 1/100
215/215 - 7s - loss: 0.6087 - accuracy: 0.6990 - val_loss: 0.5379 - val_accuracy: 0.7730 - lr: 0.0010 - 7s/epoch - 34ms/step
Epoch 2/100
215/215 - 4s - loss: 0.4301 - accuracy: 0.8253 - val_loss: 0.4847 - val_accuracy: 0.7913 - lr: 0.0010 - 4s/epoch - 19ms/step
Epoch 3/100
215/215 - 4s - loss: 0.3269 - accuracy: 0.8726 - val_loss: 0.4813 - val_accuracy: 0.7874 - lr: 0.0010 - 4s/epoch - 18ms/step
Epoch 4/100
215/215 - 5s - loss: 0.2561 - accuracy: 0.9056 - val_loss: 0.4996 - val_accuracy: 0.7743 - lr: 0.0010 - 5s/epoch - 23ms/step
Epoch 5/100
215/215 - 4s - loss: 0.2033 - accuracy: 0.9285 - val_loss: 0.5198 - val_accuracy: 0.7756 - lr: 0.0010 - 4s/epoch - 19ms/step
Epoch 6/100
215/215 - 5s - loss: 0.1636 - accuracy: 0.9437 - val_loss: 0.5496 - val_accuracy: 0.7769 - lr: 0.0010 - 5s/epoch - 21ms/step
Epoch 7/100
215/215 - 5s - loss: 0.1329 - accuracy: 0.9558 - val_loss: 0.5831 - val_accuracy: 0.761

## Model Inferencing and Evaluation

### Evaluation functions

In [None]:
def evaluate_preds(y_true, y_pred):
    """
    Computes classification metrics for a set of predicted labels.

    Parameters
    ----------
    y_true: Ground-truth labels.
    y_pred: Predicted labels.

    Returns
    -------
    Dict with "accuracy" (scaled by 100), plus "precision", "recall" and "f1_score"
    as returned by precision_recall_fscore_support with average="weighted".
    "info" holds the fourth value that function returns.
    """
    accuracy_pct = 100 * accuracy_score(y_true, y_pred)

    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    weighted_precision, weighted_recall, weighted_f1, fourth_value = weighted_scores

    return dict(
        accuracy=accuracy_pct,
        precision=weighted_precision,
        recall=weighted_recall,
        f1_score=weighted_f1,
        info=fourth_value,
    )

### Evaluating model training

In [None]:
# Validation-set probabilities from the trained LSTM and bidirectional-LSTM members.
# Positions 1 and 2 of ensemble_models follow the order in which the models were
# passed to training (dense, LSTM, bidirectional LSTM).
lstm_model_pred_probs = ensemble_models[1].predict(val_text)
bidir_lstm_model_pred_probs = ensemble_models[2].predict(val_text)

# Average the two probability vectors, then round to hard 0/1 labels.
comb_pred_probs = tf.squeeze(lstm_model_pred_probs, axis=1) + tf.squeeze(bidir_lstm_model_pred_probs, axis=1)
comb_pred_probs = tf.round(comb_pred_probs / 2)
print(comb_pred_probs, comb_pred_probs.shape)

In [None]:
# Score the combined (averaged and rounded) predictions against the validation labels.
ensemble_model_metrics = evaluate_preds(y_true=val_label, y_pred=comb_pred_probs)
print(f"Ensemble model metrics: {ensemble_model_metrics}")

### Inferencing on test data

In [None]:
# test_df only has a "text" column; select it as a Series so .to_list() is available.
test_text = test_df["text"].to_list()
test_samples = random.sample(test_text, 10)

# One batched call for all samples: rows are ensemble members, columns are the sampled texts.
sample_pred_probs = make_ensemble_preds(ensemble_models, tf.constant(test_samples))
# Average over the members and round to one hard label per text.
sample_preds = tf.round(tf.reduce_mean(sample_pred_probs, axis=0)).numpy()

for test_sample, pred in zip(test_samples, sample_preds):
    print(f"Pred: {int(pred)}")
    print(f"Text: \n{test_sample}\n\n")

### Model saving and loading

In [None]:
print(f"Ensemble model 0: {ensemble_models[0]}")

# Persist the first trained member of each architecture (dense, LSTM, bidirectional LSTM).
# NOTE(review): these are absolute paths under /models — confirm that location is writable
# in the target environment.
ensemble_models[0].save("/models/ensemble_models/dense_model", save_format='tf')
ensemble_models[1].save("/models/ensemble_models/lstm_model", save_format='tf')
ensemble_models[2].save("/models/ensemble_models/bidir_lstm_model", save_format='tf')

# Reload from disk to verify the round trip.
loaded_ensemble_dense_model = tf.keras.models.load_model("/models/ensemble_models/dense_model")
loaded_ensemble_lstm_model = tf.keras.models.load_model("/models/ensemble_models/lstm_model")
loaded_ensemble_bidir_lstm_model = tf.keras.models.load_model("/models/ensemble_models/bidir_lstm_model")

# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None".
loaded_ensemble_dense_model.summary()
print("\n")
loaded_ensemble_lstm_model.summary()
print("\n")
loaded_ensemble_bidir_lstm_model.summary()
print("\n")

In [None]:
# Re-score the reloaded models on the held-out validation split. val_text / val_label
# are left untouched here: rebinding them to placeholders would discard the split.
loaded_ensemble_models = [loaded_ensemble_dense_model, loaded_ensemble_lstm_model, loaded_ensemble_bidir_lstm_model]

# Rows are models, columns are validation texts; average over models, then round to labels.
loaded_pred_probs = make_ensemble_preds(loaded_ensemble_models, val_text)
loaded_preds = tf.round(tf.reduce_mean(loaded_pred_probs, axis=0))

# evaluate_preds is a module-level function, not a method of the list.
loaded_ensemble_metrics = evaluate_preds(y_true=val_label, y_pred=loaded_preds)
print(f"Loaded ensemble model metrics: {loaded_ensemble_metrics}")