<a href="https://colab.research.google.com/github/zeeshanahmad10809/covid_fake_news_classification/blob/main/DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install loguru
!pip install tqdm

Collecting loguru
[?25l  Downloading https://files.pythonhosted.org/packages/6d/48/0a7d5847e3de329f1d0134baf707b689700b53bd3066a5a8cfd94b3c9fc8/loguru-0.5.3-py3-none-any.whl (57kB)
[K     |█████▊                          | 10kB 18.5MB/s eta 0:00:01[K     |███████████▌                    | 20kB 21.6MB/s eta 0:00:01[K     |█████████████████▏              | 30kB 16.5MB/s eta 0:00:01[K     |███████████████████████         | 40kB 14.9MB/s eta 0:00:01[K     |████████████████████████████▋   | 51kB 9.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 6.0MB/s 
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.5.3


In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
!unzip crawl-300d-2M.vec.zip
!rm crawl-300d-2M.vec.zip

--2021-02-27 07:52:56--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2021-02-27 07:53:22 (55.5 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       


In [19]:
import os
import re
import string
import random
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from loguru import logger
from tqdm import tqdm
from pprint import pprint
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, BatchNormalization, Dense, Dropout
from tensorflow.keras.layers import (
    GRU,
    Bidirectional,
    SimpleRNN,
    Conv1D,
    GlobalMaxPool1D,
)
from sklearn.metrics import confusion_matrix, classification_report

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
SEED_VALUE = 19
os.environ["PYTHONHASHSEED"] = str(SEED_VALUE)
random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)
tf.random.set_seed(SEED_VALUE)

In [9]:
DATASET1 = "COVID FakeNews Data.csv"
DATASET2 = "dataset-Non-extremist-Extremist.csv"

In [10]:
def remove_url(tweet):
    return " ".join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", tweet).split()
    )


def remove_punctuation(tweet):
    for ch in string.punctuation:
        if ch in tweet:
            tweet = tweet.replace(ch, "")
    return tweet


def lower_case(tweet):
    return tweet.lower().strip()


def lemmatize(tweet):
    lemmatizer = WordNetLemmatizer()
    tweet = " ".join(lemmatizer.lemmatize(token) for token in tweet.split(" "))
    return tweet


def preprocess_tweet(tweet):
    # tweet = p.clean(tweet)
    tweet = remove_url(tweet)
    tweet = remove_punctuation(tweet)
    tweet = lower_case(tweet)
    tweet = lemmatize(tweet)
    return tweet

In [11]:
class Dataset:
    def __init__(self, dataset_name):
        self.dataset_name = dataset_name
        data = None
        try:
            data = pd.read_csv(self.dataset_name)
        except FileNotFoundError:
            logger.warning("Dataset File is missing!")
            os._exit(0)
        if self.dataset_name == "dataset-Non-extremist-Extremist.csv":
            data["Tweet label"] = data["Tweet label"].replace("Non-extremist", 0)
            data["Tweet label"] = data["Tweet label"].replace("Extremist", 1)
            col_list = data.columns.to_list()
            col_list = [col_list[-1], col_list[0]]
            data = data[col_list]

        data.iloc[:, 0] = data.iloc[:, 0].apply(preprocess_tweet)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            data.iloc[:, 0], data.iloc[:, 1], stratify=data.iloc[:, 1], test_size=0.2
        )
        self.embedding_size = 300
        self.tokenizer = None

    def get_embedding_size(self):
        return self.embedding_size

    def get_word_index(self):
        return self.tokenizer.word_index

    def create_vocab(self):
        self.words_set = set()
        self.max_sentence_len = 0
        for sentence in self.X_train:
            tokens = sentence.split(" ")
            if len(tokens) > self.max_sentence_len:
                self.max_sentence_len = len(tokens)
            for word in tokens:
                self.words_set.add(word)

    def get_vocab_info(self):
        return len(self.words_set), self.max_sentence_len

    def fit(self):
        self.create_vocab()
        vocab_size, max_sentence_len = self.get_vocab_info()
        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.tokenizer.fit_on_texts(self.X_train)

    def load(self):
        self.fit()
        self.X_train = sequence.pad_sequences(
            self.tokenizer.texts_to_sequences(self.X_train),
            maxlen=self.max_sentence_len,
            padding="post",
        )
        self.X_test = sequence.pad_sequences(
            self.tokenizer.texts_to_sequences(self.X_test),
            maxlen=self.max_sentence_len,
            padding="post",
        )
        return self.X_train, self.X_test, self.y_train, self.y_test

In [12]:
def loadFastTextModel(path=""):
    logger.info("Loading FastText Model!")
    embeddings_index = dict()

    try:
        with open(path, "r") as f:
            with tqdm(total=1999996, desc="loading FastText") as pbar:
                for line in f:
                    values = line.strip().split(" ")
                    word = values[0]
                    coefs = np.asarray(values[1:], dtype="float32")
                    embeddings_index[word] = coefs
                    pbar.update(1)

        return embeddings_index
    except FileNotFoundError:
        logger.error("Embedding file not in path!")
        os._exit(0)


def buildEmbeddingMatrix(word_index, vocab_size, embedding_size, embeddings_index):
    logger.info("Building Embedding Matrix!")
    embedding_matrix = np.zeros((vocab_size, embedding_size))

    for word, i in word_index.items():
        if i >= vocab_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [13]:
def LSTM_Model(weights, vocab_size, embedding_size, max_sen_len, num_classes):
    return Sequential(
        [
            Embedding(
                vocab_size,
                embedding_size,
                weights=[weights],
                trainable=False,
                input_shape=(max_sen_len,),
            ),
            LSTM(64, return_sequences=True),
            Dropout(0.1),
            LSTM(64),
            Dropout(0.3),
            Dense(64, activation="relu", name="relu_dense1"),
            Dropout(0.1),
            Dense(num_classes, activation="sigmoid", name="sigmoid_dense"),
        ]
    )


def GRU_Model(weights, vocab_size, embedding_size, max_sen_len, num_classes):
    initializer = tf.keras.initializers.GlorotUniform()
    return Sequential(
        [
            Embedding(
                vocab_size,
                embedding_size,
                weights=[weights],
                trainable=False,
                input_shape=(max_sen_len,),
            ),
            GRU(
                64,
                return_sequences=True,
                kernel_initializer=initializer,
                recurrent_initializer=initializer,
            ),
            Dropout(0.1),
            GRU(64, kernel_initializer=initializer, recurrent_initializer=initializer),
            Dropout(0.3),
            Dense(
                64,
                activation="relu",
                kernel_initializer=initializer,
                name="relu_dense1",
            ),
            Dropout(0.1),
            Dense(num_classes, activation="sigmoid", name="sigmoid_dense"),
        ]
    )


def RNN_Model(weights, vocab_size, embedding_size, max_sen_len, num_classes):
    initializer = tf.keras.initializers.GlorotUniform()
    return Sequential(
        [
            Embedding(
                vocab_size,
                embedding_size,
                weights=[weights],
                trainable=False,
                input_shape=(max_sen_len,),
            ),
            SimpleRNN(
                64,
                return_sequences=True,
                kernel_initializer=initializer,
                recurrent_initializer=initializer,
            ),
            Dropout(0.1),
            SimpleRNN(
                64, kernel_initializer=initializer, recurrent_initializer=initializer
            ),
            Dropout(0.3),
            Dense(
                64,
                activation="relu",
                kernel_initializer=initializer,
                name="relu_dense1",
            ),
            Dropout(0.1),
            Dense(num_classes, activation="sigmoid", name="sigmoid_dense"),
        ]
    )


def Conv1d_Model(weights, vocab_size, embedding_size, max_sen_len, num_classes):
    return Sequential(
        [
            Embedding(
                vocab_size,
                embedding_size,
                weights=[weights],
                trainable=False,
                input_shape=(max_sen_len,),
            ),
            Conv1D(128, 3, strides=1, padding="SAME", activation="relu"),
            Dropout(0.1),
            Conv1D(256, 3, strides=1, padding="SAME", activation="relu"),
            Dropout(0.3),
            GlobalMaxPool1D(),
            Dense(64, activation="relu", name="relu_dense1"),
            Dropout(0.1),
            Dense(num_classes, activation="sigmoid", name="sigmoid_dense"),
        ]
    )


def BiLSTM_Model(weights, vocab_size, embedding_size, max_sen_len, num_classes):
    return Sequential(
        [
            Embedding(
                vocab_size,
                embedding_size,
                weights=[weights],
                trainable=False,
                input_shape=(max_sen_len,),
            ),
            Bidirectional(LSTM(64, return_sequences=True)),
            Dropout(0.1),
            Bidirectional(LSTM(64)),
            Dropout(0.3),
            Dense(64, activation="relu", name="relu_dense1"),
            Dropout(0.1),
            Dense(num_classes, activation="sigmoid", name="sigmoid_dense"),
        ]
    )


def buildModel(
    name="lstm",
    embedding_matrix=None,
    vocab_size=16222,
    embedding_size=None,
    max_sen_len=56,
    num_classes=2,
):
    if name == "lstm":
        return LSTM_Model(
            embedding_matrix, vocab_size, embedding_size, max_sen_len, num_classes - 1
        )
    elif name == "gru":
        return GRU_Model(
            embedding_matrix, vocab_size, embedding_size, max_sen_len, num_classes - 1
        )
    elif name == "rnn":
        return RNN_Model(
            embedding_matrix, vocab_size, embedding_size, max_sen_len, num_classes - 1
        )
    elif name == "bilstm":
        return BiLSTM_Model(
            embedding_matrix, vocab_size, embedding_size, max_sen_len, num_classes - 1
        )
    elif name == "conv1d":
        return Conv1d_Model(
            embedding_matrix, vocab_size, embedding_size, max_sen_len, num_classes - 1
        )
    else:
        logger.error(f"Invalid model name {name}")
        os._exit(0)

In [14]:
def show_performance(y_test, y_test_pred):
    pprint(confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred, digits=4))

In [15]:
fasttext_embeddings_index = loadFastTextModel("crawl-300d-2M.vec")

2021-02-27 07:58:14.372 | INFO     | __main__:loadFastTextModel:2 - Loading FastText Model!
loading FastText: 100%|██████████| 1999996/1999996 [02:07<00:00, 15702.83it/s]


In [20]:
dataset = Dataset(DATASET2)
X_train, X_test, y_train, y_test = dataset.load()
vocab_size, max_sentence_len = dataset.get_vocab_info()
word_index = dataset.get_word_index()
embedding_size = dataset.get_embedding_size()
embedding_matrix = buildEmbeddingMatrix(
    word_index, vocab_size, embedding_size, fasttext_embeddings_index
)

2021-02-27 08:01:46.705 | INFO     | __main__:buildEmbeddingMatrix:22 - Building Embedding Matrix!


In [21]:
model = buildModel(
    name="lstm",
    embedding_matrix=embedding_matrix,
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    max_sen_len=max_sentence_len,
    num_classes=2,
)

In [22]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

In [26]:
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=15,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, monitor="val_loss")],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15


<tensorflow.python.keras.callbacks.History at 0x7f43040af0d0>

In [27]:
y_test_pred = np.where(model.predict(X_test) < 0.5, 0, 1)

In [28]:
# USE 'macro avg'
show_performance(y_test, y_test_pred)

array([[1008,  679],
       [ 303, 2248]])
              precision    recall  f1-score   support

           0     0.7689    0.5975    0.6724      1687
           1     0.7680    0.8812    0.8207      2551

    accuracy                         0.7683      4238
   macro avg     0.7685    0.7394    0.7466      4238
weighted avg     0.7684    0.7683    0.7617      4238

