<a href="https://colab.research.google.com/github/timthedev07/news-articles-classification/blob/dev/news_articles_categorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the required libraries and download the NLTK stop-word list.

In [None]:
import os
import string
import re
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Dropout, LSTM
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Download the NLTK "stopwords" package.
nltk.download('stopwords')

# Build the English stop-word set, then take "not" out of it so that
# code consulting stopWords does not treat "not" as a stop word.
stopWords = {word for word in stopwords.words("english")}
stopWords.remove("not")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


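Load data: mount Google Drive, copy the zipped dataset into `./data`, unzip it, and read it into a pandas DataFrame.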

In [None]:
# Start from a clean working directory: remove ./sample_data and any
# previous ./data folder, then recreate an empty ./data.
!rm -rf ./sample_data
!rm -rf ./data
!mkdir data

# Mount Google Drive so the dataset archive can be read from it.
from google.colab import drive
drive.mount('/content/gdrive')

# Copy the archive into ./data, extract it there (unzip flags: -o overwrite,
# -j drop archive paths, -q quiet), rename the extracted file to data.json,
# delete the archive, and return to the parent directory.
!cp "/content/gdrive/My Drive/datasets/news-articles-categorization/data.json.zip" data
%cd data
!unzip -ojq data.json.zip
!mv News_Category_Dataset_v2.json data.json
!rm -rf data.json.zip
%cd ..

# lines=True: the file is read as one JSON object per line.
data = pd.read_json("data/data.json", lines = True)

In [None]:
# Category names in sorted order.
# NOTE(review): presumably this matches the column order of the one-hot
# frame built below; confirm before mapping predictions back to names.
LABELS = np.sort(data["category"].unique())

# Inputs are the "headline" column; targets are the one-hot encoded
# "category" column.
x = data["headline"]
y = pd.get_dummies(data["category"])

# Shuffled 80/20 split with a fixed seed.
trainX, testX, trainY, testY = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

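Input cleaning: a custom standardization function for the text vectorizer that lowercases headlines, expands contractions, and strips numbers, @mentions, punctuation and stop words.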

In [None]:
def customStandardization(text: tf.Tensor) -> tf.Tensor:
    """Normalize raw text before it is tokenized.

    Steps, in order: lowercase, expand common English contractions, blank out
    numbers, @mentions and non-alphanumeric characters, remove the words in
    the module-level ``stopWords`` set, then collapse whitespace.

    Args:
        text: String tensor to clean.

    Returns:
        String tensor with the cleaned text; tokens are separated by single
        spaces and there is no leading or trailing whitespace.
    """
    # to lower case
    text = tf.strings.lower(text)

    # expand contractions; the whole-word forms ("won't", "can't") must come
    # before the generic "n't" suffix, otherwise they would become
    # "wo not" / "ca not".
    pairs = [
        ("won't", "will not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    ]
    for contracted, replacement in pairs:
        text = tf.strings.regex_replace(text, contracted, replacement)

    # clean special symbols: numbers (including decimals / exponents),
    # @mentions, then every remaining run of non-alphanumeric characters
    text = tf.strings.regex_replace(text, r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?", " ")
    text = tf.strings.regex_replace(text, r'@([A-Za-z0-9_]+)', " ")
    text = tf.strings.regex_replace(text, r"[^A-Za-z0-9]+", " ")

    # remove stopwords with a single alternation anchored on word boundaries,
    # so stop words at the very start or end of the string and repeated
    # adjacent stop words are removed too. Sorting keeps the generated
    # pattern identical from run to run (set iteration order is not).
    if stopWords:
        stopPattern = r"\b(?:" + "|".join(re.escape(word) for word in sorted(stopWords)) + r")\b"
        text = tf.strings.regex_replace(text, stopPattern, " ")

    # collapse the whitespace runs left behind by the replacements above
    text = tf.strings.regex_replace(text, r"\s+", " ")
    return tf.strings.strip(text)

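The model: a text-vectorization layer followed by an embedding, an LSTM, and a softmax classifier over the news categories.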

In [None]:
# Number of token ids produced per headline (output_sequence_length).
SEQUENCE_LENGTH = 12
# Maximum vocabulary size kept by the vectorizer (max_tokens).
VOCAB_SIZE = 15000

vectorizeLayer = TextVectorization(
    standardize=customStandardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
)

# Learn the vocabulary from the training headlines only. Including the test
# split here would leak held-out data into the model's preprocessing and
# make the validation metrics optimistic.
vectorizeLayer.adapt(np.asarray(trainX))

In [None]:
# Raw strings -> token ids -> 32-dim embeddings -> 64-unit LSTM -> softmax
# over the news categories.
model = Sequential([
    vectorizeLayer,
    Embedding(VOCAB_SIZE, 32, name="embedding"),
    LSTM(64, name="lstm_1", recurrent_dropout = 0.4, dropout = 0.4),
    Dense(len(LABELS), activation = "softmax")
])

# The targets are one-hot vectors over mutually exclusive classes and the
# output layer is a softmax, so the matching loss is categorical
# cross-entropy. Binary cross-entropy would score each output unit as an
# independent yes/no problem.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy']
)

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, 12)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 12, 32)            480000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                24832     
                                                                 
 dense_10 (Dense)            (None, 41)                2665      
                                                                 
Total params: 507,497
Trainable params: 507,497
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Upper bound on training epochs; early stopping usually ends the run sooner.
EPOCHS = 20

# Stop as soon as the monitored validation metric fails to improve for one
# epoch, and roll the model back to the weights of its best epoch. Without
# restore_best_weights the model would keep the weights from the epoch
# *after* the best one.
callback = EarlyStopping(patience=1, restore_best_weights=True)

model.fit(
    trainX,
    trainY,
    epochs=EPOCHS,
    batch_size=32,
    validation_data=(testX, testY),
    callbacks = [callback]
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.History at 0x7ff42ff37090>

In [50]:
# Write the trained model to the ./model directory.
model.save("model")
# Compress the saved model directory into model.zip (-r: recurse into it).
!zip -r model.zip model

# Send model.zip to the browser as a download (Colab helper).
from google.colab import files
files.download("model.zip")

  adding: model/ (stored 0%)
  adding: model/saved_model.pb (deflated 85%)
  adding: model/keras_metadata.pb (deflated 86%)
  adding: model/assets/ (stored 0%)
  adding: model/variables/ (stored 0%)
  adding: model/variables/variables.data-00000-of-00001 (deflated 9%)
  adding: model/variables/variables.index (deflated 59%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>