<a href="https://colab.research.google.com/github/thedatadj/natural-language-processing/blob/main/News-articles-Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this project I create a model capable of classifying news articles by topic.

I'll use [this](https://www.kaggle.com/c/learn-ai-bbc/overview) public dataset from the BBC containing 2,225 articles, each labeled under one of 5 categories: business, entertainment, politics, sport or tech.

In [None]:
# Deep Neural Networks
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Numerical analysis
import numpy as np

# Visualization
import matplotlib.pyplot as plt

# Read csv files
import csv

In [None]:
# Download dataset
# Shell escape: fetches the file by its Google Drive id with the gdown CLI.
# Per the recorded output, the file is saved as /content/bbc-text.csv.
!gdown 12U7WyvJypu573BflZC2-9qqqES0xQ51i

Downloading...
From: https://drive.google.com/uc?id=12U7WyvJypu573BflZC2-9qqqES0xQ51i
To: /content/bbc-text.csv
100% 5.06M/5.06M [00:00<00:00, 32.1MB/s]


In [None]:
# Load the data
path = "/content/bbc-text.csv"
sentences = []
labels = []

# newline="" lets the csv module do its own line-ending handling (required for
# quoted fields that contain line breaks); the explicit encoding keeps the
# result independent of the platform's default locale.
with open(path, "r", newline="", encoding="utf-8") as file:
    reader = csv.reader(file, delimiter=',')
    # Skip the header row (the default keeps an empty file from raising)
    next(reader, None)

    # Column 0 is stored as the label, column 1 as the article text
    for row in reader:
        labels.append(row[0])
        sentences.append(row[1])

In [None]:
# Peek at the first example: the opening 100 characters of its text,
# followed by its category label.
print("First 100 characters of first article: ", sentences[0][:100], sep="\n")
print("\nLabel of the this article:", labels[0], sep="\n")

First 100 characters of first article: 
tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital

Label of the this article:
tech


# Preprocessing
I lowercase every article and remove common English stopwords from it.

In [15]:
# Words to strip from every article before tokenization.
# All entries are lower case; contractions are listed with their apostrophe.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
    "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
    "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
    "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves",
]

In [16]:
# Build the set once: membership tests against the original list would rescan
# it for every word of every article, while a set lookup is constant time.
stopword_set = set(stopwords)

sentences_clean = []
# Loop through each sentence
for sentence in sentences:
    # Lower-case the text, split on whitespace and keep only non-stopwords
    clean_words = [
        word for word in sentence.lower().split() if word not in stopword_set
    ]

    # Re-join with single spaces (runs of whitespace in the source collapse)
    sentences_clean.append(" ".join(clean_words))

In [21]:
# Show the first 100 characters of the first article, cleaned and then as held
# in `sentences`, so the effect of the stopword removal is visible side by side.
print("{} {}".format("Without stopwords: ", sentences_clean[0][:100]))
print("{} {}".format("With stopwords:    ", sentences[0][:100]))

Without stopwords:  tv future hands viewers home theatre systems plasma high-definition tvs digital video recorders movi
With stopwords:     tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital


In [22]:
# Rename sentences dataset
# From this point on `sentences` refers to the cleaned articles: the name is
# rebound to the same list object as `sentences_clean`, so the text held
# before cleaning is no longer reachable through `sentences`.
sentences = sentences_clean

# Data Split
I split the data into a training set (the first 80% of the articles) and a validation set (the remaining 20%).

In [24]:
# Cut-off position: everything before it (80% of the articles) is training data
train_split = int(len(sentences)*0.8)

# NOTE(review): the split is purely positional, with no shuffling. Confirm the
# rows of the CSV are not ordered by category.

# Leading 80%: training articles with their labels
X_train, y_train = sentences[:train_split], labels[:train_split]

# Trailing 20%: validation articles with their labels
X_valid, y_valid = sentences[train_split:], labels[train_split:]

# Tokenization

In [25]:
# Word-level tokenizer capped at num_words=1000; words outside that cap are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(
    num_words=1000,
    oov_token="<OOV>",
)

# The vocabulary is learned from the training articles only
tokenizer.fit_on_texts(X_train)

I convert each article to a sequence of word indices and pad (or truncate) every sequence to 120 tokens.

In [27]:
# Turn both splits into lists of word indices using the fitted tokenizer
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_valid_seq = tokenizer.texts_to_sequences(X_valid)

# Bring every sequence to exactly 120 entries: shorter ones get zeros appended
# ('post' padding). Longer ones are cut using pad_sequences' default truncating
# mode, which is not overridden here.
X_train_pad = pad_sequences(X_train_seq, padding='post', maxlen=120)
X_valid_pad = pad_sequences(X_valid_seq, padding='post', maxlen=120)

I tokenize the labels and shift the resulting indices by one so that the class ids start at 0.

In [29]:
# A second tokenizer, fitted on the training labels only, maps each category
# name to an integer index.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(y_train)

# Encode the labels of each split and subtract 1 from every index
# (presumably because tokenizer indices start at 1, giving 0-based class ids).
y_train_seq = np.array(label_tokenizer.texts_to_sequences(y_train)) - 1
y_valid_seq = np.array(label_tokenizer.texts_to_sequences(y_valid)) - 1

# Modeling

In [44]:
# Embedding -> average over the 120 token positions -> dense hidden layer ->
# 5-way softmax (one output per news category).
model = tf.keras.Sequential([
    # The embedding table is sized from the tokenizer's vocabulary cap: token
    # ids produced by texts_to_sequences stay below `num_words`, so a larger
    # table (previously 10000 rows) only adds rows that can never be indexed.
    tf.keras.layers.Embedding(tokenizer.num_words, 16, input_length=120),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [45]:
# Integer class ids are used as targets, hence the sparse variant of the
# categorical cross-entropy loss; accuracy is tracked during training.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [46]:
# Train for 10 epochs, evaluating on the validation split after each one;
# the returned object is kept in `history`.
history = model.fit(
    x=X_train_pad,
    y=y_train_seq,
    validation_data=(X_valid_pad, y_valid_seq),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
