In [208]:
import pandas as pd
import numpy as np
from keras.api._v2.keras.layers import Input, Embedding, Dense, TextVectorization, GRU
from keras.api._v2.keras.models import Sequential
from keras.api._v2.keras.losses import SparseCategoricalCrossentropy
import matplotlib.pyplot as plt
import tensorflow as tf
from load_data import load_l1_data

In [209]:
# Read the labelled news articles and preview the first rows.
df = pd.read_csv("news-classification.csv", encoding="utf-8")
df.head()

Unnamed: 0,data_id,id,date,source,title,content,author,url,published,published_utc,collection_utc,category_level_1,category_level_2
0,1809,abcnews--2019-10-31--Virginia mom charged with...,2019-10-31,abcnews,Virginia mom charged with murder in 2-year-old...,The Virginia woman whose 2-year-old son was fo...,,https://abcnews.go.com/US/wireStory/virginia-m...,"Thu, 31 Oct 2019 16:49:56 -0400",1572554996,1572559512,"crime, law and justice",crime
1,1980,abcnews--2019-11-07--2 escaped murder suspects...,2019-11-07,abcnews,2 escaped murder suspects arrested at US-Mexic...,Authorities are trying to determine if anyone ...,,https://abcnews.go.com/US/wireStory/escaped-mu...,"Thu, 07 Nov 2019 00:13:12 -0500",1573103592,1573131986,"crime, law and justice",crime
2,1995,abcnews--2019-11-07--Family turns in escaped b...,2019-11-07,abcnews,"Family turns in escaped boy, 13, suspected in ...",A 13-year-old suspect in a double homicide who...,,https://abcnews.go.com/US/wireStory/family-tur...,"Thu, 07 Nov 2019 07:39:54 -0500",1573130394,1573131982,"crime, law and justice",crime
3,2740,abcnews--2019-12-02--Mother charged with murde...,2019-12-02,abcnews,Mother charged with murder in deaths of 2 youn...,The mother of two young children found hanging...,,https://abcnews.go.com/US/wireStory/mother-cha...,"Mon, 02 Dec 2019 11:30:59 -0500",1575304259,1575308811,"crime, law and justice",crime
4,7038,ageofautism--2019-04-12--Physician Father and ...,2019-04-12,ageofautism,"Physician, Father and Caretaker of 29 Year Old...","""One family member said Derek “can be violent ...",Age of Autism,http://feedproxy.google.com/~r/ageofautism/~3/...,2019-04-12 09:00:00+00:00,1555074000,1567543083,"crime, law and justice",crime


In [210]:
# NOTE(review): the unpack order is train, test, val (not train, val, test);
# presumably this matches load_l1_data's return order — confirm against load_data.py.
x_train, x_test, x_val, y_train, y_test, y_val = load_l1_data(df)

ONLY RUN TO DOWNLOAD GLOVE EMBEDDINGS

In [211]:
# !wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
# !unzip -q glove.6B.zip

PREPROCESS FUNCTION

In [212]:
def custom_standarization(input_data):
    """Normalize raw article text before it is tokenized.

    Steps, in order:
      1. lowercase the text;
      2. delete digit runs and ``http...`` tokens;
      3. replace every character that is not an ASCII letter, digit or comma
         with a space;
      4. replace the remaining listed punctuation (commas, at this point)
         with a space;
      5. collapse runs of spaces into one space.

    Args:
        input_data: string tensor accepted by the ``tf.strings`` ops.

    Returns:
        The transformed string tensor: lowercase ASCII letters separated by
        single spaces.
    """
    text = tf.strings.lower(input_data)
    # Raw strings throughout: sequences such as "\d" or "\[" in a normal
    # literal are invalid Python escapes and trigger a warning.
    text = tf.strings.regex_replace(text, r'\d+|http\S+', '')
    text = tf.strings.regex_replace(text, r'[^a-zA-Z,\d]', ' ')
    # The former dotted-quad (IP address) pass was removed: the substitution
    # above has already turned every "." into a space, so it could never match.
    text = tf.strings.regex_replace(text, r'[/(){}\[\]\|@,;]', ' ')
    text = tf.strings.regex_replace(text, r' +', ' ')

    return text

In [213]:
# Vocabulary is capped at VOCAB_SIZE tokens; every text is padded or
# truncated to 500 token ids.
VOCAB_SIZE = 8000
encoder = TextVectorization(
    standardize=custom_standarization,
    max_tokens=VOCAB_SIZE,
    output_sequence_length=500,
)
# Learn the vocabulary from the training texts.
encoder.adapt(x_train)

In [214]:
# Map every vocabulary token to its integer id (its position in the vocabulary).
voc = encoder.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [215]:
path_to_glove_file = "glove.6B.100d.txt"

# Maps each token in the GloVe file to its float32 coefficient vector.
embeddings_index = {}
# Pass the encoding explicitly so reading does not depend on the platform
# default, which need not be UTF-8.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<token> <coef> <coef> ...": split off the token only.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [216]:
# Two rows more than the vocabulary size are allocated; the ids in word_index
# only address the first len(voc) rows.
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Every row starts as zeros and is overwritten only when GloVe has a vector
# for the word, so words missing from the embedding index stay all-zeros.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 7948 words (52 misses)


In [217]:
# Trainable embedding layer whose weights start from the GloVe-based matrix.
embedding_layer = Embedding(num_tokens, embedding_dim, trainable=True)
# Build first so the layer's weight exists before set_weights replaces it.
embedding_layer.build((1,))
embedding_layer.set_weights([embedding_matrix])

In [218]:
# Width of the softmax output layer; presumably the number of distinct
# category_level_1 labels — TODO confirm against the data.
numberOfClasses = 17

GRU MODEL FOR LEVEL 1 CATEGORY

In [219]:
# Bidirectional GRU classifier on top of the GloVe-initialised embeddings,
# ending in a softmax over the level-1 classes.
model = Sequential()
model.add(Input(shape=(None,), dtype="int32"))
model.add(embedding_layer)
model.add(tf.keras.layers.Bidirectional(GRU(896, dropout=0.35)))
model.add(Dense(numberOfClasses, activation='softmax'))

model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, None, 100)         800200    
                                                                 
 bidirectional_13 (Bidirecti  (None, 1792)             5365248   
 onal)                                                           
                                                                 
 dense_13 (Dense)            (None, 17)                30481     
                                                                 
Total params: 6,195,929
Trainable params: 6,195,929
Non-trainable params: 0
_________________________________________________________________


In [220]:
# Turn each split of raw texts into a NumPy array of token ids.
# The names are rebound in place, so this cell must run exactly once per
# kernel session: a second run would feed already-encoded ids to the encoder.
x_train, x_val, x_test = (
    encoder(np.array([[text] for text in split])).numpy()
    for split in (x_train, x_val, x_test)
)

In [221]:
# Integer class labels -> sparse categorical cross-entropy; track accuracy.
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

MODEL TRAINING

In [222]:
# Train for 7 epochs, validating on the held-out validation split after each.
# `validation_steps` was dropped: Keras documents it as relevant only when the
# validation data is a tf.data dataset. With in-memory arrays the full
# validation set should be evaluated every epoch rather than a fixed number
# of batches.
glove_history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=7,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


MODEL EVALUATION

In [223]:
# Score the trained model on the untouched test split.
loss, accuracy = model.evaluate(x_test, y_test)

for label, value in (("Loss: ", loss), ("Accuracy: ", accuracy)):
    print(label, value)

Loss:  0.6194208264350891
Accuracy:  0.8437118530273438


PLOTTING

In [None]:
# Per-epoch metrics recorded by model.fit.
history_dict = glove_history.history
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
# Note: this rebinds `loss`, replacing the scalar test loss from the
# evaluation cell with the list of per-epoch training losses.
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
# "bo" draws blue dots, "b" draws a solid blue line.
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

GRU LEVEL 1 TUNING

In [None]:
# DON'T RUN THIS CELL
# THIS CODE WAS USED FOR HYPERPARAMETER TUNING
import keras_tuner as kt
def model_builder(hp):
    """Build and compile one candidate model for the hyperparameter search.

    Args:
        hp: hyperparameter container supplied by the tuner. The GRU width is
            registered as the integer hyperparameter ``units`` (64 to 1024 in
            steps of 32).

    Returns:
        A compiled Sequential model: input, GloVe-initialised embedding,
        bidirectional GRU, softmax output over ``numberOfClasses``.
    """
    # Give every trial its own embedding layer initialised from the GloVe
    # matrix. Reusing the module-level `embedding_layer` would share a single
    # trainable weight between all trials (and the model trained earlier in
    # the notebook), so each trial would start from embeddings already
    # fine-tuned by the previous ones.
    trial_embedding = Embedding(num_tokens, embedding_dim, trainable=True)
    trial_embedding.build((1,))
    trial_embedding.set_weights([embedding_matrix])

    model = Sequential()
    model.add(Input(shape=(None,), dtype="int32"))
    model.add(trial_embedding)

    hp_units = hp.Int('units', min_value=64, max_value=1024, step=32)

    model.add(tf.keras.layers.Bidirectional(GRU(units=hp_units, dropout=0.2)))
    model.add(Dense(numberOfClasses, activation='softmax'))

    model.compile(loss=SparseCategoricalCrossentropy(),
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Hyperband search over model_builder, ranked by validation accuracy.
# NOTE(review): no directory/project_name is given and the recorded output
# shows state being reloaded from ./untitled_project — confirm that reusing
# earlier trials is intended.
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=15,
    factor=3,
)

Reloading Tuner from ./untitled_project/tuner0.json


In [None]:
# Early-stopping callback: watches validation loss with a patience of 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)

In [None]:
# NOTE(review): Hyperband schedules the epoch count of each trial itself
# (bounded by max_epochs), so `epochs=50` is presumably overridden — confirm.
tuner.search(x_train, y_train, epochs=50, validation_data=(x_val, y_val), callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# `units` (the GRU width) is the only hyperparameter registered by
# model_builder. The previous message also read 'learning_rate', which is
# never registered, and described the tuned layer as densely-connected.
print(f"""
The hyperparameter search is complete. The optimal number of units in the
bidirectional GRU layer is {best_hps.get('units')}.
""")