In [1]:
import os
import sys
# Register the parent directory on the import path, once, so that packages
# living next to this notebook's folder (presumably `my_nlp_module`) resolve.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
from my_nlp_module.preprocessing import preprocess_bbc_to_dict, PrepOption

from gensim.models import KeyedVectors
from sklearn.utils import shuffle
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import gc
from tokenizer import Tokenizer

# Category names in class-index order; both lookup tables are derived from this
# single tuple so they always stay inverse to each other.
_CATEGORIES = ("business", "entertainment", "politics", "sport", "tech")

# Category name -> integer class id.
label_to_class = {name: idx for idx, name in enumerate(_CATEGORIES)}

# Integer class id -> category name (inverse of label_to_class).
class_to_label = {idx: name for name, idx in label_to_class.items()}

# Load pretrained word vectors stored in binary word2vec format as gensim KeyedVectors.
path_to_model = "../pretrained_models/40/model.bin"
model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
# Length of each word vector; later cells use it as the embedding matrix width
# and as the Embedding layer's output_dim.
embed_dim = model.vector_size

2022-10-21 23:49:54.980434: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dataset_train_path = "../datasets/bbc-text/train"
dataset_test_path = "../datasets/bbc-text/test"  # not used in this cell

# Fixed seed for the row shuffle below. Without it the row order -- and hence the
# content of the later (seeded) train/test split -- changes on every run.
SHUFFLE_SEED = 42

# Preprocessing options handed to preprocess_bbc_to_dict.
options = [PrepOption.INTERPUNCTION, PrepOption.RUBBISH, PrepOption.NUMBERS, PrepOption.STOPWORDS]
preprocessed = preprocess_bbc_to_dict(dataset_train_path, options)

# Flatten the per-category mapping into two parallel lists:
# one preprocessed document and one integer class id per entry.
documents = []
labels = []

for key, docs in preprocessed.items():
    for doc in docs:
        documents.append(doc)
        labels.append(label_to_class[key])

df = pd.DataFrame({"document": documents, "label": labels})
# Shuffle all rows reproducibly and renumber the index from 0.
df = df.sample(frac=1, axis=0, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Number of documents per class id (displayed as the cell output).
df.groupby(['label'])['label'].count()

Couldn't read 1 files:
/Users/wojciechzyla/Desktop/AGH/praca_inz/projekt_inzynierski/bbc-text/train/sport/199.txt



label
0    459
1    347
2    375
3    458
4    360
Name: label, dtype: int64

In [3]:
# Fit the project tokenizer on every document in df.
# MAX_WORDS is passed straight to Tokenizer -- presumably a vocabulary size cap; TODO confirm.
MAX_WORDS = 8000
tok = Tokenizer(MAX_WORDS)
tok.fit(list(df['document']))
# One extra row on top of the vocabulary entries.
vocab_size = 1 + len(tok.vocab)

# Row r of embed_matrix holds the pretrained vector of the token whose vocab index is r.
embed_matrix = np.zeros((vocab_size, embed_dim))
for token, row in tok.vocab.items():
    try:
        embed_matrix[row] = model[token]
    except KeyError:
        # Lookup raised KeyError for this token: leave its row all-zero.
        pass

In [4]:
# Convert every document to its sequence representation via the fitted tokenizer.
encoded = tok.texts_to_sequences(list(df['document']))
# Longest sequence length; stays at -1 when there are no documents at all.
max_doc_len = max((len(seq) for seq in encoded), default=-1)
print(f"Maximal length of document is {max_doc_len}")

Maximal length of document is 530


In [5]:
# Bring every encoded document to the same length (max_doc_len);
# padding='post' appends the pad value at the end of shorter sequences.
pad_docs = pad_sequences(
    sequences=encoded,
    maxlen=max_doc_len,
    padding='post',
)
print(f"Shape of padded documents array: {pad_docs.shape}")

Shape of padded documents array: (1999, 530)


In [6]:
# Hold out 20% of the padded documents for evaluation (seeded split).
Y=df['label']
x_train,x_test,y_train,y_test=train_test_split(pad_docs,Y,test_size=0.20,random_state=42)

BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 100
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline. cache() must come BEFORE shuffle(): a cache placed after
# shuffle/batch stores the first epoch's batches and replays that exact order
# in every following epoch, which defeats the shuffling.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .cache()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Evaluation pipeline: no shuffling, so caching the finished batches is fine.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

2022-10-21 23:50:40.354089: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Embedding layer initialised from the pretrained vectors in embed_matrix and
# frozen (trainable=False), so only the recurrent and dense layers are trained.
embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim,
                                      input_length=max_doc_len, weights=[embed_matrix],
                                      trainable=False)

# NOTE(review): no mask_zero is set on the embedding, so the LSTM also steps over
# whatever padding the input sequences carry -- confirm this is intended.
classifier = tf.keras.models.Sequential([
    embedding,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(84, activation='relu'),
    # One output unit per class; softmax yields class probabilities.
    tf.keras.layers.Dense(5, activation=tf.nn.softmax, name='classifier'),
])

# Sparse variants: the targets are integer class ids, not one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
metrics = tf.metrics.SparseCategoricalAccuracy()

epochs = 20

# An AdamW optimizer with a PolynomialDecay schedule (init_lr=3e-5, decaying to 0
# over all training steps) was tried here previously; plain 'adam' is used instead.
classifier.compile(optimizer='adam',
                   loss=loss,
                   metrics=[metrics])  # compile() documents `metrics` as a list

classifier.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 530, 100)          800100    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              84480     
 l)                                                              
                                                                 
 dense (Dense)               (None, 84)                10836     
                                                                 
 classifier (Dense)          (None, 5)                 425       
                                                                 
Total params: 895,841
Trainable params: 95,741
Non-trainable params: 800,100
_________________________________________________________________


In [None]:
# free memory
# Drop the large objects that training no longer needs. pop(..., None) is used
# instead of `del` so that re-running this cell does not raise NameError once
# the names are already gone.
for _name in ("model", "preprocessed", "df", "pad_docs"):
    globals().pop(_name, None)
gc.collect()

# Train on the in-memory arrays; the held-out split is used for validation.
# The returned History object is kept so the per-epoch metrics stay accessible.
history = classifier.fit(x_train,y_train,epochs=epochs,validation_data=(x_test,y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20