In [1]:
import csv
import os

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Location of the BBC news CSV read by the loading loop in this cell.
path_bbc = "data/bbc_news/bbc_text.csv"

# Words removed from every text during cleaning (all lowercase).
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her",
    "here", "here's", "hers", "herself", "him", "himself", "his", "how",
    "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's",
    "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
    "then", "there", "there's", "these", "they", "they'd", "they'll",
    "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's",
    "when", "when's", "where", "where's", "which", "while", "who", "who's",
    "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
    "yourselves",
]

# Characters that the cleaning step replaces with a space.
special_characters = [".", "+", "(", ")", ":"]

# Accumulators filled once per CSV row: the label column and the cleaned text.
labels = []
sentences = []
# Set membership makes the per-token stopword check a constant-time lookup.
stopword_lookup = frozenset(stopwords)


def clean_text(text):
    """Return `text` with special characters blanked out and stopwords dropped.

    Every entry of `special_characters` is replaced by a space, the result is
    split on whitespace, tokens found in `stopwords` are discarded, and the
    remaining tokens are joined with single spaces.

    Filtering whole tokens (instead of replacing " word " substrings) also
    removes stopwords at the very start or end of the text and back-to-back
    repeats of the same stopword, which the substring approach leaves behind.
    The returned string has no leading/trailing or repeated whitespace.
    """
    for special_character in special_characters:
        text = text.replace(special_character, " ")
    return " ".join(
        token for token in text.split() if token not in stopword_lookup
    )


# newline="" is how the csv module documentation says files for csv.reader
# should be opened.
# NOTE(review): the text encoding is still the platform default — confirm the
# file's encoding and pass it explicitly if the notebook must be portable.
with open(path_bbc, "r", newline="") as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=",")
    # Skip the first row (presumably the column header); the default keeps an
    # empty file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; row[0] would fail.
            continue
        labels.append(row[0])
        sentences.append(clean_text(row[1]))

In [3]:
# Settings consumed by the tokenization, padding and split cells.

# Text tokenizer settings.
vocab_size = 1000          # passed as num_words to the text Tokenizer
oov_tok = "<OOV>"          # passed as oov_token to the text Tokenizer

# Sequence padding settings (all forwarded to pad_sequences).
max_length = 120
padding_type = "post"
trunc_type = "post"

# Fraction of the rows, taken from the start of the data, used for training.
training_portion = 0.8

# Not referenced by the cells shown here; presumably the embedding width of
# the downstream model — confirm against the training notebook.
embedding_dim = 16

In [4]:
# Number of leading rows that form the training split; everything after that
# index is the validation split. The split is positional (no shuffling).
train_size = int(training_portion * len(sentences))

train_sentences, validation_sentences = sentences[:train_size], sentences[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

In [5]:
# Fit the text tokenizer on the training split only, so the vocabulary is
# built without looking at the validation texts.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Training texts -> integer sequences -> fixed-length array.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, **pad_options)

# Validation texts go through the tokenizer fitted above, then the same padding.
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, **pad_options)

In [10]:
# Labels get their own Tokenizer, fitted on the full label list
# (training and validation rows together).
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
label_word_index = label_tokenizer.word_index

# Encode each split with that tokenizer, then wrap the nested lists in arrays.
# NOTE(review): presumably every label is a single token, giving one id per
# row — confirm against the CSV's label column.
train_label_ids = label_tokenizer.texts_to_sequences(train_labels)
validation_label_ids = label_tokenizer.texts_to_sequences(validation_labels)

training_label_seq = np.array(train_label_ids)
validation_label_seq = np.array(validation_label_ids)

In [7]:
# Persist the padded inputs and encoded labels for both splits as .npy files.
arrays_to_save = (
    ("data/bbc_news/train/train_padded.npy", train_padded),
    ("data/bbc_news/validation/validation_padded.npy", validation_padded),
    ("data/bbc_news/train/training_label_seq.npy", training_label_seq),
    ("data/bbc_news/validation/validation_label_seq.npy", validation_label_seq),
)

for npy_path, array in arrays_to_save:
    # np.save does not create missing parent folders, so on a fresh checkout
    # the train/ and validation/ directories must be created first.
    os.makedirs(os.path.dirname(npy_path), exist_ok=True)
    np.save(npy_path, array)

In [11]:
# Read the four saved arrays back from disk, in the same order they were written.
(
    loaded_train_padded,
    loaded_validation_padded,
    loaded_training_label_seq,
    loaded_validation_label_seq,
) = map(
    np.load,
    (
        "data/bbc_news/train/train_padded.npy",
        "data/bbc_news/validation/validation_padded.npy",
        "data/bbc_news/train/training_label_seq.npy",
        "data/bbc_news/validation/validation_label_seq.npy",
    ),
)

In [11]:
import json
# Write the word and label vocabularies to JSON so the token ids can be
# mapped back without refitting the tokenizers.
for json_path, mapping in (
    ('data/bbc_news/word_index.json', word_index),
    ('data/bbc_news/label_word_index.json', label_word_index),
):
    with open(json_path, 'w') as handle:
        json.dump(mapping, handle)

In [12]:
# Load both vocabularies back from their JSON files.
with open('data/bbc_news/word_index.json') as word_index_file:
    loaded_word_index = json.loads(word_index_file.read())

with open('data/bbc_news/label_word_index.json') as label_index_file:
    loaded_label_word_index = json.loads(label_index_file.read())