# Explore the BBC News archive

In [11]:
import csv
import zipfile
from google.colab import files
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


## Getting and checking the data

In [9]:
# Upload the dataset archive with Colab's upload helper.
# The unzip cell below expects it to be saved as ./bbc-text.zip
files.upload()

Saving bbc-text.zip to bbc-text.zip


In [13]:
# Unzip the file into the current working directory.
# The context manager closes the archive handle as soon as extraction
# finishes (or fails), instead of leaving it open for the rest of the session.
with zipfile.ZipFile('./bbc-text.zip', 'r') as zip_ref:
  zip_ref.extractall()


In [15]:
# Check the file contents by peeking at the first two lines of the extracted CSV
ds = './bbc-text.csv'  # dataset path, reused by the parsing cells further down
with open(ds, 'r') as f:
  # Successive readline() calls: the first returns the header row,
  # the second returns the first data record
  print(f"First line (header) looks like this:\n\n{f.readline()}")
  print(f"Each data point looks like this:\n\n{f.readline()}")

First line (header) looks like this:

category,text

Each data point looks like this:

tech,tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially

## Parsing the data

In [17]:
# Define a helper that lower-cases a sentence and strips stopwords from it
# Stopwords are kept in a module-level frozenset so the collection is built
# once (not on every call) and each membership test is constant-time.
_STOPWORDS = frozenset([
  "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
  "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
  "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
  "during", "each", "few", "for", "from", "further", "had", "has", "have",
  "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
  "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
  "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me",
  "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
  "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same",
  "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than",
  "that", "that's", "the", "their", "theirs", "them", "themselves", "then",
  "there", "there's", "these", "they", "they'd", "they'll", "they're",
  "they've", "this", "those", "through", "to", "too", "under", "until", "up",
  "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
  "what's", "when", "when's", "where", "where's", "which", "while", "who",
  "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
  "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence=''):
  """
  Lower-case a sentence and remove every stopword from it
  Args:
      sentence (str): sentence to remove the stopwords from
  Returns:
      sentence (str): lowercase sentence without the stopwords; the remaining
          words are joined by single spaces, so runs of whitespace collapse
  """
  # split() with no argument splits on any whitespace run and never yields
  # empty tokens, so an empty or all-stopword input produces ''
  return ' '.join(w for w in sentence.lower().split() if w not in _STOPWORDS)


In [18]:
# Test the function: the input is lower-cased and its stopwords are dropped,
# so the expected result is 'go store get snack'
remove_stopwords("I am about to go to the store and get any snack")

'go store get snack'

In [20]:
def parse_data_from_file(filename):
    """
    Extract sentences and labels from a CSV file
    The first row is treated as a header and skipped. For every remaining row
    the first column is taken as the label and the second column as the text,
    which is passed through remove_stopwords before being stored.
    Args:
        filename (str): path to the CSV file
    Returns:
        sentences, labels (list[str], list[str]): tuple containing lists of sentences and labels
            (both empty if the file has no data rows)
    """
    sentences, labels = [], []
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so quoted fields with embedded newlines parse correctly
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        # Skip the header; the default keeps an empty file from raising StopIteration
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines; skip them rather than
            # failing with IndexError on row[0]
            if not row:
                continue
            labels.append(row[0])
            sentences.append(remove_stopwords(row[1]))
    return sentences, labels


In [21]:
# Test the function: parse the whole CSV, then keep the first five examples
# as a miniature dataset for quick experiments
sentences, labels = parse_data_from_file(ds)
mini_sentences, mini_labels = sentences[:5], labels[:5]

# Summary of the full dataset: sentence and label counts should match
print("ORIGINAL DATASET:\n")
print(f"There are {len(sentences)} sentences in the dataset.\n")
print(f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 5 labels are {labels[:5]}\n\n")

# Same summary for the five-example slice; its first sentence and labels
# are the same objects as in the full dataset
print("MINIATURE DATASET:\n")
print(f"There are {len(mini_sentences)} sentences in the miniature dataset.\n")
print(f"First sentence has {len(mini_sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(mini_labels)} labels in the miniature dataset.\n")
print(f"The first 5 labels are {mini_labels[:5]}")

ORIGINAL DATASET:

There are 2225 sentences in the dataset.

First sentence has 436 words (after removing stopwords).

There are 2225 labels in the dataset.

The first 5 labels are ['tech', 'business', 'sport', 'sport', 'entertainment']


MINIATURE DATASET:

There are 5 sentences in the miniature dataset.

First sentence has 436 words (after removing stopwords).

There are 5 labels in the miniature dataset.

The first 5 labels are ['tech', 'business', 'sport', 'sport', 'entertainment']


## Using the tokenizer

In [22]:
# Define a tokenizer
def fit_tokenizer(sentences):
    """
    Create a Tokenizer and fit it on a corpus of sentences
    Args:
        sentences (list[str]): lower-cased sentences without stopwords
    Returns:
        tokenizer (object): the fitted Tokenizer instance, configured with
            '<OOV>' as its out-of-vocabulary token
    """
    fitted = Tokenizer(oov_token='<OOV>')
    # Fitting builds the word-index dictionary from the given sentences
    fitted.fit_on_texts(sentences)
    return fitted

In [23]:
# Fit the tokenizer on every parsed sentence and inspect the resulting vocabulary
tokenizer = fit_tokenizer(sentences)
word_index = tokenizer.word_index

print(f"Vocabulary contains {len(word_index)} words\n")
# Confirm the out-of-vocabulary placeholder has its own entry in the word index
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")

Vocabulary contains 29714 words

<OOV> token included in vocabulary


In [26]:
# Get padded sequences
def get_padded_sequences(tokenizer, sentences):
    """
    Turn sentences into token sequences and pad them to a common length
    Args:
        tokenizer (object): Tokenizer instance containing the word-index dictionary
        sentences (list[str]): list of sentences to tokenize and pad
    Returns:
        padded_sequences (np.ndarray[int]): tokenized sentences padded to the same length
    """
    # Tokenize first, then pad with the 'post' strategy in a single expression
    return pad_sequences(tokenizer.texts_to_sequences(sentences), padding='post')

In [27]:
# Tokenize and pad the full corpus, then show one example and the overall shape
padded_sequences = get_padded_sequences(tokenizer, sentences)
print(f"First padded sequence looks like this: \n\n{padded_sequences[0]}\n")
print(f"Numpy array of all sequences has shape: {padded_sequences.shape}\n")
# shape[0] is the number of sequences, shape[1] the common padded length
print(f"This means there are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}")

First padded sequence looks like this: 

[  96  176 1157 ...    0    0    0]

Numpy array of all sequences has shape: (2225, 2438)

This means there are 2225 sequences in total and each one has a size of 2438


In [32]:
# Tokenize labels
def tokenize_labels(labels):
    """
    Encode the category labels as integer sequences
    A separate Tokenizer (without an OOV token) is fitted on the labels only.
    Args:
        labels (list[str]): labels to tokenize
    Returns:
        label_sequences, label_word_index (tuple[list[list[int]], dict[str, int]]):
            one single-element integer sequence per label, and the
            label-to-index dictionary used to produce them
    """
    encoder = Tokenizer()
    encoder.fit_on_texts(labels)

    # Both results come from the same fitted tokenizer
    return encoder.texts_to_sequences(labels), encoder.word_index

In [33]:
# Encode every label, then show the label vocabulary and the first few encoded labels
label_sequences, label_word_index = tokenize_labels(labels)
print(f"Vocabulary of labels looks like this {label_word_index}\n")
print(f"First ten sequences {label_sequences[:10]}\n")

Vocabulary of labels looks like this {'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}

First ten sequences [[4], [2], [1], [1], [5], [3], [3], [1], [1], [5]]

