In [1]:
import os
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report the library versions and execution mode this notebook is running with.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)

# The truthiness of the returned GPU device list decides which message is shown.
gpu_devices = tf.config.list_physical_devices("GPU")
if gpu_devices:
    gpu_status = "available"
else:
    gpu_status = "NOT AVAILABLE"
print("GPU is", gpu_status)

In [14]:
# Load AG News as (text, label) pairs together with its metadata object.
# The 'train' split is cut 60/40 into training and validation portions;
# the 'test' split is used as-is.
splits, ds_info = tfds.load(
    'ag_news_subset',
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True,
    with_info=True,
)
train_data, validation_data, test_data = splits

## EDA

In [23]:
# First, let's look at the dataset metadata provided by TensorFlow Datasets.
# It is returned by tfds.load when with_info=True. That's pretty cool, btw!

In [24]:
# Display the DatasetInfo object returned by tfds.load (name, version, description, ...).
ds_info

tfds.core.DatasetInfo(
    name='ag_news_subset',
    version=1.0.0,
    description='AG is a collection of more than 1 million news articles.
News articles have been gathered from more than 2000  news sources by ComeToMyHead in more than 1 year of activity.
ComeToMyHead is an academic news search engine which has been running since July, 2004.
The dataset is provided by the academic comunity for research purposes in data mining (clustering, classification, etc),
information retrieval (ranking, search, etc), xml, data compression, data streaming,
and any other non-commercial activity.
For more information, please refer to the link http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .

The AG's news topic classification dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the dataset above.
It is used as a text classification benchmark in the following paper:
Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advanc

In [None]:
# The full 'train' split has 120000 records (divided 60/40 into training and validation here), and the 'test' split has 7600
# There are 4 unique classes in total: 0, 1, 2, 3
# Let's print the first few examples

In [27]:
# Pull one batch of four (text, label) pairs from the training data and show it.
sample_batch = next(iter(train_data.batch(4)))
train_examples_batch, train_labels_batch = sample_batch
sample_batch

(<tf.Tensor: shape=(4,), dtype=string, numpy=
 array([b'AMD #39;s new dual-core Opteron chip is designed mainly for corporate computing applications, including databases, Web services, and financial transactions.',
        b'Reuters - Major League Baseball\\Monday announced a decision on the appeal filed by Chicago Cubs\\pitcher Kerry Wood regarding a suspension stemming from an\\incident earlier this season.',
        b'President Bush #39;s  quot;revenue-neutral quot; tax reform needs losers to balance its winners, and people claiming the federal deduction for state and local taxes may be in administration planners #39; sights, news reports say.',
        b'Britain will run out of leading scientists unless science education is improved, says Professor Colin Pillinger.'],
       dtype=object)>,
 <tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 1, 2, 3])>)

In [None]:
# Let's check total examples for train, validation and test

In [33]:
# Report how many examples each split holds. Each line is labelled with its own
# split name (previously all three lines were labelled "Training").
for split_name, split_ds in (
    ('Training', train_data),
    ('Validation', validation_data),
    ('Test', test_data),
):
    print(f'Total size of the {split_name} dataset : {tf.data.experimental.cardinality(split_ds)}')

Total size of the Training dataset : 72000
Total size of the Training dataset : 48000
Total size of the Training dataset : 7600


In [None]:
# Next, the unique labels. Although these are available in the dataset description, it is still good to know how to compute them!
# The next code block iterates over the whole (lazily evaluated) training dataset, so it might be slow

In [35]:
# Materialise the whole training split in memory: iterating the dataset yields
# (text, label) pairs, which zip(*...) transposes into two parallel tuples.
text, labels = tuple(zip(*train_data))

np_text = np.array(text)
np_labels = np.array(labels)

# np.unique returns the distinct labels in sorted order, and .tolist() converts
# them to plain Python ints, so the printed list does not depend on set
# iteration order or on how NumPy scalars are rendered.
print('Unique Labels for training : ', np.unique(np_labels).tolist())

Unique Labels for training :  [0, 1, 2, 3]


In [None]:
# Next, a few useful TensorFlow Datasets metadata lookups

In [53]:
# Number of distinct label classes, read from the dataset metadata
print(f"No of unique classes : {ds_info.features['label'].num_classes}")


# Number of examples in the full 'train' split as recorded in ds_info;
# note that train_data was loaded from only the first 60% of that split
print(f"Total training examples : {ds_info.splits['train'].num_examples}")


No of unique classes : 4
