In [1]:
import tensorflow_datasets as tfds

In [2]:
ds = tfds.load('ai2_arc_with_ir', split = 'train', shuffle_files = True)

In [3]:
excerpt = ds.take(1)
for example in excerpt:
  print('Example:')
  print(example)
  print('\nAnswer Key:')
  print(example['answerKey'])
  print('\nChoices:')
  print(example['choices'])
  print('\nID:')
  print(example['id'])
  print('\nParagraph:')
  print(example['paragraph'])
  print('\nQuestion:')
  print(example['question'])

Example:
{'answerKey': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'choices': {'label': <tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 1, 2, 3])>, 'text': <tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'are low-density gases.', b'are not abundant on Earth.',
       b'have complete outermost energy levels.',
       b'have two electrons in their outermost energy level.'],
      dtype=object)>}, 'id': <tf.Tensor: shape=(), dtype=string, numpy=b'Mercury_401011'>, 'paragraph': <tf.Tensor: shape=(), dtype=string, numpy=b'It and the other noble gases - helium, neon, krypton, xenon, and radon - will react with other substances only under extreme conditions. The noble gases The noble, or inert, gases are helium, neon, argon, krypton, xenon and radon. The rare gases are helium, neon, argon, krypton or xenon. The noble "gases" are helium, neon, argon, krypton and xenon. The noble gases are helium, neon, argon, krypton, xenon and radon. These occur for the noble gases helium, neon, ar

In [4]:
def convert_to_alphanumeric(string):
  return ''.join(char for char in string if char.isalnum())

In [5]:
import re
def tokenize(string):
  list_of_tokens = []
  list_of_substrings = re.split(' |-', string)
  for substring in list_of_substrings:
    token = convert_to_alphanumeric(substring).lower()
    list_of_tokens.append(token)
  return list_of_tokens

In [6]:
def process(dictionary_of_tokens_and_frequencies, list_of_strings):
  for string in list_of_strings:
    list_of_tokens = tokenize(string)
    for token in list_of_tokens:
      if token in dictionary_of_tokens_and_frequencies.keys():
        dictionary_of_tokens_and_frequencies[token] = dictionary_of_tokens_and_frequencies[token] + 1
      else:
        dictionary_of_tokens_and_frequencies[token] = 1

In [8]:
dictionary_of_tokens_and_frequencies = {}
for example in ds:
  process(dictionary_of_tokens_and_frequencies, [byte_array.decode('utf-8') for byte_array in example['choices']['text'].numpy().tolist()])
  process(dictionary_of_tokens_and_frequencies, [example['paragraph'].numpy().decode('utf-8')])
  process(dictionary_of_tokens_and_frequencies, [example['question'].numpy().decode('utf-8')])
dictionary_of_tokens_and_frequencies = dict(sorted(dictionary_of_tokens_and_frequencies.items(), key=lambda item: item[1], reverse = True))
dictionary_of_tokens_and_frequencies

{'the': 19030,
 'of': 9739,
 'and': 7391,
 'a': 6561,
 'to': 5343,
 'in': 5136,
 'is': 4499,
 '': 2938,
 'which': 2757,
 'are': 2207,
 'that': 2021,
 'water': 2021,
 'from': 1811,
 'as': 1602,
 'on': 1545,
 'for': 1517,
 'by': 1414,
 'or': 1395,
 'it': 1338,
 'with': 1335,
 'energy': 1306,
 'an': 1236,
 'be': 1194,
 'can': 934,
 'most': 900,
 'have': 892,
 'earth': 869,
 'this': 845,
 'will': 828,
 'at': 823,
 'plants': 754,
 'when': 738,
 'has': 709,
 'they': 708,
 'into': 650,
 'its': 612,
 'one': 601,
 'than': 588,
 'not': 581,
 'other': 580,
 'sun': 562,
 'what': 550,
 'their': 539,
 'these': 517,
 'was': 513,
 'more': 492,
 'use': 490,
 'all': 483,
 'how': 467,
 'carbon': 466,
 'plant': 454,
 'food': 445,
 'used': 440,
 'air': 436,
 'soil': 434,
 'mass': 433,
 'chemical': 431,
 'temperature': 430,
 'light': 428,
 'different': 402,
 'two': 399,
 'if': 393,
 'like': 386,
 'same': 384,
 'many': 382,
 'system': 380,
 '2': 379,
 'such': 378,
 'would': 376,
 'but': 372,
 'cells': 370,
 

In [9]:
len(dictionary_of_tokens_and_frequencies)

15091

`ai2_arc_with_ir` is a dataset of 7,787 grade school level multiple-choice science questions assembled to encourage research in answering of questions and common-sense reasoning.

Through basic loading using TensorFlow Datasets, iteration through examples, and tokenization into words, I discovered about 15,091 words.

The top 10 most frequent words and their frequencies, excluding the empty string, are as follows.

{'the': 19030,
'of': 9739,
'and': 7391,
'a': 6561,
'to': 5343,
'in': 5136,
'is': 4499,
'': 2938,
'which': 2757,
'are': 2207,
'that': 2021}

Challenges involve understanding whether all examples are accounted for, extracting relevant strings from each example, tokenizing strings, and processing a lot of data.