In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point. Only works inside Google Colab.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install -q -U tensorflow-text

[K     |████████████████████████████████| 4.9 MB 30.4 MB/s 
[K     |████████████████████████████████| 462 kB 60.4 MB/s 
[?25h

In [None]:
!pip install tensorflow-datasets==4.3

Collecting tensorflow-datasets==4.3
  Downloading tensorflow_datasets-4.3.0-py3-none-any.whl (3.9 MB)
[K     |████████████████████████████████| 3.9 MB 11.4 MB/s 
Installing collected packages: tensorflow-datasets
  Attempting uninstall: tensorflow-datasets
    Found existing installation: tensorflow-datasets 4.0.1
    Uninstalling tensorflow-datasets-4.0.1:
      Successfully uninstalled tensorflow-datasets-4.0.1
Successfully installed tensorflow-datasets-4.3.0


In [None]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow as tf

import json

In [None]:
# Restrict TensorFlow's logger to messages of level ERROR and above.
tf.get_logger().setLevel('ERROR')
# Current working directory of the kernel, as a pathlib.Path.
pwd = pathlib.Path.cwd()

# Build the Datasets

qa_train.csv: index, question, answer (tab-separated; one row per answer)

context_train.csv: index, context

qret_train.csv: index, question

context_train.txt: one context per line, used to train the subword tokenizer


In [None]:
# Location of the GermanDPR training JSON on the mounted Google Drive.
path = "/content/drive/MyDrive/dataset/GermanDPR/GermanDPR/GermanDPR_train.json"

In [None]:
# Load the training data. The encoding is given explicitly because the file
# contains German text (umlauts, typographic quotes) and must not depend on
# the platform's default locale encoding.
with open(path, "r", encoding="utf-8") as infile:
  data = json.load(infile)

# Look at the first example: dump every field, then the parts used below.
print('Data:')
data_ex = data[0]
for k, v in data_ex.items():
  print(k, v)

print('Question:')
print(data_ex['question'])
print('Answers:')
for a in data_ex['answers']:
  print(a)

print('Context:')

# Text of the last positive context of the example.
print(data_ex['positive_ctxs'][-1]['text'])

Data:
question Wie viele christlichen Menschen in Deutschland glauben an einen Gott?
answers ['75 % der befragten Katholiken sowie 67 % der Protestanten glaubten an einen Gott (2005: 85 % und 79 %)']
positive_ctxs [{'title': 'Gott', 'text': 'Gott\n\n=== Demografie ===\nEine Zusammenfassung von Umfrageergebnissen aus verschiedenen Staaten ergab im Jahr 2007, dass es weltweit zwischen 505 und 749 Millionen Atheisten und Agnostiker gibt. Laut der Encyclopædia Britannica gab es 2009 weltweit 640 Mio. Nichtreligiöse und Agnostiker (9,4 %), und weitere 139 Mio. Atheisten (2,0 %), hauptsächlich in der Volksrepublik China.\nBei einer Eurobarometer-Umfrage im Jahr 2005 wurde festgestellt, dass 52 % der damaligen EU-Bevölkerung glaubt, dass es einen Gott gibt. Eine vagere Frage nach dem Glauben an „eine andere spirituelle Kraft oder Lebenskraft“ wurde von weiteren 27 % positiv beantwortet. Bezüglich der Gottgläubigkeit bestanden große Unterschiede zwischen den einzelnen europäischen Staaten. Die

In [None]:
import csv

In [None]:
# Number of examples in the loaded training data.
print(len(data))

9275


In [None]:
# Build the QA and question-retrieval datasets (tab-separated files) together
# with a plain-text corpus used to train the subword tokenizer.
#
# Files written:
#   qa_train.csv      index, question, answer   (one row per answer)
#   qret_train.csv    index, question
#   context_train.csv index, context
#   context_train.txt one context per line
DATASET_DIR = "/content/drive/MyDrive/dataset"

# The csv module expects files opened with newline='' so that it controls line
# endings itself; utf-8 is explicit because the text is German.
with open(DATASET_DIR + "/qa_train.csv", "w", newline="", encoding="utf-8") as qa, \
     open(DATASET_DIR + "/qret_train.csv", "w", newline="", encoding="utf-8") as qret, \
     open(DATASET_DIR + "/context_train.csv", "w", newline="", encoding="utf-8") as context, \
     open(DATASET_DIR + "/context_train.txt", "w", encoding="utf-8") as bpe_train:
  qa_writer = csv.writer(qa, delimiter='\t')
  ret_writer = csv.writer(qret, delimiter='\t')
  c_writer = csv.writer(context, delimiter='\t')

  # Indices start at 1 and link the rows of the three CSV files together.
  for c_index, d in enumerate(data, start=1):
    # Keep only the last line of the last positive context.
    # NOTE(review): if a text ends with '\n' this yields an empty string --
    # confirm the data never has a trailing newline.
    context_text = d['positive_ctxs'][-1]['text'].split('\n')[-1]

    c_writer.writerow([c_index, context_text])

    for a in d['answers']:
      qa_writer.writerow([c_index, d['question'], a])

    ret_writer.writerow([c_index, d['question']])

    bpe_train.write(context_text)
    bpe_train.write('\n')

    
    



# Subword Tokenizer

https://www.tensorflow.org/text/guide/subwords_tokenizer

`train_bpe` takes the target vocabulary size as its only required input.

In [None]:
class SubwordTokenizer:
  """Learns a subword vocabulary from a plain-text corpus and writes it to disk.

  The vocabulary is learned with `bert_vocab.bert_vocab_from_dataset`; the
  "bpe" in the method and file names is kept for backward compatibility.

  Attributes:
    dataset: path to a tab-separated file with rows `index, question`.
    context: path to a tab-separated file with rows `index, context`.
    datatensor: `tf.data.Dataset` of (context, question) string pairs, built
      by `train_bpe`; `None` until then.
    vocab: the learned vocabulary tokens, set by `train_bpe`; `None` until then.
  """

  # Defaults match the files written by the dataset-building cell.
  DEFAULT_CORPUS_PATH = '/content/drive/MyDrive/dataset/context_train.txt'
  DEFAULT_VOCAB_PATH = '/content/drive/MyDrive/dataset/bpe_vocab.txt'

  def __init__(self, dataset, context):
    """Stores the input file paths; no file is read here.

    Args:
      dataset: path to the tab-separated `index, question` file.
      context: path to the tab-separated `index, context` file.
    """
    self.dataset = dataset
    self.context = context
    self.datatensor = None
    self.vocab = None

  def _generate_data(self):
    """Yields one `(context, question)` string pair per question row.

    The context file is read completely into a dict keyed by its first column;
    each question row is then joined to its context via that key.

    Raises:
      KeyError: if a question row references an index that is missing from
        the context file.
    """
    # newline='' lets the csv module handle line endings itself; utf-8 is
    # explicit so reading does not depend on the platform default.
    with open(self.dataset, 'r', newline='', encoding='utf-8') as question_file, \
         open(self.context, 'r', newline='', encoding='utf-8') as context_file:
      # Empty rows (blank lines) are skipped instead of raising IndexError.
      context_by_index = {
          row[0]: row[1]
          for row in csv.reader(context_file, delimiter='\t')
          if row
      }

      for row in csv.reader(question_file, delimiter='\t'):
        if not row:
          continue
        yield context_by_index[row[0]], row[1]

  def train_bpe(self, vocab_size, corpus_path=None, vocab_path=None):
    """Learns the vocabulary and writes it to `vocab_path`, one token per line.

    Args:
      vocab_size: the target vocabulary size.
      corpus_path: text file with one training sentence per line. Defaults to
        `DEFAULT_CORPUS_PATH`.
      vocab_path: output file for the learned vocabulary. Defaults to
        `DEFAULT_VOCAB_PATH`.

    Returns:
      None. The learned tokens are kept in `self.vocab`.
    """
    if corpus_path is None:
      corpus_path = self.DEFAULT_CORPUS_PATH
    if vocab_path is None:
      vocab_path = self.DEFAULT_VOCAB_PATH

    # Build the (context, question) dataset and save it as a tensor dataset.
    self.datatensor = tf.data.Dataset.from_generator(
        self._generate_data, output_types=(tf.string, tf.string))

    # NOTE(review): per the text.BertTokenizer documentation, lower_case=True
    # also strips accents, which would fold German umlauts -- confirm intended.
    bert_tokenizer_params = dict(lower_case=True)

    bert_vocab_args = dict(
        # The target vocabulary size
        vocab_size=vocab_size,
        # NOTE(review): no reserved tokens (e.g. padding/unknown) are
        # requested -- confirm downstream tokenization does not need them.
        reserved_tokens=[],
        # Arguments for `text.BertTokenizer`
        bert_tokenizer_params=bert_tokenizer_params,
        # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
        learn_params={},
    )
    corpus = tf.data.TextLineDataset([corpus_path])

    # `%%time` is a cell magic and is only valid as the first line of a cell,
    # so the learning step is timed explicitly instead.
    started = time.perf_counter()
    trained_vocab = bert_vocab.bert_vocab_from_dataset(
        corpus.batch(1000).prefetch(2),
        **bert_vocab_args
    )
    elapsed = time.perf_counter() - started
    print('Learned %d vocabulary entries in %.1f s' % (len(trained_vocab), elapsed))
    self.vocab = trained_vocab

    # TODO: save as a Keras model so it can be imported during preprocessing.
    with open(vocab_path, 'w', encoding='utf-8') as outfile:
      for token in trained_vocab:
        outfile.write(token)
        outfile.write('\n')
    
 

In [None]:
# Paths to the context and question-retrieval CSV files on the mounted Drive.
c_path = '/content/drive/MyDrive/dataset/context_train.csv'
d_path = '/content/drive/MyDrive/dataset/qret_train.csv'
# The constructor takes the question file first, then the context file.
tok = SubwordTokenizer(d_path, c_path)
# Learn a vocabulary with a target size of 1000.
tok.train_bpe(1000)