In [6]:
import numpy as np

import typing
from typing import Any, Tuple

import einops
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import tensorflow as tf
import tensorflow_text as tf_text

import pathlib
import os, sys


In [7]:
# data download


# path_2_zip = tf.keras.utils.get_file('spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
#     extract=True)

# path_2_file = pathlib.Path(path_2_zip).parent/'spa-eng/spa.txt'

# print(path_2_file)
# print(path_2_zip)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
C:\Users\Vincent\.keras\datasets\spa-eng\spa.txt
C:\Users\Vincent\.keras\datasets\spa-eng.zip


In [15]:
# Data path.
# A notebook has no real `__file__`, so the dataset is located relative to the
# current working directory (the notebook's folder when Jupyter is started
# there). The path is built component by component so the cell works with
# both Windows and POSIX separators.
fileDir = os.path.realpath(os.getcwd())
filename = pathlib.Path(fileDir) / "dataset" / "spa.txt"

print(filename)

F:\program repo\NLP-translation\dataset\spa.txt


In [18]:
def load_data(path):
    """Load tab-separated sentence pairs from a UTF-8 text file.

    Every non-blank line must contain at least two tab-separated fields. The
    first field is collected as the target sentence and the second as the
    context sentence; any further fields on the line are ignored. Blank
    lines are skipped.

    Args:
        path: Location of the text file, as a ``pathlib.Path`` or a string.

    Returns:
        A ``(target, context)`` tuple of 1-D NumPy string arrays of equal
        length, aligned so that index ``i`` in both arrays comes from the
        same line.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated
            fields.
    """
    text = pathlib.Path(path).read_text(encoding='utf-8')

    targets = []
    contexts = []
    for line_number, line in enumerate(text.splitlines(), start=1):
        # Tolerate empty lines (e.g. a trailing newline run) instead of
        # failing on tuple unpacking.
        if not line.strip():
            continue

        fields = line.split('\t')
        if len(fields) < 2:
            raise ValueError(
                f"line {line_number} of {path} does not contain a "
                f"tab-separated sentence pair: {line!r}"
            )

        # Only the first two columns are used; extra columns are dropped.
        targets.append(fields[0])
        contexts.append(fields[1])

    # dtype=str keeps the arrays string-typed even when the file is empty.
    target = np.array(targets, dtype=str)
    context = np.array(contexts, dtype=str)

    return target, context



In [20]:
# Read the sentence pairs from disk. load_data returns (target, context) in
# that order; both arrays are index-aligned.
target_raw, context_raw = load_data(filename)

Go.
Ve.


In [21]:
# Sanity check: show the first target sentence and its context sentence,
# one per line.
print(target_raw[0], context_raw[0], sep="\n")

Go.
Ve.


In [43]:
# Metadata / hyperparameters.

max_vocab_size = 5000  # passed as max_tokens to the text vectorizers

buffer_size = len(context_raw)  # shuffle buffer covers every example
batch_size = 64

train_set_proportion = 0.8

# Seed the split so that "Restart Kernel -> Run All" always assigns the same
# examples to the training and validation sets.
split_seed = 42
split_rng = np.random.default_rng(split_seed)

# Boolean mask with one entry per example: True -> training set,
# False -> validation set. Each example is drawn independently, so the
# training share is only approximately train_set_proportion.
is_train = split_rng.uniform(size=(len(target_raw), )) < train_set_proportion



In [36]:
# Set up the tf.data datasets.
def _make_raw_dataset(mask):
    """Return a shuffled, batched dataset of the (context, target) pairs selected by `mask`."""
    selected_pairs = (context_raw[mask], target_raw[mask])
    dataset = tf.data.Dataset.from_tensor_slices(selected_pairs)
    return dataset.shuffle(buffer_size).batch(batch_size)


# Examples flagged by is_train go to the training set, all others to validation.
train_set_raw = _make_raw_dataset(is_train)
val_set_raw = _make_raw_dataset(~is_train)

In [37]:
# Data check: take one training batch and show its first six context strings,
# a blank line, then the six matching target strings.

for context_strings, target_strings in train_set_raw.take(1):
    print(context_strings[:6], "", target_strings[:6], sep="\n")

tf.Tensor(
[b'Finalmente, Tom supo lo que hab\xc3\xada sucedido.'
 b'Nunca conoc\xc3\xad a mi padre.' b'Te deseo mucho \xc3\xa9xito.'
 b'Quiero saber c\xc3\xb3mo le va a Tom.' b'Va a una escuela para sordos.'
 b'Tiene una cruda horrible.'], shape=(6,), dtype=string)

tf.Tensor(
[b'Eventually, Tom found out what had happened.'
 b'I never knew my father.' b'I wish you every success.'
 b'I want to know how Tom is doing.' b'He goes to a school for the deaf.'
 b'He has a terrible hangover.'], shape=(6,), dtype=string)


In [38]:
def tf_lower_and_split_punct(text):
    """Standardize raw sentence strings before tokenization.

    Steps, in order:
      1. NFKD-normalize, which splits an accented letter into its base letter
         plus a separate combining mark.
      2. Lower-case.
      3. Delete every character except space, a-z and the punctuation
         ``. ? ! , ¿`` (this also removes the combining marks from step 1).
      4. Surround each kept punctuation character with spaces.
      5. Strip leading/trailing whitespace.
      6. Wrap the result as ``[START] <text> [END]``.

    Args:
        text: A string tensor (scalar or batch) of UTF-8 sentences.

    Returns:
        A string tensor with the standardized sentences.
    """
    decomposed = tf_text.normalize_utf8(text, 'NFKD')
    lowered = tf.strings.lower(decomposed)

    # Whitelist filter: anything outside the allowed set is removed outright.
    filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')

    # `\0` is the whole match, i.e. the punctuation character itself.
    spaced = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')
    trimmed = tf.strings.strip(spaced)

    return tf.strings.join(['[START]', trimmed, '[END]'], separator=' ')


In [41]:
# Check the standardizer on one sample sentence. Printed, one per line:
#   1. the raw UTF-8 bytes,
#   2. the bytes after NFKD normalization,
#   3. the original sentence decoded to text,
#   4. the fully standardized sentence.

example_text = tf.constant('¿Todavía está en casa?')

print(
    example_text.numpy(),
    tf_text.normalize_utf8(example_text, 'NFKD').numpy(),
    example_text.numpy().decode(),
    tf_lower_and_split_punct(example_text).numpy().decode(),
    sep="\n",
)

b'\xc2\xbfTodav\xc3\xada est\xc3\xa1 en casa?'
b'\xc2\xbfTodavi\xcc\x81a esta\xcc\x81 en casa?'
¿Todavía está en casa?
[START] ¿ todavia esta en casa ? [END]


In [44]:
# Text vectorization layer for the context sentences.

context_text_preprocessor = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,             # cap on the vocabulary size
    standardize=tf_lower_and_split_punct,  # custom clean-up run before tokenizing
    ragged=True,
)

In [45]:
# Build the context vocabulary from the context half of each training pair.
context_text_preprocessor.adapt(
    train_set_raw.map(lambda context, _target: context)
)

In [48]:
# Output check: display the first 15 entries of the adapted context vocabulary.
context_text_preprocessor.get_vocabulary()[:15]

['',
 '[UNK]',
 '[START]',
 '[END]',
 '.',
 'que',
 'de',
 'el',
 'a',
 'no',
 'tom',
 'la',
 '?',
 '¿',
 'en']

In [49]:
# Text vectorization layer for the target sentences; configured the same way
# as the context layer but adapted on the target strings.
target_text_preprocessor = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,             # cap on the vocabulary size
    standardize=tf_lower_and_split_punct,  # custom clean-up run before tokenizing
    ragged=True,
)



In [50]:
# Build the target vocabulary from the target half of each training pair.
target_text_preprocessor.adapt(
    train_set_raw.map(lambda _context, target: target)
)

In [51]:
# Output check: display the first 10 entries of the adapted target vocabulary.
target_text_preprocessor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'the', 'i', 'to', 'you', 'tom']