In [None]:
!pip install keras-transformer

Collecting keras-transformer
  Downloading keras-transformer-0.40.0.tar.gz (9.7 kB)
Collecting keras-pos-embd==0.13.0
  Downloading keras-pos-embd-0.13.0.tar.gz (5.6 kB)
Collecting keras-multi-head==0.29.0
  Downloading keras-multi-head-0.29.0.tar.gz (13 kB)
Collecting keras-layer-normalization==0.16.0
  Downloading keras-layer-normalization-0.16.0.tar.gz (3.9 kB)
Collecting keras-position-wise-feed-forward==0.8.0
  Downloading keras-position-wise-feed-forward-0.8.0.tar.gz (4.1 kB)
Collecting keras-embed-sim==0.10.0
  Downloading keras-embed-sim-0.10.0.tar.gz (3.6 kB)
Collecting keras-self-attention==0.51.0
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
Building wheels for collected packages: keras-transformer, keras-embed-sim, keras-layer-normalization, keras-multi-head, keras-pos-embd, keras-position-wise-feed-forward, keras-self-attention
  Building wheel for keras-transformer (setup.py) ... [?25l[?25hdone
  Created wheel for keras-transformer: filename=keras_transformer

In [None]:
import numpy as np
from keras_transformer import get_model, decode
from pickle import load
from google.colab import drive

In [None]:
# Mount Google Drive inside the Colab VM, then point `filename` at the pickled
# English-Spanish sentence pairs stored there.
mount_point = '/content/drive'
drive.mount(mount_point)
filename = '/content/drive/My Drive/Goocle Collab/english-spanish.pkl'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the pickled sentence pairs. The `with` block guarantees the file handle
# is closed even if unpickling fails.
# NOTE(security): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
with open(filename, 'rb') as pickle_file:
  dataset = load(pickle_file)

# Sanity check: show one sentence pair (column 0 and column 1 of the same row).
print(dataset[120000,0])
print(dataset[120000,1])

tom is a new yorker but he doesnt have a new york accent
tom es neoyorquino pero no tiene acento de nueva york


In [None]:
# Tokenise every sentence by splitting on single spaces: column 0 of the
# dataset feeds the source side, column 1 the target side.
source_tokens = [sentence.split(' ') for sentence in dataset[:,0]]
print(source_tokens[120000])

target_tokens = [sentence.split(' ') for sentence in dataset[:,1]]
print(target_tokens[120000])

['tom', 'is', 'a', 'new', 'yorker', 'but', 'he', 'doesnt', 'have', 'a', 'new', 'york', 'accent']
['tom', 'es', 'neoyorquino', 'pero', 'no', 'tiene', 'acento', 'de', 'nueva', 'york']


In [None]:
def build_token_dict(token_list):
  """Build a token -> integer id vocabulary from tokenised sentences.

  Ids 0, 1 and 2 are reserved for '<PAD>', '<START>' and '<END>'. Every other
  token receives the next free id the first time it is seen, so ids follow
  order of first appearance.

  Args:
    token_list: iterable of sentences, each an iterable of hashable tokens.

  Returns:
    dict mapping each token to its integer id.
  """
  vocab = {'<PAD>': 0, '<START>': 1, '<END>': 2}
  for sentence in token_list:
    for word in sentence:
      # len(vocab) is evaluated before any insertion, so a new word gets the
      # next consecutive id; a known word keeps the id it already has.
      vocab.setdefault(word, len(vocab))
  return vocab

In [None]:
# One vocabulary per language.
source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)

# Reverse lookup (id -> token) for turning predicted ids back into words.
target_token_dict_inv = {index: token for token, index in target_token_dict.items()}

In [None]:
# Frame the sequences: encoder and decoder inputs are wrapped in
# <START> ... <END>; the expected output carries only the trailing <END>.
encoder_tokens = [['<START>', *sentence, '<END>'] for sentence in source_tokens]
decoder_tokens = [['<START>', *sentence, '<END>'] for sentence in target_tokens]
output_tokens = [[*sentence, '<END>'] for sentence in target_tokens]

# Longest framed sequence on each side.
source_max_len = max(len(sequence) for sequence in encoder_tokens)
target_max_len = max(len(sequence) for sequence in decoder_tokens)

# Right-pad with <PAD> so every sequence on a side has the same length. The
# output sequences are padded to the decoder length as well.
encoder_tokens = [sequence + (source_max_len - len(sequence)) * ['<PAD>'] for sequence in encoder_tokens]
decoder_tokens = [sequence + (target_max_len - len(sequence)) * ['<PAD>'] for sequence in decoder_tokens]
output_tokens = [sequence + (target_max_len - len(sequence)) * ['<PAD>'] for sequence in output_tokens]

In [None]:
# Replace every token by its vocabulary id. Each output id is wrapped in its
# own one-element list, which adds a trailing axis of size 1 to the targets.
encoder_input = [[source_token_dict[token] for token in sequence] for sequence in encoder_tokens]
decoder_input = [[target_token_dict[token] for token in sequence] for sequence in decoder_tokens]
output_decoded = [[[target_token_dict[token]] for token in sequence] for sequence in output_tokens]

print(encoder_input[120000])

[1, 56, 258, 120, 197, 12666, 2914, 32, 1577, 140, 120, 197, 5385, 4287, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# A single vocabulary size, large enough for both dictionaries, is passed as
# `token_num`.
vocab_size = max(len(source_token_dict), len(target_token_dict))

modelo = get_model(
    token_num=vocab_size,
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,
)
# Targets are integer ids rather than one-hot vectors, hence the sparse loss.
modelo.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
modelo.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Encoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Encoder-Token-Embedding (Embed  [(None, None, 32),  808608      ['Encoder-Input[0][0]']          
 dingRet)                        (25269, 32)]                                                     
                                                                                                  
 Encoder-Embedding (TrigPosEmbe  (None, None, 32)    0           ['Encoder-Token-Embedding[0][0]']
 dding)                                                                                           
                                                                                              

In [None]:
# Model inputs: [encoder ids, decoder ids]; targets: the expected output ids.
x = [np.asarray(encoder_input), np.asarray(decoder_input)]
y = np.asarray(output_decoded)

modelo.fit(x, y, batch_size=32, epochs=15)

# Disabled alternative to training: reload previously saved weights from Drive.
# NOTE(review): the original comment called `model.load_weights`; the model
# variable in this notebook is `modelo`.
#filename = '/content/drive/My Drive/Goocle Collab/traductor.h5'
#modelo.load_weights(filename)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f209c67efd0>

In [None]:
def translate(sentence):
  """Translate a sentence with the trained model and print the result.

  The sentence is lower-cased, split on single spaces (the same tokenisation
  used to build the training data), mapped to source-vocabulary ids, decoded
  with `modelo`, and printed together with the original sentence.

  Args:
    sentence: source sentence as a plain string, words separated by spaces.

  Raises:
    KeyError: if any word is missing from `source_token_dict`; the message
      lists the offending words.
  """
  # The vocabulary is built from lower-case text, so normalise the input
  # before looking words up.
  sentence = sentence.lower()
  words = sentence.split(' ')

  # Fail with an explicit message instead of a bare KeyError from the lookup.
  unknown_words = [word for word in words if word not in source_token_dict]
  if unknown_words:
    raise KeyError('Palabras fuera del vocabulario: {}'.format(', '.join(unknown_words)))

  # Frame the input like the training encoder sequences, which are
  # <START> + tokens + <END>; the original omitted <START>. The trailing
  # <PAD> is retained from the original input framing.
  framed_words = ['<START>'] + words + ['<END>', '<PAD>']
  tr_input = [source_token_dict[word] for word in framed_words]

  decoded = decode(
      modelo,
      tr_input,
      start_token = target_token_dict['<START>'],
      end_token = target_token_dict['<END>'],
      pad_token = target_token_dict['<PAD>']
  )

  # Drop the leading start token. Drop the last token only when it really is
  # <END>: decoding can also stop on a length or repetition limit, in which
  # case the last id is a real word that must be kept.
  translated_ids = list(decoded[1:])
  if translated_ids and translated_ids[-1] == target_token_dict['<END>']:
    translated_ids = translated_ids[:-1]

  print('Frase original: {}'.format(sentence))
  print('Traducción: {}'.format(' '.join(target_token_dict_inv[token_id] for token_id in translated_ids)))

In [None]:
# Demo: an everyday sentence written with a capital letter.
translate('I saw a dog in the park yesterday')

Frase original: i saw a dog in the park yesterday
Traducción: ayer vi un perro en el parque


In [None]:
# Demo: a sentence with mathematical vocabulary.
translate('the function is not continuous in the origin')

Frase original: the function is not continuous in the origin
Traducción: la funcion no continuo segui en el origen


In [None]:
# NOTE(review): the recorded output of this cell is a KeyError; presumably at
# least one of these words is not a key of source_token_dict - confirm.
translate("i do not trust in the axiom of choice")

KeyError: ignored

In [None]:
# Demo: a single-word input.
translate('hello')

Frase original: hello
Traducción: hay nada


In [None]:
# Demo: a short two-word phrase.
translate('shut up')

Frase original: shut up
Traducción: callate


In [None]:
# Save the trained model as 'traductor.h5'. The path is relative, so the file
# is written to the current working directory.
# NOTE(review): the commented-out reload snippet in the training cell points at
# a Drive path instead - confirm where the file is meant to live.
modelo.save('traductor.h5')