In [1]:
#!pip install faker

from tensorflow.keras.layers import Concatenate, Input, LSTM, Attention, TimeDistributed, Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K

import numpy as np
from faker import Faker
import random
from babel.dates import format_date
import keras

In [2]:
# Seed value for reproducible fake-data generation (passed to Faker.seed).
seed = 42

In [3]:
# Faker instance used by the following cells to draw random dates.
fake = Faker()
# Seed Faker with the notebook seed so that the drawn dates are repeatable.
Faker.seed(seed)

### Test the functionality of the faker package

In [4]:
# Smoke test: draw five fake date strings and show one per line.
print('\n'.join(fake.date() for _ in range(5)))

2004-11-09
1971-05-14
1984-12-29
1982-03-03
2010-02-24


In [5]:
# Test the fake date functionality: render one random date per format,
# first with Babel's named formats, then with custom patterns.
#
# The custom patterns use lowercase 'y' (calendar year). Uppercase 'Y' is the
# ISO week-numbering year in Babel's pattern syntax, which can differ from the
# calendar year for dates in the days around New Year.
demo_formats = [
    'long',
    'full',
    'medium',
    'short',
    'd M yyy',
    'dd MM yyy',
    'd MMM yy',
    'd MMMM yyy',
    'd MMMM, yyy',
    'd MMMM yyy',
    'EEE, MMM d, yyyy',
    'EEEE, MMM d, yy',
    'EEEE, MMM d, yyyy',
    'MMM d, yyyy, EEEE',
    'EEE MMM d yyyy',
    'EEEE MMM d yyyy',
]

for demo_format in demo_formats:
    print(format_date(fake.date_object(), format=demo_format, locale='en'))

November 22, 2006
Tuesday, August 21, 2018
Sep 28, 1974
1/1/93
17 8 1971
02 12 1981
20 Jul 97
13 June 1971
3 November, 1980
6 June 2005
Thu, Sep 16, 1999
Thursday, Jan 7, 82
Friday, Feb 15, 2002
Feb 16, 2014, Sunday
Sun May 10 1970
Friday Dec 6 2013


In [6]:
# Pool of Babel date formats that the dataset generator samples from.
# Named formats are repeated on purpose so that they are drawn more often
# than each individual custom pattern.
#
# Custom patterns use lowercase 'y' (calendar year). Uppercase 'Y' is the ISO
# week-numbering year in Babel's pattern syntax; around New Year it can differ
# from the calendar year, which would make the human-readable text disagree
# with its ISO label.
#
# NOTE: the name shadows the built-in `format`; it is kept because the
# dataset-generation cell refers to it by this name.
format = [
    'long',
    'long',
    'long',
    'full',
    'full',
    'full',
    'medium',
    'medium',
    'medium',
    'short',
    'short',
    'short',
    'd M yyy',
    'dd MM yyy',
    'd MMM yy',
    'd MMMM yyy',
    'd MMMM, yyy',
    'd MMMM yyy',
    'EEE, MMM d, yyyy',
    'EEEE, MMM d, yy',
    'EEEE, MMM d, yyyy',
    'MMM d, yyyy, EEEE',
    'MMM dd, yyyy, EEEE',
    'EEE MMM d yyyy',
    'EEEE MMM d yyyy',
]

In [7]:
# Example of one date in the standard (ISO) format and two human-readable renderings.
date = fake.date_object()
print(date)
# Lowercase 'y' is the calendar year; uppercase 'Y' would be the ISO
# week-numbering year, which can be off by one around New Year.
print(format_date(date, format='d M yyy', locale='en'))
print(format_date(date, format='medium', locale='en'))
        

2008-01-23
23 1 2008
Jan 23, 2008


### Collecting data

In [8]:
# Seed Python's `random` module (used by random.choice when picking a date format).
# This must be a call: assigning to `random.seed` would replace the function
# with an integer and leave the generator unseeded.
random.seed(42)

In [9]:
def generate_dataset(num_iterations):
    """Generate pairs of human-readable dates and their ISO-formatted labels.

    Each iteration draws a random date from `fake`, stores `str(date)` as the
    label, and renders the same date with a format picked at random from the
    module-level `format` list.

    All containers are created inside the function, so every call returns a
    fresh dataset instead of appending to data left over from earlier calls.

    Args:
        num_iterations: number of (human text, label) pairs to generate.

    Returns:
        A tuple `(date_to_transform, label, human_vocab, machine_vocab)`:
        the list of human-readable strings, the list of label strings, and
        the sets of characters seen on each side. The human vocabulary is
        collected from the lowercased text with commas removed; the stored
        strings themselves are left unmodified.
    """
    date_to_transform = []
    label = []
    human_vocab = set()
    machine_vocab = set()

    for _ in range(num_iterations):
        date = fake.date_object()

        machine_text = str(date)
        label.append(machine_text)
        machine_vocab.update(machine_text.lower())

        human_text = str(format_date(date, format=random.choice(format), locale='en'))
        date_to_transform.append(human_text)
        human_vocab.update(human_text.lower().replace(',', ''))

    return date_to_transform, label, human_vocab, machine_vocab


date_to_transform, label, human_vocab, machine_vocab = generate_dataset(30000)



In [10]:
# Peek at the machine-format label stored at index 1.
label[1]

'1978-06-23'

In [11]:
# Peek at the human-format input stored at index 1.
date_to_transform[1]

'6/23/78'

In [12]:
# Give every machine-side character an integer id, numbered in sorted character order.
machine_vocab = dict(zip(sorted(machine_vocab), range(len(machine_vocab))))
machine_vocab

{'-': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10}

In [13]:
# Add the two special tokens to the human-side character set:
# '<unk>' stands in for characters missing from the vocabulary and
# '<pad>' fills encoded sequences up to a fixed length.
human_vocab |= {'<unk>', '<pad>'}

In [14]:
# Give every human-side token an integer id, numbered in sorted order.
human_vocab = dict(zip(sorted(human_vocab), range(len(human_vocab))))
human_vocab

{' ': 0,
 '/': 1,
 '0': 2,
 '1': 3,
 '2': 4,
 '3': 5,
 '4': 6,
 '5': 7,
 '6': 8,
 '7': 9,
 '8': 10,
 '9': 11,
 '<pad>': 12,
 '<unk>': 13,
 'a': 14,
 'b': 15,
 'c': 16,
 'd': 17,
 'e': 18,
 'f': 19,
 'g': 20,
 'h': 21,
 'i': 22,
 'j': 23,
 'l': 24,
 'm': 25,
 'n': 26,
 'o': 27,
 'p': 28,
 'r': 29,
 's': 30,
 't': 31,
 'u': 32,
 'v': 33,
 'w': 34,
 'y': 35}

In [15]:
# Show the first (human text, label) pair of the generated dataset.
sample_idx = 0
print(f'text to transform: {date_to_transform[sample_idx]} -- transformed text {label[sample_idx]}')

text to transform: Jul 19, 1988 -- transformed text 1988-07-19


### Preprocessing data

In [16]:
# Tx: fixed length of an encoded human-format input; longer strings are
# truncated and shorter ones are padded to this many characters.
Tx = 30
# Ty: number of characters in a machine-format date (e.g. '1978-06-23' has 10).
Ty = 10

In [17]:
# Transform strings to integer representations.
# Human-format data additionally gets unknown tokens and padding.
def string_to_int_human(string, length_max, vocab):
    """Encode a human-readable date string as a fixed-length list of ids.

    The text is first normalised the same way the human vocabulary is built
    (lowercased, commas removed). Without this step every capital letter and
    every comma would be encoded as '<unk>', because the vocabulary only
    contains lowercase, comma-free characters.

    Args:
        string: human-readable date text.
        length_max: length of the returned list; longer text is truncated,
            shorter text is right-padded with the '<pad>' id.
        vocab: mapping from character to integer id; must contain '<unk>'
            and (when padding is needed) '<pad>'.

    Returns:
        A list of `length_max` integer ids.
    """
    # Normalise before truncating so that dropped commas do not use up slots.
    normalized = string.lower().replace(',', '')[:length_max]

    representation = [vocab.get(char, vocab['<unk>']) for char in normalized]

    missing = length_max - len(representation)
    if missing > 0:
        representation += [vocab['<pad>']] * missing

    return representation

def string_to_int_machine(string, length_max, vocab):
    """Encode a machine-format date string as a list of integer ids.

    Unlike the human-side encoder there is no unknown-token fallback: every
    character must be present in `vocab`. An unknown character raises
    immediately instead of silently putting `None` into the result.

    Args:
        string: machine-format text, e.g. '2012-11-07'.
        length_max: length of the returned list; longer text is truncated.
        vocab: mapping from character to integer id. A '<pad>' entry is only
            required when `string` is shorter than `length_max`.

    Returns:
        A list of `length_max` integer ids.

    Raises:
        KeyError: if a character is missing from `vocab`, or if padding is
            needed but `vocab` has no '<pad>' entry.
    """
    string = string[:length_max]

    representation = []
    for char in string:
        if char not in vocab:
            raise KeyError(f'character {char!r} is not in the machine vocabulary')
        representation.append(vocab[char])

    missing = length_max - len(representation)
    if missing > 0:
        if '<pad>' not in vocab:
            raise KeyError(
                f"'<pad>' is not in the machine vocabulary, cannot pad "
                f"{string!r} to length {length_max}"
            )
        representation += [vocab['<pad>']] * missing

    return representation

In [18]:
# Quick check: encode a 10-character date with the machine vocabulary
# (the string is exactly length_max long, so no padding is involved).
string_to_int_machine('2012-11-07', 10, machine_vocab)

[3, 1, 2, 3, 0, 2, 2, 0, 1, 8]

In [19]:
# Transform the raw strings into integer arrays and their one-hot encodings.
def preprocess_data(date_to_transform, label,  human_vocab, machine_vocab, Tx, Ty):
    """Encode the dataset as integer id arrays and one-hot arrays.

    Args:
        date_to_transform: iterable of human-readable date strings.
        label: iterable of machine-format date strings.
        human_vocab: character-to-id mapping for the human side.
        machine_vocab: character-to-id mapping for the machine side.
        Tx: encoded length of every human-side sequence.
        Ty: encoded length of every machine-side sequence.

    Returns:
        `(X, Y, Xoh, Yoh)`: the integer-encoded inputs and labels followed
        by their one-hot encodings, with as many classes as the respective
        vocabulary has entries.
    """
    n_human_classes = len(human_vocab)
    n_machine_classes = len(machine_vocab)

    X = np.array([string_to_int_human(text, Tx, human_vocab) for text in date_to_transform])
    Y = np.array([string_to_int_machine(text, Ty, machine_vocab) for text in label])

    # One-hot encode sequence by sequence, then stack into a single array.
    Xoh = np.array([to_categorical(row, num_classes=n_human_classes) for row in X])
    Yoh = np.array([to_categorical(row, num_classes=n_machine_classes) for row in Y])

    return X, Y, Xoh, Yoh


X, Y, Xoh, Yoh = preprocess_data(date_to_transform, label, human_vocab, machine_vocab, Tx, Ty)

In [20]:
# Sanity check of the array shapes: integer-encoded X and Y, then one-hot Xoh and Yoh.
print(X.shape, Y.shape, Xoh.shape, Yoh.shape, sep='\n')

(30000, 30)
(30000, 10)
(30000, 30, 36)
(30000, 10, 11)


### Initializing Architecture and Training

In [21]:
from sklearn.model_selection import train_test_split

# Vocabulary sizes determine the width of the one-hot inputs and of the output layer.
len_human_vocab, len_machine_vocab = len(human_vocab), len(machine_vocab)

def model_attention(Tx, Ty, len_human_vocab, len_machine_vocab):
    """Build the encoder-decoder LSTM model with an attention layer.

    Args:
        Tx: number of time steps of the encoder (human-side) input.
        Ty: number of time steps of the decoder (machine-side) input.
        len_human_vocab: width of each one-hot encoder input vector.
        len_machine_vocab: width of each one-hot decoder input vector and
            number of output classes per time step.

    Returns:
        An uncompiled Keras `Model` taking `[encoder_inputs, decoder_inputs]`
        and returning a softmax distribution over the machine vocabulary for
        each of the `Ty` decoder steps.
    """
    encoder_inputs = Input(shape=(Tx, len_human_vocab))
    decoder_inputs = Input(shape=(Ty, len_machine_vocab))

    # Encoder: keep the full output sequence (for attention) and the final states.
    encoder_lstm = LSTM(256, return_sequences=True, return_state=True, name='encoder_lstm')
    encoder_outputs, encoder_h, encoder_c = encoder_lstm(encoder_inputs)

    # Decoder: starts from the encoder's final states; its own states are not used.
    decoder_lstm = LSTM(256, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs = decoder_lstm(decoder_inputs, initial_state=[encoder_h, encoder_c])[0]

    # Attention with the decoder outputs as query and the encoder outputs as value.
    context = Attention(name='attention_layer')([decoder_outputs, encoder_outputs])

    # Each decoder step sees its own output together with its attention context.
    merged = Concatenate(axis=-1)([decoder_outputs, context])

    char_classifier = TimeDistributed(Dense(len_machine_vocab, activation='softmax'))

    return Model([encoder_inputs, decoder_inputs], char_classifier(merged))

# Build the model and compile it for per-character classification.
model = model_attention(Tx, Ty, len_human_vocab, len_machine_vocab)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and emits a warning when the optimizer is constructed.
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 30, 36)]             0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 10, 11)]             0         []                            
                                                                                                  
 encoder_lstm (LSTM)         [(None, 30, 256),            300032    ['input_1[0][0]']             
                              (None, 256),                                                        
                              (None, 256)]                                                        
                                                                                              

  super().__init__(name, **kwargs)


In [22]:
# Hold out 20% of the one-hot encoded samples for testing; random_state fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(Xoh, Yoh, test_size=0.2, random_state=42)

In [23]:
# Teacher forcing: the decoder input is the target sequence shifted right by
# one step, so at step t the decoder is fed the true character of step t-1.
decoder_input_data = np.zeros(Y_train.shape)
decoder_input_data[:, 1:, :] = Y_train[:, :-1, :]
# Step 0 has no previous character, so class index 0 is switched on as the
# start marker.
# NOTE(review): per the machine vocabulary printed earlier, index 0 is also the
# id of '-', i.e. there is no dedicated start token — confirm this is intended.
decoder_input_data[:, 0, 0] = 1

# Train the model
model.fit(
    [X_train, decoder_input_data],
    Y_train,
    batch_size=15,
    epochs=10,
    validation_split=0.2
)

# Save the model
model.save('model_attention.keras')

# Load the model back through the same `tensorflow.keras` API that built it
# (`load_model` is imported at the top of the notebook) rather than through the
# standalone `keras` package, so saving and loading cannot end up on
# mismatched Keras implementations.
model = load_model('model_attention.keras')

Epoch 1/10


W0000 00:00:1720353328.688865       1 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 8 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




W0000 00:00:1720353441.877983       1 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 8 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Inference phase

In [26]:
# Evaluate on the held-out set. The decoder is fed an all-zero input, so no
# label information is passed to the model at evaluation time.
# NOTE(review): training fed the decoder the shifted target sequence, so this
# input differs from what the decoder saw during training — confirm this
# inference scheme is intended.
blank_decoder_input = np.zeros((len(X_test), Ty, len(machine_vocab)))
loss, accuracy = model.evaluate([X_test, blank_decoder_input], Y_test)
print(f'Accuracy on test data: {accuracy * 100:.2f}%')

Accuracy on test data: 98.88%


In [27]:
# Check the prediction in the human readable format

# Invert the vocabulary and decode a sequence of vectors into a string.
def one_hot_to_str(oh, vocab):
    """Decode a sequence of one-hot (or score) vectors into text.

    Args:
        oh: iterable of vectors; the position of the largest entry of each
            vector is taken as the character id.
        vocab: mapping from character to integer id.

    Returns:
        The string made of the characters whose ids were selected, in order.
    """
    index_to_char = dict((index, char) for char, index in vocab.items())

    decoded_chars = []
    for vec in oh:
        decoded_chars.append(index_to_char[np.argmax(vec)])

    return ''.join(decoded_chars)

def predict(model, X_test, machine_vocab, Ty):
    """Run the model on `X_test` and decode every output sequence to a string.

    The decoder input is an all-zero array of shape
    `(len(X_test), Ty, len(machine_vocab))`.
    NOTE(review): training fed the decoder the shifted target sequence, so
    this input differs from the training-time one — confirm this is intended.

    Args:
        model: model exposing `predict` for `[encoder_input, decoder_input]`.
        X_test: one-hot encoded human-format inputs.
        machine_vocab: character-to-id mapping used to decode the outputs.
        Ty: number of decoder time steps.

    Returns:
        A list with one decoded string per input sample.
    """
    blank_decoder_input = np.zeros((len(X_test), Ty, len(machine_vocab)))
    per_step_scores = model.predict([X_test, blank_decoder_input])

    return [one_hot_to_str(scores, machine_vocab) for scores in per_step_scores]

# Decode the whole test set, then compare the first ten predictions with their labels.
prediction = predict(model, X_test, machine_vocab, Ty)

n_preview = 10
for sample_idx in range(n_preview):
    print('Predicted: ', prediction[sample_idx])
    print('Label: ', one_hot_to_str(Y_test[sample_idx], machine_vocab))

Predicted:  1970-07-30
Label:  1970-07-30
Predicted:  2012-03-20
Label:  2012-03-20
Predicted:  1991-01-04
Label:  1991-01-04
Predicted:  2009-02-28
Label:  2009-02-28
Predicted:  2012-01-21
Label:  2012-01-21
Predicted:  1992-12-15
Label:  1992-12-15
Predicted:  1991-05-17
Label:  1991-05-17
Predicted:  2001-11-23
Label:  2001-11-23
Predicted:  1990-03-22
Label:  1990-03-21
Predicted:  1985-03-02
Label:  1985-03-02
