In [58]:
import re
import json
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Bidirectional
from keras.models import Model, model_from_json
from keras.callbacks import EarlyStopping 
from keras.losses import CategoricalCrossentropy

1. Text Cleaning
2. Put <BOS> tag and <EOS> tag for decoder input
3. Make Vocabulary (VOCAB_SIZE)
4. Tokenize Bag of words to Bag of IDs
5. Padding (MAX_LEN)
6. Word Embedding (EMBEDDING_DIM)
7. Reshape the Data depends on neural network shape
8. Split Data for training and validation, testing

In [2]:
def clean_data(text):
    pass

def text2seq(train, label, VOCAB_SIZE):
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, split=' ', filters='')
    tokenizer.fit_on_texts(train + label)
    encode_seq = tokenizer.texts_to_sequences(train)
    decode_seq = tokenizer.texts_to_sequences(label)
    dictionary = tokenizer.word_index
    
    word2idx = {}
    idx2word = {}
    for k, v in dictionary.items():
        if v < VOCAB_SIZE:
            word2idx[k] = v
            idx2word[v] = k
        if v >= VOCAB_SIZE-1:
            continue
          
    return word2idx, idx2word, encode_seq, decode_seq

def padding(seq, MAX_LEN):
    encode_seq = pad_sequences(seq, maxlen=MAX_LEN, dtype='int32', padding='post', truncating='post')
    return encode_seq

def embedding_layer_creater(VOCAB_SIZE, EMBEDDING_DIM, MAX_LEN, embedding_matrix):
    embedding_layer = Embedding(input_dim = VOCAB_SIZE, 
                                output_dim = EMBEDDING_DIM,
                                input_length = MAX_LEN,
                                weights = [embedding_matrix], trainable = False)
    return embedding_layer


In [3]:
def reg_sen(x):
    return re.findall(r"sin|cos|tan|\d|\w|\(|\)|\+|-|\*+", x.strip().lower())

In [4]:
with open('train.txt') as file:
    raw = file.readlines()
train = []
label = []
for row in raw:
    row = row.strip('\n')
    _train, _label = row.split('=')
    _train = reg_sen(_train)
    _label = reg_sen(_label)
    _label = ['#'] + _label + ['$']
    train.append(' '.join(_train))
    label.append(' '.join(_label))

In [5]:
print(train[:5])
print(label[:5])

['( 7 - 3 * z ) * ( - 5 * z - 9 )', '- 9 * s ** 2', '( 2 - 2 * n ) * ( n - 1 )', 'x ** 2', '( 4 - x ) * ( x - 2 3 )']
['# 1 5 * z ** 2 - 8 * z - 6 3 $', '# - 9 * s ** 2 $', '# - 2 * n ** 2 + 4 * n - 2 $', '# x ** 2 $', '# - x ** 2 + 2 7 * x - 9 2 $']


In [6]:
train_unique = len(set(' '.join(train).split(' ')))
label_unique = len(set(' '.join(label).split(' ')))

MAX_LEN = 29
VOCAB_SIZE = max(train_unique, label_unique)+1
print(VOCAB_SIZE)
EMBEDDING_DIM = 64

35


In [7]:
word2idx, idx2word, encode_seq, decode_seq = text2seq(train, label, VOCAB_SIZE)
print('len of word2idx:', len(word2idx))
print('len of idx2word', len(idx2word))

len of word2idx: 34
len of idx2word 34


In [8]:
word2idx[' '] = 0
idx2word[0] = ' '

In [109]:
with open('./weights/word2idx.json', 'w') as file:
    json.dump(word2idx, file)
with open('./weights/idx2word.json', 'w') as file:
    json.dump(idx2word, file)

In [10]:
for i in range(5):
    encode_tmp = encode_seq[i]
    decode_tmp = decode_seq[i]
    print([idx2word[k] for k in encode_tmp])
    print(train[i].split(' '))
    print([idx2word[k] for k in decode_tmp])
    print(label[i].split(' '))

['(', '7', '-', '3', '*', 'z', ')', '*', '(', '-', '5', '*', 'z', '-', '9', ')']
['(', '7', '-', '3', '*', 'z', ')', '*', '(', '-', '5', '*', 'z', '-', '9', ')']
['#', '1', '5', '*', 'z', '**', '2', '-', '8', '*', 'z', '-', '6', '3', '$']
['#', '1', '5', '*', 'z', '**', '2', '-', '8', '*', 'z', '-', '6', '3', '$']
['-', '9', '*', 's', '**', '2']
['-', '9', '*', 's', '**', '2']
['#', '-', '9', '*', 's', '**', '2', '$']
['#', '-', '9', '*', 's', '**', '2', '$']
['(', '2', '-', '2', '*', 'n', ')', '*', '(', 'n', '-', '1', ')']
['(', '2', '-', '2', '*', 'n', ')', '*', '(', 'n', '-', '1', ')']
['#', '-', '2', '*', 'n', '**', '2', '+', '4', '*', 'n', '-', '2', '$']
['#', '-', '2', '*', 'n', '**', '2', '+', '4', '*', 'n', '-', '2', '$']
['x', '**', '2']
['x', '**', '2']
['#', 'x', '**', '2', '$']
['#', 'x', '**', '2', '$']
['(', '4', '-', 'x', ')', '*', '(', 'x', '-', '2', '3', ')']
['(', '4', '-', 'x', ')', '*', '(', 'x', '-', '2', '3', ')']
['#', '-', 'x', '**', '2', '+', '2', '7', '*', 'x'

In [11]:
encode_input = padding(encode_seq, MAX_LEN)
decode_input = padding(decode_seq, MAX_LEN)

In [60]:
# start_array = np.full((decode_input_raw.shape[0], 1), word2idx['#'])
# decode_input = np.append(start_array, decode_input_raw, 1)

# end_array = np.full((decode_input.shape[0], 1), word2idx['$'])
# decode_target = np.append(decode_input_raw, end_array , 1)

In [13]:
encoder_input_data = np.zeros((len(train), MAX_LEN, VOCAB_SIZE), dtype='float32')
decoder_input_data = np.zeros((len(train), MAX_LEN, VOCAB_SIZE), dtype='float32')
decoder_target_data = np.zeros((len(label), MAX_LEN, VOCAB_SIZE), dtype='float32')

In [14]:
#parse the input and output texts
for i, (_encode, _decode) in enumerate(zip(encode_input, decode_input)):
    for t, char in enumerate(_encode):
        encoder_input_data[i, t, char] = 1.
    for t, char in enumerate(_decode):
        decoder_input_data[i, t, char] = 1.
        if t > 0:
            decoder_target_data[i, t-1, char] = 1.

In [15]:
# #parse the input and output texts
# for i, (_encode, _decode_input, _decode_target) in enumerate(zip(encode_input, decode_input, decode_target)):
#     for t, char in enumerate(_encode):
#         encoder_input_data[i, t, char] = 1.
#     for t, (char_input, char_target) in enumerate(zip(_decode_input, _decode_target)):
#         decoder_input_data[i, t, char_input] = 1.
#         decoder_target_data[i, t, char_target] = 1.

In [16]:
#define an input of the encoder with length as the number of encoder tokens
encoder_inputs = Input(shape=(None, VOCAB_SIZE))
#instantiate the LSTM model
encoder = Bidirectional(LSTM(EMBEDDING_DIM, return_state=True))
#define the outputs and states of the encoder
#encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
#disregard encoder_outputs and keep only the states
#encoder_states = [state_h, state_c]
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

2022-08-12 17:20:48.174020: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
decoder_inputs = Input(shape=(None, VOCAB_SIZE))    
decoder_lstm = LSTM(EMBEDDING_DIM*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [18]:
# #define an input of the encoder with length as the number of encoder tokens
# decoder_inputs = Input(shape=(None, VOCAB_SIZE))
# #define the LSTM model for the decoder setting the return sequences and return state to True
# decoder_lstm = LSTM(EMBEDDING_DIM, return_sequences=True, return_state=True)
# #define only the decoder output for the training model. The states are only needed in the inference model
# decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)
# #define the training model which requires the encoder_input_data and decoder_input_data to return the decoder_target_data
# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [19]:
# for i in range(5):

#     print([idx2word[k] for k in encode_input[i]])
#     print(train[i].split(' '))
#     print([idx2word[k] for k in decode_input[i]])
#     print(label[i].split(' '))
    
#     encode_idx = np.argmax(encoder_input_data[i],axis=1)
#     print([idx2word[k] for k in encode_idx])
#     decode_idx1 = np.argmax(decoder_input_data[i],axis=1)
#     print([idx2word[k] for k in decode_idx1])
#     decode_idx2 = np.argmax(decoder_target_data[i],axis=1)
#     print([idx2word[k] for k in decode_idx2])

In [64]:
decoder_input_data[0].shape

(29, 35)

In [91]:
tmp = np.logical_not(np.equal(np.argmax(decoder_input_data[0],axis=1), 0))
print(tmp.shape)
np.tile(tmp, [1,1,5]).shape

(29,)


(1, 1, 145)

In [102]:
def loss_func(targets, pred):
    cross_entropy = CategoricalCrossentropy()
    mask = tf.math.logical_not(tf.math.equal(tf.math.argmax(pred,axis=2), 0))
    mask = tf.expand_dims(mask, axis=-1)
    loss = cross_entropy(targets, pred, sample_weight=mask)
    return loss

In [103]:
#Train the model
model.compile(optimizer='rmsprop', loss=loss_func, metrics=['accuracy'])
callback = EarlyStopping(monitor='loss', patience=1)

history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
         batch_size=128,
         epochs=10,
         validation_split=0.2, verbose=2, callbacks=[callback])

Epoch 1/10
6250/6250 - 660s - loss: 0.0349 - accuracy: 0.9872 - val_loss: 0.0287 - val_accuracy: 0.9892 - 660s/epoch - 106ms/step
Epoch 2/10
6250/6250 - 679s - loss: 0.0294 - accuracy: 0.9885 - val_loss: 0.0243 - val_accuracy: 0.9912 - 679s/epoch - 109ms/step
Epoch 3/10
6250/6250 - 693s - loss: 0.0267 - accuracy: 0.9895 - val_loss: 0.0254 - val_accuracy: 0.9896 - 693s/epoch - 111ms/step
Epoch 4/10
6250/6250 - 741s - loss: 0.0246 - accuracy: 0.9903 - val_loss: 0.0215 - val_accuracy: 0.9922 - 741s/epoch - 119ms/step
Epoch 5/10
6250/6250 - 602s - loss: 0.0229 - accuracy: 0.9909 - val_loss: 0.0209 - val_accuracy: 0.9918 - 602s/epoch - 96ms/step
Epoch 6/10
6250/6250 - 606s - loss: 0.0214 - accuracy: 0.9915 - val_loss: 0.0196 - val_accuracy: 0.9924 - 606s/epoch - 97ms/step
Epoch 7/10
6250/6250 - 596s - loss: 0.0202 - accuracy: 0.9920 - val_loss: 0.0176 - val_accuracy: 0.9936 - 596s/epoch - 95ms/step
Epoch 8/10
6250/6250 - 595s - loss: 0.0191 - accuracy: 0.9924 - val_loss: 0.0164 - val_accura

In [104]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(EMBEDDING_DIM*2,))
decoder_state_input_c = Input(shape=(EMBEDDING_DIM*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [108]:
with open('./weights/encoder_model.json', 'w', encoding='utf8') as f:
    f.write(encoder_model.to_json())
encoder_model.save_weights('./weights/encoder_model_weights.h5')

with open('./weights/decoder_model.json', 'w', encoding='utf8') as f:
    f.write(decoder_model.to_json())
decoder_model.save_weights('./weights/decoder_model_weights.h5')

In [118]:
with open('network.txt', 'w') as f:
    model.summary(print_fn=lambda x: f.write(x + '\n'))

In [110]:
def load_model(model_filename, model_weights_filename):
    with open(model_filename, 'r', encoding='utf8') as f:
        model = model_from_json(f.read())
    model.load_weights(model_weights_filename)
    return model

encoder = load_model('./weights/encoder_model.json', './weights/encoder_model_weights.h5')
decoder = load_model('./weights/decoder_model.json', './weights/decoder_model_weights.h5')

In [111]:
def decode_sequence(input_seq):
    states_value = encoder.predict(input_seq)
    target_seq = np.zeros((1, 1, VOCAB_SIZE))
    target_seq[0, 0, word2idx['#']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = idx2word[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '$' or len(decoded_sentence) > MAX_LEN):
            stop_condition = True

        target_seq = np.zeros((1, 1, VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]


    return decoded_sentence

In [113]:
for seq_index in range(100,120):

    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)[:-1]
    print('\n')
    print(f"Input sentence: {train[seq_index]}")
    print(f"Decoded sentence: {decoded_sentence}")



Input sentence: ( 2 1 - 7 * n ) * ( 6 * n + 4 )
Decoded sentence: -42*n**2+98*n+84


Input sentence: ( 1 4 - 9 * k ) * ( 4 * k + 8 )
Decoded sentence: -36*k**2-16*k+112


Input sentence: ( 1 3 - s ) * ( 7 * s + 6 )
Decoded sentence: -7*s**2+85*s+78


Input sentence: 3 * a * ( 4 * a - 6 )
Decoded sentence: 12*a**2-18*a


Input sentence: ( 1 6 - 5 * s ) * ( s - 3 2 )
Decoded sentence: -5*s**2+176*s-512


Input sentence: ( 1 3 - 5 * s ) * ( s + 2 )
Decoded sentence: -5*s**2+3*s+26


Input sentence: ( - 7 * j - 1 4 ) * ( j + 5 )
Decoded sentence: -7*j**2-49*j-70


Input sentence: 5 * j * ( j + 1 3 )
Decoded sentence: 5*j**2+65*j


Input sentence: ( 2 2 - 3 * c ) * ( c + 6 )
Decoded sentence: -3*c**2+4*c+132


Input sentence: ( 1 4 - 5 * t ) * ( t + 2 5 )
Decoded sentence: -5*t**2-111*t+350


Input sentence: ( 7 * s - 2 6 ) * ( 8 * s + 2 7 )
Decoded sentence: 56*s**2-15*s-702


Input sentence: ( - o - 2 5 ) * ( o - 8 )
Decoded sentence: -o**2-17*o+200


Input sentence: ( - o - 2 6 ) * ( -