In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [2]:
good = pd.read_csv('./good.csv')
bad = pd.read_csv('./bad.csv')

dataset = pd.concat([good, bad], axis=0)
dataset

Unnamed: 0,sentence,label
0,[ NUMERO NUMERO NUMERO IDENTIF NUMERO IDENTIF ...,1
1,IDENTIF + IDENTIF * [ IDENTIF IDENTIF IDENTIF ...,1
2,[ IDENTIF ] / NUMERO / [ NUMERO NUMERO * ] + (...,1
3,[ IDENTIF NUMERO / IDENTIF * ] - IDENTIF - NUM...,1
4,IDENTIF + [ IDENTIF NUMERO IDENTIF - NUMERO NU...,1
...,...,...
9995,( IDENTIF - [ NUMERO ] ) * ( ( IDENTIF ) ) - (...,0
9996,( NUMERO - ( [ IDENTIF ] ) / NUMERO - [ NUMERO...,0
9997,[ IDENTIF IDENTIF - IDENTIF IDENTIF * NUMERO I...,0
9998,( IDENTIF + IDENTIF / [ IDENTIF ] / ( NUMERO )...,0


In [3]:
#https://medium.com/analytics-vidhya/one-hot-encoding-of-text-data-in-natural-language-processing-2242fefb2148
token_index = {}
#Create a counter for counting the number of key-value pairs in the token_length
counter = 0

# Select the elements of the samples which are the two sentences
for index, row in dataset.iterrows():
    for considered_word in row['sentence'].split():
        if considered_word not in token_index:
            
            # If the considered word is not present in the dictionary token_index, add it to the token_index
            # The index of the word in the dictionary begins from 1 
            token_index.update({considered_word : counter + 1}) 
            
            # updating the value of counter
            counter = counter + 1   

token_index

{'[': 1,
 'NUMERO': 2,
 'IDENTIF': 3,
 '/': 4,
 '+': 5,
 '*': 6,
 '-': 7,
 ']': 8,
 '(': 9,
 ')': 10}

In [4]:
def generate_onehot(size, index):
    z = [0 for x in range(size)]
    z[index] = 1
    # return np.asarray(z).astype(np.int32)
    return z

def encode_index(row, _token_index):
    return [_token_index[token] for token in row['sentence'].split()]

def encode_onehot(row, _token_index):
    length = len(_token_index)
    return [generate_onehot(length, _token_index[token]  - 1) for token in row['sentence'].split()]
    # return np.array([generate_onehot(length, _token_index[token]  - 1) for token in row['sentence'].split()])

dataset['encode_index'] = dataset.apply(lambda row: encode_index(row, token_index), axis=1)
dataset['encode'] = dataset.apply(lambda row: encode_onehot(row, token_index), axis=1)

dataset

Unnamed: 0,sentence,label,encode_index,encode
0,[ NUMERO NUMERO NUMERO IDENTIF NUMERO IDENTIF ...,1,"[1, 2, 2, 2, 3, 2, 3, 4, 4, 2, 2, 4, 2, 5, 6, ...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, ..."
1,IDENTIF + IDENTIF * [ IDENTIF IDENTIF IDENTIF ...,1,"[3, 5, 3, 6, 1, 3, 3, 3, 2, 2, 6, 6, 2, 2, 5, ...","[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
2,[ IDENTIF ] / NUMERO / [ NUMERO NUMERO * ] + (...,1,"[1, 3, 8, 4, 2, 4, 1, 2, 2, 6, 8, 5, 9, 1, 3, ...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, ..."
3,[ IDENTIF NUMERO / IDENTIF * ] - IDENTIF - NUM...,1,"[1, 3, 2, 4, 3, 6, 8, 7, 3, 7, 2, 4, 1, 2, 3, ...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, ..."
4,IDENTIF + [ IDENTIF NUMERO IDENTIF - NUMERO NU...,1,"[3, 5, 1, 3, 2, 3, 7, 2, 2, 4, 3, 7, 3, 5, 7, ...","[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, ..."
...,...,...,...,...
9995,( IDENTIF - [ NUMERO ] ) * ( ( IDENTIF ) ) - (...,0,"[9, 3, 7, 1, 2, 8, 10, 6, 9, 9, 3, 10, 10, 7, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, ..."
9996,( NUMERO - ( [ IDENTIF ] ) / NUMERO - [ NUMERO...,0,"[9, 2, 7, 9, 1, 3, 8, 10, 4, 2, 7, 1, 2, 3, 4,...","[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, ..."
9997,[ IDENTIF IDENTIF - IDENTIF IDENTIF * NUMERO I...,0,"[1, 3, 3, 7, 3, 3, 6, 2, 3, 3, 2, 2, 5, 4, 7, ...","[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, ..."
9998,( IDENTIF + IDENTIF / [ IDENTIF ] / ( NUMERO )...,0,"[9, 3, 5, 3, 4, 1, 3, 8, 4, 9, 2, 10, 7, 1, 3,...","[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, ..."


In [5]:
t_labels = tf.constant(dataset['label'], dtype=tf.int32)
t_sentences = tf.constant(dataset['sentence'], dtype=tf.string)
ds = tf.data.Dataset.from_tensor_slices((t_sentences, t_labels))
ds

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [6]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE, split='whitespace', standardize=None)
encoder.adapt(ds.map(lambda text, label: text))

In [7]:
encoder.get_vocabulary()

['', '[UNK]', 'NUMERO', 'IDENTIF', '-', '/', '*', '+', '(', ')', ']', '[']

In [33]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=512,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512)),
    tf.keras.layers.Dense(512, activation='elu'),
    tf.keras.layers.Dense(1)
])

In [28]:
sample_text = ('NUMERO + IDENTIF / NUMERO * IDENTIF * IDENTIF - NUMERO * IDENTIF + IDENTIF * IDENTIF * IDENTIF * NUMERO / NUMERO + IDENTIF - [ IDENTIF ] * NUMERO - IDENTIF * IDENTIF * NUMERO')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[0.00608886]


In [29]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [30]:
BUFFER_SIZE = 10000
BATCH_SIZE = 32

In [31]:

dsb = ds.shuffle(BUFFER_SIZE)
length = len(dsb)
test_ds_size = int(length * 0.2) # 20 percent of length of ds
train_ds = dsb.skip(test_ds_size).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_ds = dsb.take(test_ds_size).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [32]:
history = model.fit(train_ds, epochs=10,
                    validation_data=test_ds,
                    validation_steps=30)

Epoch 1/10


Epoch 2/10
Epoch 3/10
 44/500 [=>............................] - ETA: 1:17 - loss: 0.6763 - accuracy: 0.4403

KeyboardInterrupt: 