In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [2]:
# Read the two CSV files and stack them row-wise into one frame
# (rows from good.csv first, then rows from bad.csv). Row indices are kept
# as read, so they restart from 0 for the second file.
good, bad = (pd.read_csv(csv_path) for csv_path in ('./good.csv', './bad.csv'))

dataset = pd.concat((good, bad), axis='index')
dataset

Unnamed: 0,sentence,label
0,[ NUMERO NUMERO NUMERO IDENTIF NUMERO IDENTIF ...,1
1,IDENTIF + IDENTIF * [ IDENTIF IDENTIF IDENTIF ...,1
2,[ IDENTIF ] / NUMERO / [ NUMERO NUMERO * ] + (...,1
3,[ IDENTIF NUMERO / IDENTIF * ] - IDENTIF - NUM...,1
4,IDENTIF + [ IDENTIF NUMERO IDENTIF - NUMERO NU...,1
...,...,...
9995,( IDENTIF - [ NUMERO ] ) * ( ( IDENTIF ) ) - (...,0
9996,( NUMERO - ( [ IDENTIF ] ) / NUMERO - [ NUMERO...,0
9997,[ IDENTIF IDENTIF - IDENTIF IDENTIF * NUMERO I...,0
9998,( IDENTIF + IDENTIF / [ IDENTIF ] / ( NUMERO )...,0


In [3]:
# Turn the 'sentence' and 'label' columns into constant tensors, then slice
# them in lockstep so every dataset element is a (sentence, label) pair.
t_sentences = tf.constant(value=dataset['sentence'], dtype=tf.string)
t_labels = tf.constant(value=dataset['label'], dtype=tf.int32)
ds = tf.data.Dataset.from_tensor_slices(tensors=(t_sentences, t_labels))
ds

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [4]:
# Upper bound on the vocabulary size the vectorizer may learn.
VOCAB_SIZE = 1000

# Sentences are already space-separated token streams, so split on
# whitespace only and skip any standardization (no lowercasing or
# punctuation stripping -- the operators and brackets are tokens themselves).
encoder = tf.keras.layers.TextVectorization(
    standardize=None,
    split='whitespace',
    max_tokens=VOCAB_SIZE,
)

# The vocabulary is learned from the sentences alone; labels are dropped.
sentences_only = ds.map(lambda sentence, _label: sentence)
encoder.adapt(sentences_only)

In [5]:
# Show the vocabulary learned by `encoder.adapt(...)`. Its length is what
# sizes the Embedding layer's input dimension in the model cell.
encoder.get_vocabulary()

['', '[UNK]', 'NUMERO', 'IDENTIF', '-', '/', '*', '+', '(', ')', ']', '[']

In [6]:
# Binary sequence classifier, assembled layer by layer:
#   raw string -> token ids -> embeddings -> 2 stacked BiLSTMs -> MLP head.
keras_layers = tf.keras.layers
vocabulary = encoder.get_vocabulary()

model = tf.keras.Sequential()
# Text goes in as raw strings; the adapted vectorizer maps it to token ids.
model.add(encoder)
# mask_zero=True makes downstream layers ignore index 0 (padding).
model.add(keras_layers.Embedding(
    input_dim=len(vocabulary), output_dim=64, mask_zero=True))
# The first recurrent layer returns the whole sequence so it can be stacked.
model.add(keras_layers.Bidirectional(
    keras_layers.LSTM(units=64, return_sequences=True)))
model.add(keras_layers.Bidirectional(keras_layers.LSTM(units=32)))
model.add(keras_layers.Dense(units=64, activation='relu'))
model.add(keras_layers.Dropout(rate=0.5))
# Single output unit with no activation: the model emits a raw logit.
model.add(keras_layers.Dense(units=1))

In [7]:
# Smoke test: push one sentence through the still-untrained model to check
# that the string -> logit pipeline is wired up end to end.
sample_text = 'NUMERO + IDENTIF / NUMERO * IDENTIF * IDENTIF - NUMERO * IDENTIF + IDENTIF * IDENTIF * IDENTIF * NUMERO / NUMERO + IDENTIF - [ IDENTIF ] * NUMERO - IDENTIF * IDENTIF * NUMERO'
sample_batch = np.array([sample_text])  # batch containing a single sentence
predictions = model.predict(sample_batch)
print(predictions[0])

[0.01555124]


In [8]:
# The model's last layer is Dense(1) with no activation, so it outputs raw
# logits; the loss is told so via from_logits=True.
#
# The string metric 'accuracy' would resolve to binary accuracy with its
# default 0.5 cut-off applied directly to those logits, scoring any logit in
# (0, 0.5] as class 0 even though sigmoid(logit) > 0.5. Use an explicit
# BinaryAccuracy thresholded at 0.0 instead (sigmoid(0) == 0.5). The metric
# keeps the name 'accuracy' so log output and history keys are unchanged.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [15]:
# Number of examples per batch fed to the model.
BATCH_SIZE = 8
# Capacity of the buffer handed to `Dataset.shuffle`.
BUFFER_SIZE = 20_000

In [16]:

# Train/test split.
#
# The split must be taken from ONE fixed permutation of the data. By default
# `Dataset.shuffle` reshuffles every time the dataset is iterated, so
# `skip()` and `take()` would each see a different order (and a new one every
# epoch): train and test would overlap, and validation examples would leak
# into training. `reshuffle_each_iteration=False` freezes the order; the seed
# makes the split reproducible across kernel restarts.
SPLIT_SEED = 42
dsb = ds.shuffle(BUFFER_SIZE, seed=SPLIT_SEED, reshuffle_each_iteration=False)
length = len(dsb)
test_ds_size = int(length * 0.2)  # hold out 20 percent of the examples

# Only the training subset is reshuffled at every epoch.
train_ds = (
    dsb.skip(test_ds_size)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_ds = dsb.take(test_ds_size).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [17]:
# Train for 10 epochs. After each epoch, validation runs on only the first
# 30 batches of `test_ds` (validation_steps), not the whole held-out set.
fit_options = dict(
    epochs=10,
    validation_data=test_ds,
    validation_steps=30,
)
history = model.fit(train_ds, **fit_options)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
 165/2000 [=>............................] - ETA: 1:06 - loss: 0.6759 - accuracy: 0.5038

KeyboardInterrupt: 