In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
# Add functions path
sys.path.append('../Functions')
random_state=42

In [2]:
from datasets import load_stratified_dataset
df = load_stratified_dataset(path='../Datasets/full_good_bad_dataset.csv', labels='label', samples_per_label=2500)

In [3]:
from text_lemmatization import Lemmatizer
lemmatizer = Lemmatizer()
df.text = lemmatizer.lem_list(df.text)

In [4]:
from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = random_state)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.15, random_state=random_state)

In [5]:
print(f'Shape train: {X_train.shape[0]} rows')
print(f'Shape test: {X_test.shape[0]} rows')
#print(f'Shape val: {X_val.shape[0]} rows')

Shape train: 3750 rows
Shape test: 1250 rows


In [6]:
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(X)
n_words=len(t.word_index)
print(f'There are {n_words} different words in the dataset')

Using TensorFlow backend.


There are 113709 different words in the dataset


In [7]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
#val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

In [8]:
BUFFER_SIZE = 10000
BATCH_SIZE = 32

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [9]:
VOCAB_SIZE=n_words
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [11]:
from tensorflow.keras import regularizers
from tensorflow.keras import layers
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=n_words,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
        trainable=True
    ),
    tf.keras.layers.GlobalMaxPool1d(),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])

AttributeError: module 'tensorflow_core.keras.layers' has no attribute 'GlobalMaxPool1d'

In [None]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset, 
                    validation_steps=10)

In [None]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])
plt.figure(figsize=(16,8))
plt.subplot(1,2,1)
plot_graphs(history, 'accuracy')
plt.ylim(None,1)
plt.subplot(1,2,2)
plot_graphs(history, 'loss')
plt.ylim(0,None)