Instalando libs

In [None]:
!pip install --upgrade tensorflow tensorflow_hub keras_tuner pandas matplotlib==3.1.3

Importando libs

In [6]:
import tensorflow as tf
import tensorflow_hub as hub
import keras_tuner as kt
import pandas as pd
import matplotlib.pyplot as plt

Carregando dataset

In [7]:
# Base location of the CSV splits, the column names applied to every split,
# and the number of leading rows to discard when reading.
dataset_url = 'https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data'
dataset_columns = ['id', 'features', 'labels']
dataset_offset = 1


def read_dataset(dataset_name):
    """Load one CSV split into a DataFrame.

    Args:
        dataset_name: Path suffix appended verbatim to ``dataset_url``
            (callers in this notebook pass it with a leading '/').

    Returns:
        A DataFrame whose columns are named by ``dataset_columns``; the first
        ``dataset_offset`` rows of the file are skipped.
    """
    source = dataset_url + dataset_name
    return pd.read_csv(source, names = dataset_columns, skiprows = dataset_offset)

Criando os datasets de treinamento, teste e validação

In [None]:
# Load the training split; the trailing expression previews its first rows.
dataset_train = read_dataset(dataset_name = '/Constraint_Train.csv')
dataset_train.head(n = 5)

In [None]:
# Load the labelled test split; the trailing expression previews its first rows.
dataset_test = read_dataset(dataset_name = '/english_test_with_labels.csv')
dataset_test.head(n = 5)

In [None]:
# Load the validation split; the trailing expression previews its first rows.
dataset_val = read_dataset(dataset_name = '/Constraint_Val.csv')
dataset_val.head(n = 5)

Funções Auxiliares para utilizar label na classificação

In [11]:
def transform(label):
    """Encode a text label as an integer class.

    Returns 1 when ``label`` equals the string 'fake' and 0 for any other
    value (not only 'real').
    """
    if label == 'fake':
        return 1
    return 0

def inverse_transform(value):
    """Decode a numeric score back to a text label.

    Scores greater than or equal to 0.5 map to 'fake'; everything else
    maps to 'real'.
    """
    if value >= 0.5:
        return 'fake'
    return 'real'


Pré-processamento

In [None]:
def _prepare_split(split):
    """Return a copy of ``split`` without the 'id' column and with encoded labels.

    The function is idempotent, so the cell can be re-run safely: the
    original in-place version raised KeyError on a second ``pop('id')``.

    Args:
        split: DataFrame with a 'labels' column and, optionally, an 'id' column.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    # errors = 'ignore' turns a repeated run into a no-op instead of a KeyError.
    prepared = split.drop(columns = 'id', errors = 'ignore')

    # Encode only while the labels are still text. transform() maps every value
    # other than the string 'fake' to 0, so re-encoding integers would silently
    # turn every label into 0.
    if not pd.api.types.is_numeric_dtype(prepared['labels']):
        prepared['labels'] = [transform(label) for label in prepared['labels']]

    return prepared


dataset_train = _prepare_split(dataset_train)
dataset_test = _prepare_split(dataset_test)
dataset_val = _prepare_split(dataset_val)

# Rich preview of the result instead of print()-ing the whole frame as text.
dataset_train.head()

Utilizando modelo pré-treinado como camada de incorporação de texto

In [None]:
# TF-Hub handle of the pre-trained text embedding module.
embedding_layer_name = 'https://tfhub.dev/google/nnlm-en-dim128/2'

# Wrap the module as a Keras layer that takes scalar strings; trainable = False
# keeps its weights frozen.
embedding_layer = hub.KerasLayer(
    embedding_layer_name,
    trainable = False,
    dtype = tf.string,
    input_shape = [],
)

# Sanity check: embed the training sentences; the result is the cell output.
embedding_layer(dataset_train['features'])

Definindo função para construir modelo

In [24]:
def build_model(hp):
    """Build and compile the binary classifier for one tuner trial.

    Architecture: the module-level ``embedding_layer``, one ReLU hidden Dense
    layer whose width is the tunable 'units' hyperparameter (8 to 64 in steps
    of 8), and a single sigmoid output unit.

    Args:
        hp: Hyperparameter container exposing ``Int`` (presumably a
            keras_tuner ``HyperParameters`` instance supplied by the tuner).

    Returns:
        The compiled ``tf.keras.Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    hidden_units = hp.Int('units', min_value = 8, max_value = 64, step = 8)

    model = tf.keras.Sequential([
        embedding_layer,
        tf.keras.layers.Dense(units = hidden_units, activation = 'relu'),
        tf.keras.layers.Dense(1, activation = 'sigmoid'),
    ])

    # The output layer already applies a sigmoid, hence from_logits = False.
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        loss = tf.keras.losses.BinaryCrossentropy(from_logits = False),
        metrics = ['accuracy'],
    )

    return model

Definindo tuner

In [None]:
# Hyperband search over build_model's hyperparameters, ranked by validation accuracy.
# NOTE(review): no directory / project_name / overwrite is passed, so the
# keras_tuner defaults apply -- confirm that is the intended behavior on re-runs.
tuner = kt.Hyperband(
    hypermodel = build_model,
    objective = 'val_accuracy',
    max_epochs = 50,
    factor = 3,
)

Definindo função para parar processamento caso validation loss não sofra redução (saturação da aprendizagem)

In [26]:
# Stop a run once the validation loss has not improved for 5 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    patience = 5,
)

Buscando melhores valores de parâmetros

In [None]:
# Run the hyperparameter search; positional arguments and keyword options are
# forwarded to each trial's training run.
tuner.search(
    dataset_train['features'],
    dataset_train['labels'],
    validation_data = (dataset_val['features'], dataset_val['labels']),
    epochs = 50,
    batch_size = 512,
    validation_batch_size = 512,
    callbacks = [ stop_early ],
    verbose = 1,
)

# Keep only the single best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

Treinando o modelo utilizando os melhores parâmetros

In [None]:
# Build a fresh model from the winning hyperparameters and train it for 30 epochs.
model = tuner.hypermodel.build(best_hps)

history = model.fit(
    x = dataset_train['features'],
    y = dataset_train['labels'],
    epochs = 30,
    batch_size = 512,
    validation_data = (dataset_val['features'], dataset_val['labels']),
    validation_batch_size = 512,
    verbose = 1,
)

Plotando resultados dos treinamentos

In [None]:
# Per-epoch metrics recorded by model.fit().
history_dict = history.history

loss = history_dict['loss']
val_loss = history_dict['val_loss']
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']

# 1-based epoch numbers for the x axis.
epochs = range(1, len(accuracy) + 1)

# Training loss as blue dots, validation loss as a red line.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(epochs, loss, 'bo', label = 'Training loss')
loss_ax.plot(epochs, val_loss, 'r', label = 'Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

plt.show()

In [None]:
# Training accuracy as blue dots, validation accuracy as a red line.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(epochs, accuracy, 'bo', label = 'Training accuracy')
acc_ax.plot(epochs, val_accuracy, 'r', label = 'Validation accuracy')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc = 'lower right')

plt.show()

Utilizando o modelo no dataset de teste

In [None]:
# Score the trained model on the held-out test split; the returned metrics
# are shown as the cell output.
model.evaluate(
    x = dataset_test['features'],
    y = dataset_test['labels'],
    verbose = 1,
)

Realizando previsões em sentenças

In [32]:
def predict(inputs):
    """Score each input with the module-level ``model`` and print the results.

    For every input, prints the input itself followed by the first element of
    its prediction row and the label that ``inverse_transform`` derives from it.

    Args:
        inputs: Iterable of inputs passed as-is to ``model.predict``.

    Returns:
        None; the results are only printed.
    """
    scores = model.predict(inputs)

    for sentence, row in zip(inputs, scores):
        score = row[0]
        print(f'Input: {sentence}')
        print(f'Output Score: {score} | Output Label: {inverse_transform(score)}')

In [None]:
# Hand-written sample claims that are scored and printed by predict() below.
inputs = [
    'The Chinese government announced that "garlic is a preventative food for the the novel coronavirus."',
    'Hydroxychloroquine is the cure for coronavirus.',
    'Mass disinfection of people using a chemical solution will eradicate COVID-19.',
    'The coronavirus was engineered by scientists in a lab.',
    'Practice social distancing to slow the spread of covid.',
    'Wear a mask in public to help prevent the virus.', # author's note: most sentences in the dataset that use the word "mask" are fake
    'Fever and difficulty breathing are symptoms of coronavirus.'
]

predict(inputs)