In [1]:
import argparse
import datasets
import pandas
import transformers
import tensorflow as tf
import numpy
import matplotlib.pyplot as plt
import keras_tuner as kt

from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import (L1,L2) 
from tensorflow.keras.layers import (SimpleRNN, Dense, Conv1D, Conv2D, MaxPooling2D,
                                      Flatten, Bidirectional, LSTM, GRU, Embedding, 
                                      Dropout, GlobalMaxPooling1D, GlobalAveragePooling1D)

In [2]:
pip show tensorflow

Name: tensorflow
Version: 2.13.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: c:\users\hewyu\anaconda3\lib\site-packages
Requires: tensorflow-intel
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [2]:
# use the tokenizer from DistilRoBERTa
tokenizer = transformers.AutoTokenizer.from_pretrained("distilroberta-base")

def tokenize(examples):
    """Converts the text of each example to "input_ids", a sequence of integers
    representing 1-hot vectors for each token in the text"""
    return tokenizer(examples["text"], truncation=True, max_length=64,
                     padding="max_length")

model_path="model"
train_path="../graduate-project-data/train_10s.csv"
dev_path="../graduate-project-data/dev_10s.csv"

# load the CSVs into Huggingface datasets to allow use of the tokenizer
hf_dataset = datasets.load_dataset("csv", data_files={
    "train": train_path, "validation": dev_path})

# the labels are the names of all columns except the first
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Converts the label columns into a list of 0s and 1s"""
    # the float here is because F1Score requires floats
    return {"labels": [float(example[l]) for l in labels]}

# convert text and labels to format expected by model
hf_dataset = hf_dataset.map(gather_labels)
hf_dataset = hf_dataset.map(tokenize, batched=True)
#hf_dataset = hf_dataset.map(to_bow) # For Feed Forward NN

# convert Huggingface datasets to Tensorflow datasets
train_dataset = hf_dataset["train"].to_tf_dataset(
    columns="input_ids", # "input_bow" for FF
    label_cols="labels", 
    batch_size=16,
    shuffle=True)
dev_dataset = hf_dataset["validation"].to_tf_dataset(
    columns="input_ids", # input_bow for FF
    label_cols="labels",
    batch_size=16)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  if _pandas_api.is_sparse(col):


Generating validation split: 0 examples [00:00, ? examples/s]

  if _pandas_api.is_sparse(col):


Map:   0%|          | 0/2520 [00:00<?, ? examples/s]

Map:   0%|          | 0/315 [00:00<?, ? examples/s]

Map:   0%|          | 0/2520 [00:00<?, ? examples/s]

Map:   0%|          | 0/315 [00:00<?, ? examples/s]

In [14]:
# https://stackoverflow.com/questions/76109047/integrate-batch-size-in-keras-tuner
# https://keras.io/guides/keras_tuner/getting_started/#tune-model-training
class my_model:

    def model_builder(self, hp):
    
        # Bidiretional GRU model
        model = Sequential()
    
        hp_output_dim = hp.Choice('output_dim', values=[32, 64, 128, 256])
        hp_gru = hp.Choice('gru_units', values=[32, 64, 128, 256])
        hp_do1 = hp.Float('dropout_1', min_value=0.1, max_value=1, step=0.1)
        hp_d =  hp.Choice('dense', values=[32, 64, 128, 256])
        hp_do2 = hp.Float('dropout_2', min_value=0.1, max_value=1, step=0.1)
        #hp_layer = hp.Choice('layer_type', values=[Flatten(), GlobalMaxPooling1D(), GlobalAveragePooling1D()])
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        hp_thresh = hp.Float('threshold', min_value=0.1, max_value=1, step=0.1)
        self.hp_batch_size = hp.Choice('batch_size', values=[16, 32, 64])
    
        model.add(Embedding(input_dim=tokenizer.vocab_size, output_dim=hp_output_dim, input_length=64))
        model.add(Bidirectional(GRU(units=hp_gru, return_sequences=True), input_shape=(tokenizer.vocab_size,)))
        model.add(Dropout(rate=hp_do1)) 
        model.add(Dense(units=hp_d, activation='relu')) #model.add(Dense(64, activation='relu', kernel_regularizer=L2(0.01)))
        model.add(Dropout(rate=hp_do2))
        model.add(Flatten()) 
        model.add(Dense(len(labels), activation='sigmoid'))
    
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
            loss=tf.keras.losses.binary_crossentropy,
            metrics=[tf.keras.metrics.F1Score(average="micro", threshold=hp_thresh)])
      
        return model

    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=self.hp_batch_size,
            **kwargs,
        )

In [16]:
# Specify the objective with direction
objective = kt.Objective("val_f1_score", direction="max")

tuner = kt.Hyperband(model_builder,
                     objective=objective,
                     max_epochs=20,
                     factor=3,
                     overwrite=True,
                     directory='.',
                     project_name='hp')

In [17]:
tuner.search(train_dataset, 
                epochs=20, 
                #batch_size=hp_batch_size,
                validation_data=dev_dataset,
                callbacks=[tf.keras.callbacks.ModelCheckpoint(
                filepath=model_path,
                monitor="val_f1_score",
                mode="max",
                save_best_only=True),
                EarlyStopping(monitor='val_f1_score', mode='max', patience=3, restore_best_weights=True)])

Trial 30 Complete [00h 05m 11s]
val_f1_score: 0.5849056839942932

Best val_f1_score So Far: 0.7049808502197266
Total elapsed time: 02h 54m 28s


In [20]:
oracle_state = tuner.oracle.get_state()
trials = oracle_state['trials']
print("Number of trials executed:", len(trials))


KeyError: 'trials'

In [18]:
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
for key, value in best_hyperparameters.values.items():
    print(f"{key}: {value}")

Best Hyperparameters:
output_dim: 128
gru_units: 128
dropout_1: 0.1
dense: 64
dropout_2: 0.1
learning_rate: 0.001
threshold: 0.4
batch_size: 64
tuner/epochs: 3
tuner/initial_epoch: 0
tuner/bracket: 2
tuner/round: 0


In [34]:
train_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), TensorSpec(shape=(None, 7), dtype=tf.float32, name=None))>