In [1]:
import argparse
import datasets
import pandas
import transformers
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import keras_tuner as kt

from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import (L1,L2) 
from tensorflow.keras.layers import (SimpleRNN, Dense, Conv1D, Conv2D, MaxPooling2D,
                                      Flatten, Bidirectional, LSTM, GRU, Embedding, 
                                      Dropout, GlobalMaxPooling1D, GlobalAveragePooling1D)

In [2]:
pip show tensorflow

Name: tensorflow
Version: 2.13.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: c:\users\hewyu\anaconda3\lib\site-packages
Requires: tensorflow-intel
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [2]:
# use the tokenizer from DistilRoBERTa
tokenizer = transformers.AutoTokenizer.from_pretrained("distilroberta-base")

def tokenize(examples):
    """Converts the text of each example to "input_ids", a sequence of integers
    representing 1-hot vectors for each token in the text"""
    return tokenizer(examples["text"], truncation=True, max_length=64,
                     padding="max_length")

model_path="model"
train_path="../graduate-project-data/train_10s.csv"
dev_path="../graduate-project-data/dev_10s.csv"

# load the CSVs into Huggingface datasets to allow use of the tokenizer
hf_dataset = datasets.load_dataset("csv", data_files={
    "train": train_path, "validation": dev_path})

# the labels are the names of all columns except the first
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Converts the label columns into a list of 0s and 1s"""
    # the float here is because F1Score requires floats
    return {"labels": [float(example[l]) for l in labels]}

# convert text and labels to format expected by model
hf_dataset = hf_dataset.map(gather_labels)
hf_dataset = hf_dataset.map(tokenize, batched=True)
#hf_dataset = hf_dataset.map(to_bow) # For Feed Forward NN

# convert Huggingface datasets to Tensorflow datasets
train_dataset = hf_dataset["train"].to_tf_dataset(
    columns="input_ids", # "input_bow" for FF
    label_cols="labels", 
    batch_size=16,
    shuffle=True)
dev_dataset = hf_dataset["validation"].to_tf_dataset(
    columns="input_ids", # input_bow for FF
    label_cols="labels",
    batch_size=16)

In [3]:
# Class weight calculation
all_ld = np.array([])
for batch in train_dataset:
    input_data, label_data = batch
    ld = tf.convert_to_tensor(label_data, dtype=tf.float32)
    all_ld = np.vstack([all_ld, ld.numpy()]) if all_ld.size else ld.numpy()

# You can sum along axis 0 to get the count of each class
class_counts = np.sum(all_ld, axis=0)

# total number of samples in your training dataset
total = len(all_ld)

# Calculate class weights
#class_weights = {i: total / (2.0 * count) for i, count in enumerate(class_counts)}

# class_weights = {
# 0: total / (2.0 * class_counts[0]) * 3,  # Admiration
# 1: total / (2.0 * class_counts[1]) * 2, # Amusement
# 2: total / (2.0 * class_counts[2]), # Gratitude
# 3: total / (2.0 * class_counts[3]) * 3, # Love
# 4: total / (2.0 * class_counts[4]), # Pride
# 5: total / (2.0 * class_counts[5]), # Relief
# 6: total / (2.0 * class_counts[6]),  # Remorse
# }

#print(class_counts, total, class_weights)

In [4]:
# https://stackoverflow.com/questions/76109047/integrate-batch-size-in-keras-tuner
# https://keras.io/guides/keras_tuner/getting_started/#tune-model-training
class my_model:

    def model_builder(self, hp):
    
        # Bidiretional GRU model
        model = Sequential()

        # For bgru_h
        # hp_output_dim = hp.Choice('output_dim', values=[32, 64, 128, 256])
        # hp_gru = hp.Choice('gru_units', values=[32, 64, 128, 256])
        # hp_do1 = hp.Float('dropout_1', min_value=0.1, max_value=1, step=0.1)
        # hp_d =  hp.Choice('dense', values=[32, 64, 128, 256])
        # hp_do2 = hp.Float('dropout_2', min_value=0.1, max_value=1, step=0.1)
        # hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        # hp_thresh = hp.Float('threshold', min_value=0.1, max_value=1, step=0.1)
        # self.hp_batch_size = hp.Choice('batch_size', values=[16, 32, 64])

        # For cnn_w3h
        # hp_output_dim = hp.Choice('output_dim', values=[32, 64, 128, 256])
        # hp_conv1d = hp.Choice('conv1d_units', values=[32, 64, 128, 256])
        # hp_kernel = hp.Int('kernel', min_value=1, max_value=5, step=1)
        # hp_strides = hp.Choice('strides', values=[1,2,3])
        # hp_do = hp.Float('dropout_1', min_value=0.1, max_value=1, step=0.1)
        # hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        # hp_thresh = hp.Float('threshold', min_value=0.1, max_value=1, step=0.1)
        # self.hp_batch_size = hp.Choice('batch_size', values=[16, 32, 64])
    
        # model.add(Embedding(input_dim=tokenizer.vocab_size, output_dim=hp_output_dim, input_length=64))
        # model.add(Bidirectional(GRU(units=hp_gru, return_sequences=True), input_shape=(tokenizer.vocab_size,)))
        # model.add(Dropout(rate=hp_do1)) 
        # model.add(Dense(units=hp_d, activation='relu')) #model.add(Dense(64, activation='relu', kernel_regularizer=L2(0.01)))
        # model.add(Dropout(rate=hp_do2))
        # model.add(Flatten()) 
        # model.add(Dense(len(labels), activation='sigmoid'))

        # CNN model
        model = Sequential()
        model.add(Embedding(input_dim=tokenizer.vocab_size, output_dim=256, input_length=64))
        model.add(Conv1D(128, kernel_size=2, strides=1, activation='relu', input_shape=(tokenizer.vocab_size,)))
        model.add(GlobalMaxPooling1D())
        model.add(Dropout(0.1))
        model.add(Dense(len(labels), activation='sigmoid'))
    
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss=tf.keras.losses.binary_crossentropy,
            metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)])
      
        return model

    def fit(self, hp, *args, **kwargs):
        class_weights = {
            0: total / (2.0 * class_counts[0]) * hp.Int('admiration', min_value=1, max_value=5),
            1: total / (2.0 * class_counts[1]) * hp.Int('amusement', min_value=1, max_value=5),
            2: total / (2.0 * class_counts[2]) * hp.Int('gratitude', min_value=1, max_value=5),
            3: total / (2.0 * class_counts[3]) * hp.Int('love', min_value=1, max_value=5),
            4: total / (2.0 * class_counts[4]) * hp.Int('pride', min_value=1, max_value=5),
            5: total / (2.0 * class_counts[5]) * hp.Int('relief', min_value=1, max_value=5),
            6: total / (2.0 * class_counts[6]) * hp.Int('remorse', min_value=1, max_value=5),
        }
        model = self.model_builder(hp)
        return model.fit(
            *args,
             batch_size=32, #batch_size=self.hp_batch_size,
            class_weight=class_weights,
            **kwargs,
        )

In [5]:
my_model = my_model()
# Specify the objective with direction
objective = kt.Objective("val_f1_score", direction="max")

tuner = kt.Hyperband(my_model.model_builder,
                     objective=objective,
                     max_epochs=20,
                     factor=3,
                     overwrite=True,
                     directory='../',
                     project_name='cnn_w3h_etm5_1h')

In [6]:
tuner.search(train_dataset, 
                epochs=20, 
                #batch_size=hp_batch_size,
                validation_data=dev_dataset,
                callbacks=[tf.keras.callbacks.ModelCheckpoint(
                filepath=model_path,
                monitor="val_f1_score",
                save_best_only=True),
                EarlyStopping(monitor='val_f1_score', mode='max', patience=5, restore_best_weights=True)])

Trial 30 Complete [00h 00m 30s]
val_f1_score: 0.0

Best val_f1_score So Far: 0.7878788113594055
Total elapsed time: 00h 46m 09s


In [20]:
oracle_state = tuner.oracle.get_state()
trials = oracle_state['trials']
print("Number of trials executed:", len(trials))


KeyError: 'trials'

In [7]:
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
for key, value in best_hyperparameters.values.items():
    print(f"{key}: {value}")

Best Hyperparameters:
output_dim: 256
conv1d_units: 128
kernel: 2
strides: 1
dropout_1: 0.1
learning_rate: 0.001
threshold: 0.5
batch_size: 32
tuner/epochs: 20
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0


In [22]:

# Assuming your dataset variable is named 'my_dataset'
for batch in train_dataset:
    input_data, label_data = batch
    print("Input data:", input_data)
    print("Label data:", label_data)


Input data: tf.Tensor(
[[    0   574  1168 ...     1     1     1]
 [    0   100   437 ...     1     1     1]
 [    0 44604     4 ...     1     1     1]
 ...
 [    0  7516   127 ...     1     1     1]
 [    0 11979 20060 ...     1     1     1]
 [    0 22086    13 ...     1     1     1]], shape=(16, 64), dtype=int64)
Label data: tf.Tensor(
[[0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]], shape=(16, 7), dtype=float32)
Input data: tf.Tensor(
[[    0 31653    47 ...     1     1     1]
 [    0  2709    47 ...     1     1     1]
 [    0  6209    24 ...     1     1     1]
 ...
 [    0  6785 33664 ...     1     1     1]
 [    0 13987    47 ...     1     1     1]
 

In [39]:
all_ld = np.array([])
for batch in train_dataset:
    input_data, label_data = batch
    ld = tf.convert_to_tensor(label_data, dtype=tf.float32)
    all_ld = np.vstack([all_ld, ld.numpy()]) if all_ld.size else ld.numpy()

# You can sum along axis 0 to get the count of each class
class_counts = np.sum(all_ld, axis=0)

# total number of samples in your training dataset
total = len(all_ld)

# Calculate class weights
#class_weights = {i: total / (2.0 * count) for i, count in enumerate(class_counts)}
class_weights = {
    0: total / (2.0 * class_counts[0]) * 2,  # Admiration
    1: total / (2.0 * class_counts[1]), # Amusement
    2: total / (2.0 * class_counts[2]), # Gratitude
    3: total / (2.0 * class_counts[3]) * 2, # Love
    4: total / (2.0 * class_counts[4]), # Pride
    5: total / (2.0 * class_counts[5]), # Relief
    6: total / (2.0 * class_counts[6]),  # Remorse
}

In [38]:
class_counts, total, class_weights

(array([420., 201., 264., 210.,   9.,  18.,  57.], dtype=float32),
 2520,
 {0: 3.0,
  1: 6.268656716417911,
  2: 4.7727272727272725,
  3: 6.0,
  4: 140.0,
  5: 70.0,
  6: 22.105263157894736})

In [40]:
class_weights

{0: 6.0,
 1: 6.268656716417911,
 2: 4.7727272727272725,
 3: 12.0,
 4: 140.0,
 5: 70.0,
 6: 22.105263157894736}