In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
import argparse
import datasets
import pandas
import transformers
import tensorflow as tf
import numpy

# Use the pretrained "bert-base-uncased" tokenizer. Only the tokenizer is
# loaded here; train() reads its vocab_size to size the embedding layer.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(examples):
    """Tokenize the "text" field of a batch of examples.

    Every text is truncated or padded to exactly 64 tokens. The tokenizer
    output (which includes the integer "input_ids" of each token) is
    returned as TensorFlow tensors.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=64,
        padding="max_length",
        return_tensors="tf",
    )
    return encoded

def train(model_path="model", train_path="/content/drive/MyDrive/Colab Notebooks/train.csv", dev_path="/content/drive/MyDrive/Colab Notebooks/dev.csv"):
    """Train a BiLSTM + attention multi-label classifier and checkpoint the best model.

    Args:
        model_path: Path prefix of the checkpoint; the best model is written
            to f"{model_path}.keras".
        train_path: CSV file with the training examples.
        dev_path: CSV file with the validation examples.

    Both CSVs must contain a "text" column (read by tokenize()). Every column
    after the first one is treated as a label and converted to float.

    Returns:
        None. Side effects: writes the checkpoint file and TensorBoard logs
        under "logs".
    """
    # Load the CSVs into HuggingFace datasets to allow tokenizer usage
    hf_dataset = datasets.load_dataset("csv", data_files={
        "train": train_path, "validation": dev_path
    })

    # Define labels as column names except the first (text column)
    labels = hf_dataset["train"].column_names[1:]

    def gather_labels(example):
        """Collect the label columns of one example into a list of floats."""
        return {"labels": [float(example[label]) for label in labels]}

    # Map the tokenizer and label gather functions
    hf_dataset = hf_dataset.map(gather_labels)
    hf_dataset = hf_dataset.map(tokenize, batched=True)

    # Convert HuggingFace datasets to TensorFlow datasets. Only "input_ids"
    # is fed to the model, matching its single input layer below.
    train_dataset = hf_dataset["train"].to_tf_dataset(
        columns=["input_ids"], label_cols="labels", batch_size=16, shuffle=True
    )
    dev_dataset = hf_dataset["validation"].to_tf_dataset(
        columns=["input_ids"], label_cols="labels", batch_size=16
    )

    # Define model architecture. The sequence length (64) must match the
    # max_length used in tokenize().
    # NOTE(review): padding positions are not masked (no mask_zero and no
    # attention_mask input), so they take part in the LSTM, the attention and
    # the average pooling - consider masking them.
    input_layer = tf.keras.Input(shape=(64,), dtype=tf.int32, name="input_ids")
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # and the sequence length is already fixed by the Input layer.
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=tokenizer.vocab_size, output_dim=128)(input_layer)
    lstm_output = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)
    )(embedding_layer)
    # Self-attention: the LSTM output serves as both query and value
    attention_output = tf.keras.layers.Attention()([lstm_output, lstm_output])
    attention_output = tf.keras.layers.LayerNormalization()(attention_output)
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(attention_output)
    dropout_layer = tf.keras.layers.Dropout(0.5)(pooled_output)
    dense_layer = tf.keras.layers.Dense(256, activation="relu")(dropout_layer)
    # One independent sigmoid per label (multi-label classification)
    output_layer = tf.keras.layers.Dense(len(labels), activation="sigmoid")(dense_layer)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.05),
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)]
    )

    # Define callbacks
    # Halve the learning rate when the validation loss stalls for 2 epochs
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=2, verbose=1
    )

    # Stop after 3 epochs without validation-loss improvement
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True, verbose=1
    )

    # The file on disk is selected by validation F1 (not by validation loss)
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=f"{model_path}.keras",
        monitor="val_f1_score",
        mode="max",
        save_best_only=True,
        verbose=1
    )
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir="logs", histogram_freq=1)

    # Train the model
    model.fit(
        train_dataset,
        validation_data=dev_dataset,
        epochs=10,
        callbacks=[early_stopping, lr_scheduler, model_checkpoint, tensorboard]
    )

# Run training with the default checkpoint prefix and data paths
train()

Epoch 1/10
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400ms/step - f1_score: 0.1027 - loss: 0.2864
Epoch 1: val_f1_score improved from -inf to 0.73891, saving model to model.keras
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m662s[0m 415ms/step - f1_score: 0.1028 - loss: 0.2864 - val_f1_score: 0.7389 - val_loss: 0.1981 - learning_rate: 0.0010
Epoch 2/10
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 405ms/step - f1_score: 0.7512 - loss: 0.1905
Epoch 2: val_f1_score improved from 0.73891 to 0.82719, saving model to model.keras
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m688s[0m 418ms/step - f1_score: 0.7512 - loss: 0.1905 - val_f1_score: 0.8272 - val_loss: 0.1715 - learning_rate: 0.0010
Epoch 3/10
[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 403ms/step - f1_score: 0.8245 - loss: 0.1708
Epoch 3: val_f1_score improved from 0.82719 to 0.83020, saving model to model.keras
[1m1575/1575[0m

In [None]:
def predict(model_path="/content/model", input_path="/content/drive/MyDrive/Colab Notebooks/dev.csv"):
    """Predict labels for a CSV file and write them to a zipped submission CSV.

    Args:
        model_path: Path prefix of the saved model; f"{model_path}.keras" is
            loaded.
        input_path: CSV file to predict on. It must contain a "text" column
            (read by tokenize()); every column after the first one is
            overwritten with the 0/1 predictions.

    Returns:
        None. Side effect: writes "submission_83.zip" (containing
        "submission_83.csv") to the current working directory.
    """
    # load the saved model
    model = tf.keras.models.load_model(f"{model_path}.keras")

    # load the data for prediction
    # use Pandas here to make assigning labels easier later
    df = pandas.read_csv(input_path)

    # create input features in the same way as in train(): the model has a
    # single "input_ids" input, so only that column is passed. Requesting
    # "attention_mask" as well yields a dict input that does not match the
    # model's input structure and makes Keras emit a structure warning.
    hf_dataset = datasets.Dataset.from_pandas(df)
    hf_dataset = hf_dataset.map(tokenize, batched=True)
    tf_dataset = hf_dataset.to_tf_dataset(
        columns=["input_ids"],
        batch_size=16)

    # generate predictions from model: threshold the sigmoid outputs at 0.5
    predictions = numpy.where(model.predict(tf_dataset) > 0.5, 1, 0)

    # assign predictions to label columns in Pandas data frame
    df.iloc[:, 1:] = predictions

    # write the Pandas dataframe to a zipped CSV file
    df.to_csv("submission_83.zip", index=False, compression=dict(
        method='zip', archive_name='submission_83.csv'))
# Generate predictions with the default model and input paths
predict()

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

Expected: input_ids
Received: inputs=['Tensor(shape=(16, 64))']


[1m196/197[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 81ms/step

Expected: input_ids
Received: inputs=['Tensor(shape=(None, 64))']


[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 85ms/step
