In [None]:
# Install the HuggingFace `datasets` package, which the next cell imports.
!pip install datasets

In [None]:
import argparse
import datasets
import pandas
import transformers
import tensorflow as tf
import numpy

# Use the pretrained tokenizer that ships with BERT ("bert-base-uncased").
# It is shared by tokenize(), and train() reads its vocab_size to size the
# embedding layer.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(examples):
    """Tokenize the "text" field of a batch of examples.

    Every text is truncated or padded to exactly 64 tokens and the encoding
    is requested as TensorFlow tensors. The result includes "input_ids", the
    sequence of integer token ids for each text.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
        return_tensors="tf",
    )
    return encoded

def train(model_path="model",
          train_path="/content/drive/MyDrive/Colab Notebooks/train.csv",
          dev_path="/content/drive/MyDrive/Colab Notebooks/dev.csv"):
    """Train a BiLSTM + attention multi-label classifier and checkpoint it.

    Args:
        model_path: Path prefix of the saved model. The checkpoint with the
            best validation F1 is written to ``f"{model_path}.keras"``.
        train_path: CSV file with the training data. The text is read from
            the "text" column (expected to be the first column); every column
            after the first is treated as a 0/1 label.
        dev_path: CSV file with the validation data, in the same layout.

    Returns:
        None. The trained model is persisted by the checkpoint callback and
        TensorBoard logs are written to the "logs" directory.
    """
    # Load the CSVs into HuggingFace datasets to allow tokenizer usage
    hf_dataset = datasets.load_dataset("csv", data_files={
        "train": train_path, "validation": dev_path
    })

    # Define labels as column names except the first (text column)
    labels = hf_dataset["train"].column_names[1:]

    def gather_labels(example):
        """Collect the label columns of one example into a list of floats."""
        return {"labels": [float(example[label]) for label in labels]}

    # Map the label gather and tokenizer functions over both splits
    hf_dataset = hf_dataset.map(gather_labels)
    hf_dataset = hf_dataset.map(tokenize, batched=True)

    # Convert HuggingFace datasets to TensorFlow datasets. Only "input_ids"
    # is fed to the model; predict() must feed the same single column.
    train_dataset = hf_dataset["train"].to_tf_dataset(
        columns=["input_ids"], label_cols="labels", batch_size=16, shuffle=True
    )
    dev_dataset = hf_dataset["validation"].to_tf_dataset(
        columns=["input_ids"], label_cols="labels", batch_size=16
    )

    # Define model architecture. The sequence length (64) must match the
    # max_length used in tokenize().
    input_layer = tf.keras.Input(shape=(64,), dtype=tf.int32, name="input_ids")
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # and the Input layer above already fixes the sequence length.
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=tokenizer.vocab_size, output_dim=128)(input_layer)
    lstm_output = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)
    )(embedding_layer)
    # Self-attention: the LSTM output is used as both query and value
    attention_output = tf.keras.layers.Attention()([lstm_output, lstm_output])
    attention_output = tf.keras.layers.LayerNormalization()(attention_output)  # trial
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(attention_output)
    dropout_layer = tf.keras.layers.Dropout(0.5)(pooled_output)
    # trial; TODO: add L2 regularization (0.0005) to this dense layer
    dense_layer = tf.keras.layers.Dense(256, activation="relu")(dropout_layer)
    # One independent sigmoid per label (multi-label classification)
    output_layer = tf.keras.layers.Dense(len(labels), activation="sigmoid")(dense_layer)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        # TODO: try label_smoothing=0.05
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.1),
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)]
    )

    # Alternative (not used here): fine-tune
    # transformers.TFBertForSequenceClassification("bert-base-uncased") with
    # Adam(learning_rate=2e-5) and the same loss/metric.

    # Define callbacks
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True, verbose=1
    )
    # The LR patience must be smaller than the early-stopping patience on the
    # same monitor; with equal values training stops at the very epoch the
    # learning rate would be reduced, so the reduction could never help.
    # TODO: try cosine or exponential LR for smoother rate decay
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=2, verbose=1
    )
    # Note: the checkpoint keeps the epoch with the best validation F1, while
    # early stopping tracks validation loss.
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=f"{model_path}.keras",
        monitor="val_f1_score",
        mode="max",
        save_best_only=True,
        verbose=1
    )
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir="logs", histogram_freq=1)

    # Train the model
    model.fit(
        train_dataset,
        validation_data=dev_dataset,
        epochs=10,
        callbacks=[early_stopping, lr_scheduler, model_checkpoint, tensorboard]
    )

# Run training with the default paths; the best checkpoint is written to
# "model.keras" (relative to the current working directory).
train()

In [None]:
def predict(model_path="/content/model",
            input_path="/content/drive/MyDrive/Colab Notebooks/dev.csv",
            output_path="submission_83.zip",
            archive_name="submission_83.csv"):
    """Predict labels for a CSV file and write them to a zipped CSV.

    Args:
        model_path: Path prefix of the saved model; the model is loaded from
            ``f"{model_path}.keras"``.
        input_path: CSV file to label. The text is read from the "text"
            column; every column after the first is overwritten with the
            predicted 0/1 labels.
        output_path: Path of the zip archive to write.
        archive_name: Name of the CSV file stored inside the archive.

    Raises:
        ValueError: If the number of model outputs does not match the number
            of label columns in the input file.
    """
    # load the saved model
    model = tf.keras.models.load_model(f"{model_path}.keras")

    # load the data for prediction
    # use Pandas here to make assigning labels easier later
    df = pandas.read_csv(input_path)

    # create input features in the same way as in train(): the model has a
    # single input, "input_ids", so only that column is fed to it
    hf_dataset = datasets.Dataset.from_pandas(df)
    hf_dataset = hf_dataset.map(tokenize, batched=True)
    tf_dataset = hf_dataset.to_tf_dataset(columns=["input_ids"], batch_size=16)

    # generate predictions from model; 0.5 matches the F1 threshold used
    # during training
    probabilities = model.predict(tf_dataset)
    predictions = numpy.where(probabilities > 0.5, 1, 0)

    # fail early with a clear message if the file layout does not match the
    # model instead of relying on a pandas broadcasting error
    n_label_columns = df.shape[1] - 1
    if predictions.shape[-1] != n_label_columns:
        raise ValueError(
            f"model produced {predictions.shape[-1]} outputs per example but "
            f"{input_path!r} has {n_label_columns} label columns"
        )

    # assign predictions to label columns in Pandas data frame
    df.iloc[:, 1:] = predictions

    # write the Pandas dataframe to a zipped CSV file
    df.to_csv(output_path, index=False, compression=dict(
        method='zip', archive_name=archive_name))
# Label the default dev set with the saved model and write submission_83.zip.
predict()