In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [6]:
import datasets
import pandas
import transformers
import tensorflow as tf
import numpy as np

# Module-level tokenizer for the "bert-base-uncased" checkpoint. It is read as a
# global by `tokenize` below and is saved next to the encoder at the end of `train`.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(examples):
    """Encode the ``"text"`` field of a batch with the module-level tokenizer.

    Every sequence is truncated or padded to exactly 64 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        A dict holding only the ``"input_ids"`` and ``"attention_mask"``
        entries produced by the tokenizer.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=64,
        padding="max_length",
    )
    # Forward just the two model inputs; anything else the tokenizer returns is dropped.
    return {key: encoded[key] for key in ("input_ids", "attention_mask")}

def train(model_path="model",
          train_path="/content/drive/MyDrive/Colab Notebooks/train.csv",
          dev_path="/content/drive/MyDrive/Colab Notebooks/dev.csv",
          epochs=5,
          batch_size=16,
          learning_rate=2e-5,
          seed=None):
    """Fine-tune a partially frozen BERT encoder for multi-label classification.

    The CSV files must have ``text`` as their first column; every remaining
    column is treated as one binary label.

    Args:
        model_path: Directory that receives the saved artifacts
            (``<model_path>/bert`` and ``<model_path>/custom_head.weights.h5``).
        train_path: CSV file used for training.
        dev_path: CSV file used for validation.
        epochs: Maximum number of training epochs (early stopping may end sooner).
        batch_size: Batch size for both the training and validation datasets.
        learning_rate: Initial learning rate for the Adam optimizer.
        seed: Optional integer. When given, it is passed to
            ``tf.keras.utils.set_random_seed`` before any data or model is
            created so that shuffling and dropout are repeatable.

    Raises:
        ValueError: If the training CSV does not start with a ``text`` column
            or contains no label columns.
    """
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)

    # Load dataset
    hf_dataset = datasets.load_dataset("csv", data_files={
        "train": train_path,
        "validation": dev_path
    })

    # Fail early with a clear message instead of a KeyError / float() error
    # deep inside Dataset.map when the file layout is not what we expect.
    column_names = hf_dataset["train"].column_names
    if not column_names or column_names[0] != "text":
        raise ValueError(
            f"Expected the first column of {train_path!r} to be 'text', "
            f"found columns {column_names!r}"
        )
    labels = column_names[1:]
    if not labels:
        raise ValueError(f"No label columns found in {train_path!r}")

    def gather_labels(example):
        # Collapse the per-label columns into a single float vector.
        return {"labels": [float(example[l]) for l in labels]}

    hf_dataset = hf_dataset.map(gather_labels)
    hf_dataset = hf_dataset.map(tokenize, batched=True)

    train_dataset = hf_dataset["train"].to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols="labels",
        batch_size=batch_size,
        shuffle=True
    )
    dev_dataset = hf_dataset["validation"].to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols="labels",
        batch_size=batch_size
    )

    # Load BERT backbone
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")

    # Freeze embeddings
    bert_model.bert.embeddings.trainable = False

    # Freeze encoder layers 0-8, fine-tune 9-11
    first_trainable_layer = 9
    for i, layer in enumerate(bert_model.bert.encoder.layer):
        layer.trainable = i >= first_trainable_layer

    class BertMultiLabelClassifier(tf.keras.Model):
        """BERT encoder followed by a pooled, sigmoid-activated label head."""

        def __init__(self, bert_model, num_labels):
            super().__init__()
            self.bert = bert_model
            self.pooling = tf.keras.layers.GlobalAveragePooling1D()
            self.norm = tf.keras.layers.LayerNormalization()
            self.dropout = tf.keras.layers.Dropout(0.4)
            self.dense = tf.keras.layers.Dense(256, activation="relu",
                                               kernel_regularizer=tf.keras.regularizers.l2(0.0005))
            self.out = tf.keras.layers.Dense(num_labels, activation="sigmoid")

        def call(self, inputs):
            x = self.bert(inputs)[0]  # last_hidden_state
            # NOTE(review): this Dropout is instantiated on every call instead
            # of once in __init__. It holds no weights, and it is left as-is so
            # the layer layout matches the classifier rebuilt at prediction time.
            x = tf.keras.layers.Dropout(0.3)(x)
            x = self.pooling(x)
            x = self.norm(x)
            x = self.dropout(x)
            x = self.dense(x)
            return self.out(x)

    # Initialize model
    model = BertMultiLabelClassifier(bert_model, num_labels=len(labels))

    # Compile after the trainable flags above have been set.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.01),
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)]
    )

    # Callbacks
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2)
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir="logs")

    # Train
    model.fit(
        train_dataset,
        validation_data=dev_dataset,
        epochs=epochs,
        callbacks=[early_stopping, lr_scheduler, tensorboard]
    )

    # Save the encoder and tokenizer in Hugging Face format, then the weights
    # of the whole Keras model (the `self.bert` sub-model included) in one file.
    bert_model.save_pretrained(f"{model_path}/bert")
    tokenizer.save_pretrained(f"{model_path}/bert")
    model.save_weights(f"{model_path}/custom_head.weights.h5")

    print("Training complete")


In [7]:
# Run training with the default arguments: the CSVs are read from the Drive
# paths in train()'s signature and the artifacts are written under "model".
train()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




✅ Training complete. Model saved to model


In [10]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import TFBertModel, AutoTokenizer
import tensorflow as tf

# Define the same classifier used during training
class BertMultiLabelClassifier(tf.keras.Model):
    """Multi-label classifier: BERT encoder plus a small dense sigmoid head.

    The layer attributes are created in the same order and under the same
    names as in the training cell, so that the weights saved there can be
    loaded into an instance of this class.

    Args:
        bert_model: Encoder whose first output is used as the per-token
            representation.
        num_labels: Width of the final sigmoid layer (one unit per label).
    """

    def __init__(self, bert_model, num_labels):
        super().__init__()
        keras_layers = tf.keras.layers
        self.bert = bert_model
        self.pooling = keras_layers.GlobalAveragePooling1D()
        self.norm = keras_layers.LayerNormalization()
        self.dropout = keras_layers.Dropout(0.4)
        self.dense = keras_layers.Dense(
            256,
            activation="relu",
            kernel_regularizer=tf.keras.regularizers.l2(0.0005),
        )
        self.out = keras_layers.Dense(num_labels, activation="sigmoid")

    def call(self, inputs):
        """Return one sigmoid score per label for each example in ``inputs``."""
        token_states = self.bert(inputs)[0]
        # NOTE(review): a fresh Dropout layer is built on each call here, exactly
        # as in the training-time definition; it is kept to mirror that definition.
        token_states = tf.keras.layers.Dropout(0.3)(token_states)
        pooled = self.norm(self.pooling(token_states))
        hidden = self.dense(self.dropout(pooled))
        return self.out(hidden)

def predict(model_path="model",
            input_path="/content/drive/MyDrive/Colab Notebooks/dev.csv",
            output_path="submission_85.zip",
            threshold=0.5,
            max_length=64,
            batch_size=16):
    """Score a CSV with the saved model and write thresholded predictions to a zip.

    The input CSV must have ``text`` as its first column; every remaining
    column is treated as a label column and is overwritten with 0/1
    predictions in the output.

    Args:
        model_path: Directory containing ``bert/`` (encoder + tokenizer) and
            ``custom_head.weights.h5``.
        input_path: CSV file to score.
        output_path: Destination zip archive. The CSV stored inside it is named
            after this file with a ``.csv`` suffix.
        threshold: Probabilities strictly above this value become 1, others 0.
        max_length: Token length every text is truncated or padded to.
        batch_size: Batch size used for inference.

    Raises:
        ValueError: If the CSV does not start with a ``text`` column or has no
            label columns.
    """
    # Local import: pathlib is not part of this notebook's import cells.
    from pathlib import Path

    # Load tokenizer and base BERT model
    tokenizer = AutoTokenizer.from_pretrained(f"{model_path}/bert")
    bert_model = TFBertModel.from_pretrained(f"{model_path}/bert")

    # Load input data
    df = pd.read_csv(input_path)
    text_column = "text"

    # Predictions are written back by position (every column after the first),
    # while the text is read by name. Enforce the layout so that a differently
    # ordered file cannot silently get its text column overwritten.
    if len(df.columns) == 0 or df.columns[0] != text_column:
        raise ValueError(
            f"Expected the first column of {input_path!r} to be {text_column!r}, "
            f"found columns {list(df.columns)!r}"
        )
    num_labels = df.shape[1] - 1
    if num_labels < 1:
        raise ValueError(f"No label columns found in {input_path!r}")

    # Tokenize using HuggingFace Dataset
    def tokenize(example):
        tokens = tokenizer(example[text_column], padding="max_length",
                           truncation=True, max_length=max_length)
        return {"input_ids": tokens["input_ids"], "attention_mask": tokens["attention_mask"]}

    hf_dataset = Dataset.from_pandas(df)
    hf_dataset = hf_dataset.map(tokenize, batched=True)

    tf_dataset = hf_dataset.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        batch_size=batch_size,
        shuffle=False
    )

    # Rebuild the classifier
    model = BertMultiLabelClassifier(bert_model, num_labels)

    # Call the model once so its variables exist before weights are loaded.
    dummy_inputs = {
        "input_ids": tf.zeros((1, max_length), dtype=tf.int32),
        "attention_mask": tf.zeros((1, max_length), dtype=tf.int32)
    }
    model(dummy_inputs)

    # Load the weights saved by train(). Despite the file name, they were
    # written with save_weights on the whole model, not only the head.
    model.load_weights(f"{model_path}/custom_head.weights.h5")

    # Predict
    probs = model.predict(tf_dataset)
    predictions = np.where(probs > threshold, 1, 0)

    # Replace label columns with predictions
    df.iloc[:, 1:] = predictions

    # Save to zip; with the default output_path the inner file is submission_85.csv.
    archive_name = Path(output_path).with_suffix(".csv").name
    df.to_csv(output_path, index=False, compression=dict(
        method='zip', archive_name=archive_name))

    print("Predictions written")



In [11]:
# Score the default input CSV with the artifacts saved under "model"; the
# zipped predictions are written to submission_85.zip in the working directory.
predict()

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at model/bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Map:   0%|          | 0/3149 [00:00<?, ? examples/s]



✅ Predictions written to submission_85.zip
