In [1]:
!pip uninstall -y tensorflow keras protobuf tokenizers transformers datasets
!pip install tensorflow==2.15.0
!pip install transformers==4.40.0 datasets==2.19.0 tokenizers==0.15.2
!pip install protobuf==3.20.3
!pip install sentencepiece scikit-learn


Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
Found existing installation: keras 3.8.0
Uninstalling keras-3.8.0:
  Successfully uninstalled keras-3.8.0
Found existing installation: protobuf 6.33.0
Uninstalling protobuf-6.33.0:
  Successfully uninstalled protobuf-6.33.0
Found existing installation: tokenizers 0.21.2
Uninstalling tokenizers-0.21.2:
  Successfully uninstalled tokenizers-0.21.2
Found existing installation: transformers 4.53.3
Uninstalling transformers-4.53.3:
  Successfully uninstalled transformers-4.53.3
Found existing installation: datasets 4.4.1
Uninstalling datasets-4.4.1:
  Successfully uninstalled datasets-4.4.1
Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manyl

# Create project structure

In [2]:
import os

# Root folder under which the project directories are created.
BASE_DIR = "/kaggle/working/mepe"

# Sub-folders, relative to BASE_DIR, that make up the project layout.
DIRS = ["notebooks", "data", "models/text_emotion", "src", "demo"]

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for subdir in DIRS:
    target = os.path.join(BASE_DIR, subdir)
    os.makedirs(target, exist_ok=True)

# Echo the root path as the cell output.
BASE_DIR


'/kaggle/working/mepe'

# Global config

In [3]:
# Shared settings for the notebook, collected in a single dict.
# TEXT_MODEL is the checkpoint name handed to the Hugging Face loaders;
# the remaining entries are presumably training hyperparameters consumed by
# later cells — confirm against their usage.
CONFIG = dict(
    TEXT_MODEL="distilbert-base-uncased",
    MAX_LEN=96,
    BATCH_SIZE=32,
    LR=2e-5,
    EPOCHS=3,
    NUM_LABELS=28,
)
CONFIG


{'TEXT_MODEL': 'distilbert-base-uncased',
 'MAX_LEN': 96,
 'BATCH_SIZE': 32,
 'LR': 2e-05,
 'EPOCHS': 3,
 'NUM_LABELS': 28}

In [4]:
!pip uninstall -y transformers datasets tokenizers
!pip install transformers==4.40.0 datasets==2.19.0


Collecting transformers==4.40.0
  Using cached transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
Collecting datasets==2.19.0
  Using cached datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.0)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow-hotfix (from datasets==2.19.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.19.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.0)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.19.0)
  Downloading multiproces

# DistilBERT sanity check

In [5]:
from transformers import AutoTokenizer, TFAutoModel

# Load the tokenizer and the bare encoder for the checkpoint named in CONFIG.
tokenizer = AutoTokenizer.from_pretrained(CONFIG["TEXT_MODEL"])
model = TFAutoModel.from_pretrained(
    CONFIG["TEXT_MODEL"],
    from_pt=True  # build the TF model from the PyTorch checkpoint weights
)

# Push one short sentence through the encoder as a smoke test.
inputs = tokenizer("I feel anxious today", return_tensors="tf")
outputs = model(**inputs)

# Make the sanity check fail loudly under Run All instead of relying on the
# reader eyeballing the displayed shape: (batch, tokens, hidden size).
expected_shape = [1, inputs["input_ids"].shape[1], model.config.hidden_size]
actual_shape = outputs.last_hidden_state.shape.as_list()
assert actual_shape == expected_shape, (
    f"unexpected encoder output shape {actual_shape}, expected {expected_shape}"
)

outputs.last_hidden_state.shape




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-12-13 16:08:08.016237: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-13 16:08:08.016304: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-13 16:08:08.017764: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


TensorShape([1, 6, 768])

# Dataset access check

In [6]:
from datasets import load_dataset

# Fetch only the first ten training rows — enough to confirm the dataset
# can be downloaded and read in this environment.
ds = load_dataset(
    path="go_emotions",
    split="train[:10]",
)

# Show one record so its fields can be inspected in the cell output.
ds[0]


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/350k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

# Save config

In [7]:
import json

# Persist CONFIG as pretty-printed JSON in the project root.
config_path = f"{BASE_DIR}/config.json"
with open(config_path, "w") as fh:
    fh.write(json.dumps(CONFIG, indent=2))

print("Phase 0 complete.")


Phase 0 complete.
