In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every
# file found, so the dataset path read later in the notebook can be checked.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bbc-news/bbc-text.csv


# DATASET 


**Add the BBC News dataset to this notebook using the Input tab on the right side. I used this dataset:**
[https://www.kaggle.com/datasets/moazeldsokyx/bbc-news](https://www.kaggle.com/datasets/moazeldsokyx/bbc-news)


# model

In [13]:
# import tensorflow_hub as hub
# import bert.tokenization.FullTokenizer as tokenizer

# I'd like to use TensorFlow libraries here,
# but my internet speed is only around 0.3 Mbps,
# which means it takes over an hour to download both TensorFlow Hub and the tokenizer.
# So, for now, I'm skipping them.

from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
import keras

# Hugging Face checkpoint id shared by the tokenizer and the encoder, so the
# vocabulary and the pretrained weights always come from the same model.
model_name = "bert-base-uncased"

# Load the tokenizer, then the bare TF encoder (no task-specific head); the
# classification head is added by the TextClassifier wrapper.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_layer = TFAutoModel.from_pretrained(model_name)


class TextClassifier(keras.Model):
    """Pretrained encoder followed by dropout and a softmax output layer.

    The encoder's hidden state at sequence position 0 (the ``[CLS]`` slot for
    BERT-style tokenizers) is used as the representation of the whole text.

    NOTE(review): ``bert_layer`` is created by ``transformers`` while this
    class subclasses ``keras.Model``. If those resolve to different Keras
    implementations (Keras 3 vs. ``tf_keras``), the encoder weights may not be
    tracked as trainable here, leaving only the head to learn -- confirm by
    inspecting ``model.trainable_weights`` after the first forward pass.

    Args:
        bert_layer: Callable encoder whose output exposes
            ``last_hidden_state``.
        num_classes: Number of units (classes) in the output layer.
        dropout_rate: Dropout rate applied to the pooled representation before
            the output layer. Defaults to 0.3, the value that used to be
            hard-coded.
    """

    def __init__(self, bert_layer, num_classes, dropout_rate=0.3):
        super().__init__()
        self.bert = bert_layer
        self.dropout = keras.layers.Dropout(dropout_rate)
        self.dense = keras.layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Return per-class softmax probabilities for a batch of inputs.

        Args:
            inputs: Whatever the wrapped encoder accepts (in this notebook, a
                dict of token tensors produced by the tokenizer).

        Returns:
            Output of the dense softmax layer, one row per example.
        """
        hidden_states = self.bert(inputs).last_hidden_state
        # Keep only the first position of every sequence as the pooled vector.
        pooled = hidden_states[:, 0, :]
        return self.dense(self.dropout(pooled))


# Wrap the pretrained encoder with the classification head.
# NOTE: 5 is hard-coded and must equal the number of distinct `category`
# values that are mapped to integer label ids further down in this cell.
model = TextClassifier(bert_layer=bert_layer, num_classes=5)

# Labels are plain integer ids (not one-hot), hence the *sparse* categorical
# cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
)

# ----------------------------------
# reading data
import pandas as pd
from sklearn.model_selection import train_test_split
import os


# Load the BBC News CSV (two string columns: `category` and `text`).
df = pd.read_csv("/kaggle/input/bbc-news/bbc-text.csv")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()
print(df.head())

# Map each category name to an integer id. Sorting the names first makes the
# mapping independent of the row order in the CSV.
label2id = {label: idx for idx, label in enumerate(sorted(df["category"].unique()))}
df["label_id"] = df["category"].map(label2id)

# 90/10 train/validation split, stratified so both parts keep the same class
# proportions; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    df, test_size=0.1, stratify=df["label_id"], random_state=42
)


def preprocessing(texts, labels, tokenizer, max_length=128):
    """Tokenize a batch of texts and pair it with a tensor of labels.

    Args:
        texts: Object exposing ``.tolist()`` that yields the raw strings
            (e.g. a pandas Series).
        labels: Object exposing ``.tolist()`` that yields the integer class
            ids, in the same order as ``texts``.
        tokenizer: Hugging Face style callable tokenizer returning a mapping
            of encoded fields.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens.

    Returns:
        A ``(inputs, labels)`` tuple: ``inputs`` is a dict with
        ``input_ids`` and ``attention_mask`` (plus ``token_type_ids`` when the
        tokenizer produces it) and ``labels`` is the tensor built from the
        label list.

    Raises:
        KeyError: If the tokenizer output lacks ``input_ids`` or
            ``attention_mask``.
    """
    enc = tokenizer(
        texts.tolist(),
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
    )
    inputs = {
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
    }
    # Segment ids are optional: some tokenizers (e.g. DistilBERT) do not emit
    # them, and indexing unconditionally would raise KeyError for those.
    if "token_type_ids" in enc:
        inputs["token_type_ids"] = enc["token_type_ids"]
    return inputs, tf.convert_to_tensor(labels.tolist())


# Tokenize both splits up front; everything is kept in memory as tensors.
train_enc, train_labels = preprocessing(
    train_df["text"], train_df["label_id"], tokenizer
)
val_enc, val_labels = preprocessing(
    val_df["text"], val_df["label_id"], tokenizer
)

# The shuffle buffer covers the whole training set (roughly 2,000 rows). A
# buffer smaller than the dataset only shuffles within a sliding window, so
# every epoch would see the examples in nearly the same order.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_enc, train_labels))
    .shuffle(len(train_df))
    .batch(16)
)
# Validation data is evaluated in a fixed order, so no shuffling is needed.
val_ds = tf.data.Dataset.from_tensor_slices((val_enc, val_labels)).batch(16)

# Keep the History object: it holds the per-epoch metrics for later
# inspection, and binding it avoids the bare object repr in the cell output.
history = model.fit(train_ds, validation_data=val_ds, epochs=30)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None
        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...
Epoch 1/30


W0000 00:00:1755766157.444155     110 assert_op.cc:38] Ignoring Assert operator text_classifier_6_1/tf_bert_model_6/bert/embeddings/assert_less/Assert/Assert


[1m125/126[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 146ms/step - accuracy: 0.2938 - loss: 1.5557

W0000 00:00:1755766179.309161     111 assert_op.cc:38] Ignoring Assert operator text_classifier_6_1/tf_bert_model_6/bert/embeddings/assert_less/Assert/Assert


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step - accuracy: 0.2944 - loss: 1.5550

W0000 00:00:1755766186.380563     109 assert_op.cc:38] Ignoring Assert operator text_classifier_6_1/tf_bert_model_6/bert/embeddings/assert_less/Assert/Assert
W0000 00:00:1755766190.452233     111 assert_op.cc:38] Ignoring Assert operator text_classifier_6_1/tf_bert_model_6/bert/embeddings/assert_less/Assert/Assert


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 258ms/step - accuracy: 0.2950 - loss: 1.5543 - val_accuracy: 0.5516 - val_loss: 1.3203
Epoch 2/30
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 171ms/step - accuracy: 0.5707 - loss: 1.2897 - val_accuracy: 0.7220 - val_loss: 1.1467
Epoch 3/30
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 161ms/step - accuracy: 0.7256 - loss: 1.1201 - val_accuracy: 0.8161 - val_loss: 1.0059
Epoch 4/30
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 157ms/step - accuracy: 0.8098 - loss: 0.9798 - val_accuracy: 0.8565 - val_loss: 0.8898
Epoch 5/30
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 160ms/step - accuracy: 0.8556 - loss: 0.8628 - val_accuracy: 0.9013 - val_loss: 0.7946
Epoch 6/30
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 164ms/step - accuracy: 0.8972 - loss: 0.7614 - val_accuracy: 0.9103 - val_loss: 0.7151
Epoch 7/30
[1m126/12

<keras.src.callbacks.history.History at 0x783ebdfcab50>