In [2]:
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load pre-trained BERT model for sequence classification with 4 topic classes
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
Total params: 109485316 (417.65 MB)
Trainable params: 109485316 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [6]:
# Get the model configuration
model_config = model.get_config()
print(model_config)

{'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_

In [7]:
# Count the number of layers in the model
num_layers = len(model.layers)
print(f"Total number of layers: {num_layers}")

Total number of layers: 3


In [8]:
bert_model = model.bert

# Count the number of encoder layers
num_encoder_layers = len(bert_model.encoder.layer)

print(f"Number of transformer layers (encoder layers) in BERT: {num_encoder_layers}")

Number of transformer layers (encoder layers) in BERT: 12


In [9]:
# Freeze the first 8 layers of the BERT encoder
for layer in model.bert.encoder.layer[:8]:
    layer.trainable = False  # Freezing layers

# Check which layers are frozen
for layer_num, layer in enumerate(model.bert.encoder.layer):
    print(f"Layer {layer_num}: {'trainable' if layer.trainable else 'frozen'}")

Layer 0: frozen
Layer 1: frozen
Layer 2: frozen
Layer 3: frozen
Layer 4: frozen
Layer 5: frozen
Layer 6: frozen
Layer 7: frozen
Layer 8: trainable
Layer 9: trainable
Layer 10: trainable
Layer 11: trainable


In [10]:
# Load dataset (ensure it has 'text' and 'label' columns)
df = pd.read_csv('/kaggle/input/topic-classification/dataset.csv')

In [11]:
df.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [12]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [13]:
df.shape

(50000, 2)

In [14]:
df["label"].unique()

array([2, 3, 1, 0])

In [15]:
df.columns

Index(['text', 'label'], dtype='object')

In [16]:
# Split the data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

In [17]:
# Tokenize the text data
def tokenize_data(texts, tokenizer, max_len=512):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='tf'
    )

In [18]:
train_encodings = tokenize_data(train_texts, tokenizer)
val_encodings = tokenize_data(val_texts, tokenizer)

In [19]:
# Create TensorFlow dataset objects
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).batch(16)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(16)

In [20]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [21]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
Total params: 109485316 (417.65 MB)
Trainable params: 52782340 (201.35 MB)
Non-trainable params: 56702976 (216.30 MB)
_________________________________________________________________


In [22]:
# Fine-tune the model
history = model.fit(train_dataset, validation_data=val_dataset, epochs=1)

Cause: for/else statement not yet supported


I0000 00:00:1726670875.283759     103 service.cc:145] XLA service 0x78fade127330 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726670875.283840     103 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1726670875.283848     103 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1726670875.464843     103 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [24]:
# Extract history for accuracy, loss, validation accuracy, and validation loss
training_accuracy = history.history.get('accuracy', [])
training_loss = history.history.get('loss', [])
validation_accuracy = history.history.get('val_accuracy', [])
validation_loss = history.history.get('val_loss', [])

# Print the training accuracy history
print("Training Accuracy History:")
print(training_accuracy)

# Print the training loss history
print("\nTraining Loss History:")
print(training_loss)

# Print the validation accuracy history
print("\nValidation Accuracy History:")
print(validation_accuracy)

# Print the validation loss history
print("\nValidation Loss History:")
print(validation_loss)


Training Accuracy History:
[0.9106500148773193]

Training Loss History:
[0.2649601995944977]

Validation Accuracy History:
[0.9337999820709229]

Validation Loss History:
[0.19422510266304016]


In [25]:
# Predict topic for a new text
category_mapping = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}

def predict_topic(text, model, tokenizer, category_mapping):
    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=512)
    outputs = model(inputs)
    prediction = tf.argmax(outputs.logits, axis=-1).numpy()[0]
    return category_mapping[prediction]

# Example usage
text = "The stock market is experiencing volatility due to recent global events."
predicted_topic = predict_topic(text, model, tokenizer, category_mapping)
print(f'Predicted Topic: {predicted_topic}')


Predicted Topic: Business


In [26]:
# Save the model in Hugging Face format
model.save_pretrained('fine_tuned_bert_model')

# Save the tokenizer
tokenizer.save_pretrained('fine_tuned_bert_model')


('fine_tuned_bert_model/tokenizer_config.json',
 'fine_tuned_bert_model/special_tokens_map.json',
 'fine_tuned_bert_model/vocab.txt',
 'fine_tuned_bert_model/added_tokens.json')

In [27]:
import shutil

# Zip the model directory
shutil.make_archive('fine_tuned_bert_model', 'zip', 'fine_tuned_bert_model')

# Download the zipped model
from IPython.display import FileLink
FileLink(r'fine_tuned_bert_model.zip')