<a href="https://colab.research.google.com/github/wkambale/Fine-tuning-BERT-for-text-classification-with-KerasNLP/blob/main/Text_Classification_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning BERT for text classification with KerasNLP

A guide to mastering text classification with BERT and KerasNLP

Link to Article: https://kambale.dev/feature-extraction-in-ml

NB: To use this notebook, make a copy first.

MIT License: Copyright (c) 2024 **Wesley Kambale**

## Install necessary libraries

In [None]:
!pip install keras-nlp
!pip install --upgrade transformers
!pip install --upgrade tensorflow

## Import the libraries

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification

import pandas as pd
from sklearn.model_selection import train_test_split

import keras
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam

# KerasNLP publishes its tokenizer base class under `keras_nlp.tokenizers`;
# it cannot be imported from the package root.
from keras_nlp.tokenizers import Tokenizer

# KerasNLP has no `load_bert_finetuned_model`. The name is kept (so existing
# callers keep working) and bound to Keras' generic saved-model loader, which
# takes the same single path argument.
from keras.models import load_model as load_bert_finetuned_model

## Load the BERT Tokenizer and Model (Hugging Face Transformers)

In [None]:
# Single source of truth for the pretrained checkpoint, so the tokenizer and
# the model can never be loaded from different vocabularies by accident.
model_name = 'bert-base-uncased'

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load BERT model (the sequence-classification head on top of the encoder is
# freshly initialised; only the encoder weights come from the checkpoint)
bert_model = TFBertForSequenceClassification.from_pretrained(model_name)

## Text Classification

In [None]:
# Load your dataset
data = pd.read_csv('sentiment.csv')

# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Every example is padded/truncated to this many tokens so that both splits
# share one fixed width. 128 is a starting point to tune; BERT accepts at
# most 512 positions.
MAX_SEQ_LENGTH = 128

# Text label -> integer class id (0 for negative, 1 for positive)
LABEL_TO_ID = {'negative': 0, 'positive': 1}


def encode_texts(texts):
    """Convert an iterable of strings into a 2-D int32 array of BERT token ids.

    Relies on the `tokenizer` loaded earlier in the notebook. The tokenizer is
    *called* (rather than using `.tokenize`, which only splits a single string
    into word pieces) so the result is padded, truncated numeric ids.
    """
    encoded = tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors='np',
    )
    return encoded['input_ids'].astype('int32')


def encode_labels(labels):
    """Map a Series of sentiment strings to integer ids.

    Raises:
        ValueError: if a label is missing from LABEL_TO_ID. Without this check
            `Series.map` would silently turn such rows into NaN.
    """
    ids = labels.map(LABEL_TO_ID)
    unmapped = ids.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f'Unexpected sentiment labels: {unknown}')
    return ids.astype('int64').values


# Tokenize and convert text data to BERT input format
X_train = encode_texts(train_data['text'])
X_test = encode_texts(test_data['text'])

# Convert labels to numerical format (0 for negative, 1 for positive)
y_train = encode_labels(train_data['sentiment'])
y_test = encode_labels(test_data['sentiment'])

## BERT Text Classification Model

In [None]:
# Define the input layer: a batch of int32 token-id sequences. The sequence
# axis is left as None so any padded length (up to BERT's 512-token limit)
# is accepted.
input_layer = Input(shape=(None,), dtype='int32')

# Run the ids through the encoder that lives inside the classification model.
# Calling `bert_model` directly only yields `logits`; the pooled [CLS]
# representation is exposed by the inner encoder as `pooler_output`.
# NOTE(review): no attention mask is passed, so padding positions are attended
# to like real tokens — consider adding a mask input.
bert_output = bert_model.bert(input_layer)

# Add a dense layer for classification (single sigmoid unit = P(positive))
output_layer = Dense(1, activation='sigmoid')(bert_output.pooler_output)

# Build the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Training & Evaluating the Model

In [None]:
# Fit on the training split, holding back 10% of it for validation
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
)

# Score the held-out test split and report both metrics
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

## Fine-Tuning BERT for Specific Tasks

In [None]:
# Specify the path to the fine-tuned BERT model
fine_tuned_model_path = 'path/to/fine_tuned_model'

# Load it with Keras' own saved-model loader (KerasNLP provides no dedicated
# "load fine-tuned BERT" helper).
# NOTE(review): the accepted on-disk format depends on the installed Keras
# version (newer releases expect a `.keras` or `.h5` file) — confirm the
# placeholder path matches how the model was saved.
fine_tuned_model = keras.models.load_model(fine_tuned_model_path)

In [None]:
# Load your task-specific dataset
task_data = pd.read_csv('task_specific_data.csv')

# Tokenize and convert text data to BERT input format.
# The tokenizer is called directly: `.tokenize` only splits one string into
# word pieces, whereas the call returns numeric ids. Sequences are padded to
# the longest example and truncated at the tokenizer's maximum length.
# NOTE(review): assumes `tokenizer` is the Hugging Face BertTokenizer loaded
# near the top of the notebook — confirm it has not been rebound.
task_encoding = tokenizer(
    task_data['text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='np',
)
X_task = task_encoding['input_ids'].astype('int32')

# Convert labels to numerical format
y_task = task_data['label'].values

In [None]:
# Continue training on the task data, holding back 10% for validation
fine_tuned_model.fit(
    x=X_task,
    y=y_task,
    batch_size=16,
    epochs=5,
    validation_split=0.1,
)

# Persist the updated weights
fine_tuned_model.save('path/to/save/fine_tuned_model')