# TensorFlow BERT Model Training
## Getting Started

This notebook is adapted from the [BBC text categorization notebook](https://www.kaggle.com/code/yufengdev/bbc-text-categorization/notebook) that accompanies the Kaggle article.

## Setup

In [1]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:

from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

import os
import itertools

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
layers = keras.layers
models = keras.models


# Originally written against TensorFlow v1.8; the version used for this run is printed below
print("You have TensorFlow version", tf.__version__)


  from .autonotebook import tqdm as notebook_tqdm


You have TensorFlow version 2.15.0


## Data Loading

In [3]:
# Read the BBC articles CSV from the working directory into a DataFrame.
# The `category` and `text` columns are the ones used by the cells below.
datapath = "bbc-text.csv"
data = pd.read_csv(filepath_or_buffer=datapath)
data

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [4]:
# Count the rows per category to see how balanced the classes are.
data['category'].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [5]:
# Use the first 80% of the rows for training and the remainder for testing.
train_size = int(0.8 * len(data))
test_size = len(data) - train_size
print("Train size: %d" % train_size)
print("Test size: %d" % test_size)

Train size: 1780
Test size: 445


In [6]:
def train_test_split(data, train_size):
    """Split a sliceable sequence into a leading and a trailing part.

    NOTE: this definition shadows `sklearn.model_selection.train_test_split`
    imported at the top of the notebook. Unlike the sklearn helper it does
    not shuffle; it simply cuts the sequence at position `train_size`.

    Args:
        data: Any object supporting slice indexing (list, str, pandas Series, ...).
        train_size: Number of leading items that go into the training part.

    Returns:
        A `(train, test)` tuple where `train` is `data[:train_size]` and
        `test` is `data[train_size:]`.
    """
    return data[:train_size], data[train_size:]

## Data Preparation
#### 1. Split the data into training and testing sets

In [7]:
# Cut the label column and the text column at the same row position so the
# training/testing labels stay aligned with their texts.
(train_cat, test_cat), (train_text, test_text) = (
    train_test_split(data[column], train_size)
    for column in ('category', 'text')
)

#### 2. Then we'll tokenize the words (text), and then convert them to a numbered index.

In [8]:
# Size of the vocabulary passed to the tokenizer; it is also the width of the
# bag-of-words vectors and the input width of the model built further down.
max_words = 1000

# Word-level (not character-level) tokenizer limited to `max_words` entries.
tokenize = keras.preprocessing.text.Tokenizer(
    num_words=max_words,
    char_level=False,
)

#### 3. Fit the tokenizer on the training text and vectorize both splits. Next we'll do the same for the labels (categories), by using the LabelEncoder utility.

In [9]:
# Build the vocabulary from the training texts only, so nothing from the test
# split influences the word index.
tokenize.fit_on_texts(train_text)

# Convert each split into a (num_documents, max_words) matrix using the
# tokenizer's default matrix mode.
x_train, x_test = (
    tokenize.texts_to_matrix(texts) for texts in (train_text, test_text)
)

Use sklearn utility to convert label strings to numbered index

In [10]:
# Map the category strings to integer ids with sklearn's LabelEncoder.
# The encoder is fitted on the training labels only and then applied to both splits.
encoder = LabelEncoder().fit(train_cat)
y_train, y_test = (
    encoder.transform(labels) for labels in (train_cat, test_cat)
)

#### 4. Finally, we'll convert the labels to a one-hot representation.

In [11]:

# Converts the labels to a one-hot representation.
#
# The class count comes from the fitted encoder, and the integer ids are
# re-derived from the raw label columns, so this cell is idempotent.
# Previously it overwrote y_train/y_test using their own old values, so running
# it a second time computed num_classes == 2 from the already one-hot data and
# produced arrays of the wrong shape.
num_classes = len(encoder.classes_)
y_train = keras.utils.to_categorical(encoder.transform(train_cat), num_classes)
y_test = keras.utils.to_categorical(encoder.transform(test_cat), num_classes)

In [12]:
# Inspect the dimensions of our training and test data (this is helpful to debug)
for label, array in (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(label, array.shape)

x_train shape: (1780, 1000)
x_test shape: (445, 1000)
y_train shape: (1780, 5)
y_test shape: (445, 5)


## Train the Model

In [13]:
# Training hyperparameters.
# The model converges very quickly on this dataset: 2 epochs are already more
# than enough, and training longer will likely lead to overfitting.
batch_size, epochs = 32, 2

# Dropout rate; in the visible notebook it is only referenced by the
# commented-out Dropout layer in the model-building cell.
drop_ratio = 0.5

In [14]:
# Build the model: one hidden dense layer over the bag-of-words vector,
# followed by a softmax over the categories.
model = models.Sequential([
    layers.Dense(512, input_shape=(max_words,)),
    layers.Activation('relu'),
    # layers.Dropout(drop_ratio),  # currently disabled
    layers.Dense(num_classes),
    layers.Activation('softmax'),
])

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# model.fit trains the model.
# validation_split tells Keras what fraction of the training data to hold out
# as a validation set; watch val_loss in the per-epoch log to spot overfitting.
# Training simply stops after `epochs` passes: no early-stopping callback is used.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Epoch 1/2


Epoch 2/2


## Evaluate Model

In [16]:
# Evaluate the trained model on the held-out test split.
# With a single compiled metric, evaluate() returns [loss, accuracy].
score = model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.13507680594921112
Test accuracy: 0.9550561904907227


In [22]:
import pickle

# Persist everything needed for inference: the trained model, the fitted
# tokenizer (text -> matrix) and the fitted label encoder (id -> category).
# NOTE(review): Keras' own `model.save(...)` is the usual way to persist a
# model; pickle is kept here because the loading cell reads this .pkl file.
artifacts = (
    ('text_categorization_model.pkl', model),
    ('tokenizer.pkl', tokenize),
    ('encoder.pkl', encoder),
)
for filename, artifact in artifacts:
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

print("Model has been saved as 'text_categorization_model.pkl'")


Model has been saved as 'text_categorization_model.pkl'


In [21]:
import pickle

# SECURITY: pickle.load can execute arbitrary code. Only load artifact files
# that you created yourself (here: the ones written by the saving cell).

# Load the model together with the tokenizer and label encoder it was trained
# with. Using the saved artifacts lets this cell run in a fresh session instead
# of relying on `tokenize` / `encoder` still being alive in the kernel.
with open('text_categorization_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
with open('tokenizer.pkl', 'rb') as file:
    loaded_tokenizer = pickle.load(file)
with open('encoder.pkl', 'rb') as file:
    loaded_encoder = pickle.load(file)

print("Model has been loaded.")

# Example new texts
new_texts = ["messi is good soccer player", "I love three body problem, it is a good movie"]

# Vectorize the new texts with the same vocabulary used for training
x_new = loaded_tokenizer.texts_to_matrix(new_texts)

# Make predictions with the loaded model (one probability row per text)
predictions = loaded_model.predict(x_new)

# Pick the most probable class id per row and map it back to its label name
predicted_labels = loaded_encoder.inverse_transform(np.argmax(predictions, axis=1))

print(predicted_labels)



Model has been loaded.
['sport' 'entertainment']
