# Running keras-nlp examples on CPU

- https://github.com/keras-team/keras-nlp/

# 0. Keras dataset basics

- Get the contents/labels out of a dataset
- Build a dataset from raw contents/labels

In [None]:
# download data
import tensorflow_datasets as tfds

# Fetch the full IMDB reviews train and test splits; each element is a
# batch of 16 (contents, labels) pairs.
imdb_train, imdb_test = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    batch_size=16,
    as_supervised=True,
)


In [None]:
# Show the Python type of the training split object returned by tfds.load
type(imdb_train)

In [None]:
# Check one batch of data: pull just the first batch from the dataset
# iterator instead of starting a loop and breaking out of it immediately.
x = next(iter(imdb_train))

print(x)

In [None]:
# x is one batch: x[0] holds the review contents and x[1] the labels.
# Displays the size of the batch tuple and the number of contents / labels in it.
len(x), len(x[0]), len(x[1])  # x[0]: contents;   x[1]: label

In [None]:
# Get data size: every batch is a (contents, labels) pair, so summing the
# number of contents per batch gives the number of samples in the split.
# The generator expression keeps its loop variable local, so the batch `x`
# inspected in the earlier cells is not overwritten by this cell.
sample_count_train = sum(len(batch[0]) for batch in imdb_train)
print(sample_count_train)

sample_count_test = sum(len(batch[0]) for batch in imdb_test)
print(sample_count_test)


In [None]:
# load 2% of data
import tensorflow_datasets as tfds

# Load only the first 2% of the train and test splits, in batches of 16.
imdb_s_train, imdb_s_test = tfds.load(
    "imdb_reviews",
    split=["train[:2%]", "test[:2%]"],
    as_supervised=True,
    batch_size=16,
)

# Get data size: every batch is a (contents, labels) pair, so summing the
# number of contents per batch gives the number of samples in the split.
# The generator expression keeps its loop variable local, so no global
# batch variable is overwritten as a side effect of counting.
sample_count_train = sum(len(batch[0]) for batch in imdb_s_train)
print(sample_count_train)

sample_count_test = sum(len(batch[0]) for batch in imdb_s_test)
print(sample_count_test)

In [None]:
# raw data to dataset
import tensorflow as tf


def build_dataset(texts, labels=None, batch_size=32,
                  cache=False, drop_remainder=True,
                  repeat=False, shuffle=True):
    """Build a batched ``tf.data.Dataset`` from in-memory texts (and labels).

    Args:
        texts: Sequence of raw text samples.
        labels: Optional sequence of labels aligned with ``texts``. When
            omitted, each dataset element is the 1-tuple ``(text,)``.
        batch_size: Number of samples per batch.
        cache: Cache the dataset elements when true.
        drop_remainder: Drop the last batch if it holds fewer than
            ``batch_size`` samples.
        repeat: Repeat the dataset indefinitely when true.
        shuffle: ``True`` shuffles with a buffer that covers every sample;
            an integer is used directly as the shuffle buffer size; a falsy
            value disables shuffling.

    Returns:
        The batched, prefetched ``tf.data.Dataset``.
    """
    slices = (texts,) if labels is None else (texts, labels)
    ds = tf.data.Dataset.from_tensor_slices(slices)
    if cache:
        ds = ds.cache()
    if repeat:
        ds = ds.repeat()
    opt = tf.data.Options()
    if shuffle:
        # The flag itself must not be used as the buffer size: a boolean
        # True is at best a buffer of 1, which leaves the order unchanged.
        # Use the sample count for a full shuffle; keep explicit integers.
        if isinstance(shuffle, bool):
            buffer_size = max(len(texts), 1)
        else:
            buffer_size = int(shuffle)
        ds = ds.shuffle(buffer_size, seed=666)
        # Allow elements to be produced out of order for throughput.
        opt.deterministic = False
    ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.prefetch(tf.data.AUTOTUNE)  # overlap batch preparation with consumption
    return ds

# Toy raw data: seven six-letter strings labelled 1, then seven
# three-letter strings labelled 0.
texts = [ch * 6 for ch in 'abcdefg'] + [ch * 3 for ch in 'abcdefg']
labels = [1] * 7 + [0] * 7
# Turn the raw lists into a dataset with two samples per batch.
ds = build_dataset(texts, labels, batch_size=2)
# Iterate over the dataset and show every batch.
for x in ds:
    print(x)

# 1. keras-nlp example 1: basic training and testing

In [None]:
import os
os.environ["KERAS_BACKEND"] = "torch"  # "tensorflow" or "jax" or "torch"!

import keras_nlp
import tensorflow_datasets as tfds

# Download the dataset, keeping only the first 2% of each split.
imdb_train, imdb_test = tfds.load(
    name="imdb_reviews",
    split=["train[:2%]", "test[:2%]"],
    batch_size=16,
    as_supervised=True,
)
# Load a BERT classifier from its preset (this downloads the model).
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_base_en_uncased",
    activation="softmax",
    num_classes=2,
)
# Fine-tune on the IMDb movie reviews, validating on the test subset.
classifier.fit(
    imdb_train,
    epochs=3,
    validation_data=imdb_test,
)
# Predict two new examples.
classifier.predict(["What an amazing movie!", "A total waste of my time."])


# 2. load your own dataset for training and prediction

In [None]:
import os
os.environ["KERAS_BACKEND"] = "torch"  # "tensorflow" or "jax" or "torch"!
import tensorflow as tf
import keras_nlp
import tensorflow_datasets as tfds


def build_dataset(texts, labels=None, batch_size=32,
                  cache=False, drop_remainder=True,
                  repeat=False, shuffle=True):
    """Build a batched ``tf.data.Dataset`` from in-memory texts (and labels).

    Args:
        texts: Sequence of raw text samples.
        labels: Optional sequence of labels aligned with ``texts``. When
            omitted, each dataset element is the 1-tuple ``(text,)``.
        batch_size: Number of samples per batch.
        cache: Cache the dataset elements when true.
        drop_remainder: Drop the last batch if it holds fewer than
            ``batch_size`` samples.
        repeat: Repeat the dataset indefinitely when true.
        shuffle: ``True`` shuffles with a buffer that covers every sample;
            an integer is used directly as the shuffle buffer size; a falsy
            value disables shuffling.

    Returns:
        The batched, prefetched ``tf.data.Dataset``.
    """
    slices = (texts,) if labels is None else (texts, labels)
    ds = tf.data.Dataset.from_tensor_slices(slices)
    if cache:
        ds = ds.cache()
    if repeat:
        ds = ds.repeat()
    opt = tf.data.Options()
    if shuffle:
        # The flag itself must not be used as the buffer size: a boolean
        # True is at best a buffer of 1, which leaves the order unchanged.
        # Use the sample count for a full shuffle; keep explicit integers.
        if isinstance(shuffle, bool):
            buffer_size = max(len(texts), 1)
        else:
            buffer_size = int(shuffle)
        ds = ds.shuffle(buffer_size, seed=666)
        # Allow elements to be produced out of order for throughput.
        opt.deterministic = False
    ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.prefetch(tf.data.AUTOTUNE)  # overlap batch preparation with consumption
    return ds

# Toy raw data: seven six-letter strings labelled 1, then seven
# three-letter strings labelled 0.
texts = [ch * 6 for ch in 'abcdefg'] + [ch * 3 for ch in 'abcdefg']
labels = [1] * 7 + [0] * 7
# Turn the raw lists into a dataset with two samples per batch.
ds = build_dataset(texts, labels, batch_size=2)
# Load a BERT classifier (this downloads the model).
# from_preset builds a keras.Model instance with preset preprocessing,
# architecture and weights, so raw strings in any format accepted by a
# keras.Model can be passed in and the output is specific to this task.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_base_en_uncased",
    activation="softmax",
    num_classes=2,
    load_weights=True,  # load the pre-trained weights; this is the default
)
# Train: this is fine-tuning, because pre-trained weights were loaded and
# only a small number of epochs is run.
classifier.fit(ds, epochs=2)
# Predict on two unseen strings.
classifier.predict(["hhhhhh", "ooo"])
