In [1]:
import tensorflow as tf




In [2]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [3]:
(train_data,test_data),info=tfds.load(
    "imdb_reviews",
    split=['train','test'],
    as_supervised=True,
    with_info=True
)
print(info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='C:\\Users\\LENOVO\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    nondeterministic_order=False,
    splits={
        'test': 

In [4]:
for text,label in train_data.take(3):
    print("Review: ",text.numpy().decode("utf-8")[:])
    print("Label:",label.numpy())
    print("-"*50)

Review:  This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
Label: 0
--------------------------------------------------
Review:  I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occa

In [5]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
tokenizer=AutoTokenizer.from_pretrained(
    "distilbert-base-uncased"
)

In [7]:
MAX_LEN=96
def tokenize_review(text,label):
    text=text.numpy().decode("utf-8")
    tokens=tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )
    return(
        tokens["input_ids"],
        tokens["attention_mask"]
        ,label
    )

In [8]:
def tf_tokenize_review(text,label):
    input_ids,attention_mask,label=tf.py_function(
        tokenize_review,
        inp=[text,label],
        Tout=[tf.int32,tf.int32,tf.int64]
    )
    input_ids.set_shape([MAX_LEN])
    attention_mask.set_shape([MAX_LEN])
    label.set_shape([])
    
    return{
        "input_ids":input_ids,
        "attention_mask":attention_mask
    },label

In [9]:
BATCH_SIZE = 32

train_ds = (
    train_data
    .map(tf_tokenize_review, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(10000)
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

test_ds = (
    test_data
    .map(tf_tokenize_review, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)


In [10]:
for batch in train_ds.take(1):
    inputs, labels = batch
    print(f"{inputs['input_ids'].shape}")
    print(inputs["attention_mask"].shape)
    print(labels.shape)


(32, 96)
(32, 96)
(32,)


In [11]:
from transformers import TFAutoModel

bert=TFAutoModel.from_pretrained(
    "distilbert-base-uncased",
    use_safetensors=False
)
bert.trainable=False





TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint wa

In [12]:
class SentimentModel(tf.keras.Model):
    def __init__(self,transformer,num_classes=2):
        super().__init__()
        self.transformer=transformer
        self.classifier=tf.keras.layers.Dense(
            num_classes,
            activation="softmax"
        )
    def call(self,inputs):
        outputs=self.transformer(inputs)
        token_embeddings=outputs.last_hidden_state
        mask=tf.cast(inputs["attention_mask"],tf.float32)
        mask=tf.expand_dims(mask,axis=-1)
        pooled=tf.reduce_sum(token_embeddings*mask,axis=1)
        pooled=pooled/tf.reduce_sum(mask,axis=1)
        return self.classifier(pooled)

In [13]:
model = SentimentModel(bert)

for inputs, labels in train_ds.take(1):
    outputs = model(inputs)
    print(outputs.shape)

(32, 2)


In [14]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"]
)

In [15]:
history=model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=2
)

Epoch 1/2


















Epoch 2/2


In [16]:
print(f"Final Train Accuracy: {history.history['accuracy'][-1]}")
print(f"Final Value Accuracy: {history.history['val_accuracy'][-1]}")

Final Train Accuracy: 0.6715599894523621
Final Value Accuracy: 0.704479992389679


In [17]:
print(tf.config.list_physical_devices("GPU"))

[]


In [18]:

for layer in bert.layers[-2:]:
    layer.trainable=True

In [19]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"]
)

In [21]:
history_ft=model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=1
)



In [23]:
print(f"Fine Tuned Train Accuracy: {history_ft.history['accuracy'][-1]}")
print(f"Fine Tuned Test Accuracy: {history_ft.history['val_accuracy'][-1]}")

Fine Tuned Train Accuracy: 0.7117199897766113
Fine Tuned Test Accuracy: 0.718720018863678


In [24]:
import os
MODEL_DIR="models/classifier/sentiment"
os.makedirs(MODEL_DIR,exist_ok=True)
model.save(MODEL_DIR)







































INFO:tensorflow:Assets written to: models/classifier/sentiment\assets


INFO:tensorflow:Assets written to: models/classifier/sentiment\assets










In [26]:
TOKENIZER_DIR="models/embedder/tokenizer"
os.makedirs(TOKENIZER_DIR,exist_ok=True)
tokenizer.save_pretrained(TOKENIZER_DIR)

('models/embedder/tokenizer\\tokenizer_config.json',
 'models/embedder/tokenizer\\special_tokens_map.json',
 'models/embedder/tokenizer\\vocab.txt',
 'models/embedder/tokenizer\\added_tokens.json',
 'models/embedder/tokenizer\\tokenizer.json')