Easy

In [2]:
!pip install --upgrade tensorflow-hub



In [6]:
import tensorflow as tf
import keras_hub
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import numpy as np

# Load GloVe embeddings pre-trained
embedding = hub.load('https://tfhub.dev/google/Wiki-words-250/1')

# Prepare the text dataset
texts = ["Example sentence 1", "Example sentence 2", "Example sentence 3"]
labels = [0, 1, 0]  # Binary classification

# Tokenize and encode the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, padding='post')

# Split dataset into training and test
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)

# Build the text classification model
# Get the embedding matrix from the loaded embedding model

# Instead of directly accessing embedding.variables[0],
# use the embedding layer within the model definition.
# This allows TensorFlow to manage the scope of the variables correctly.

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=250,
                              # weights=[embedding_matrix],  # Remove this line
                              trainable=False),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Apply the embedding to your input data (X_train or X) and then access the embeddings
# This is done inside the model, ensuring correct scope.

# Example:
# embeddings = model.layers[0](X_train) # Access the embedding layer and apply it to input
# embedding_matrix = embeddings.numpy() # Now you can get the embedding matrix

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Convert X_train and y_train to TensorFlow tensors:
X_train = tf.convert_to_tensor(X_train, dtype=tf.int32)
y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)  # Use tf.float32 for binary classification

model.fit(X_train, y_train, epochs=5) # Now call model.fit

Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.5000 - loss: 0.6954
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 0.5000 - loss: 0.6947
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.5000 - loss: 0.6940
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.5000 - loss: 0.6934
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.5000 - loss: 0.6927


<keras.src.callbacks.history.History at 0x7d93f8272710>

Intermediate

In [9]:
import tensorflow as tf
import keras_hub
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

# Prepare BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Prepare dataset
texts = ["This is a good product.", "I am not happy with this.", "Great service."]
labels = [1, 0, 1]  # 1 for positive, 0 for negative

# Tokenize the texts
inputs = tokenizer(texts, return_tensors='tf', padding=True, truncation=True, max_length=512)

# Convert the TensorFlow tensor to a NumPy array before splitting
input_ids = inputs['input_ids'].numpy()

# Split into train and test sets using the NumPy array
X_train, X_test, y_train, y_test = train_test_split(input_ids, labels, test_size=0.2)

# Convert the training and testing data back to TensorFlow tensors
X_train = tf.convert_to_tensor(X_train, dtype=tf.int32)
X_test = tf.convert_to_tensor(X_test, dtype=tf.int32)

# Convert y_train and y_test to TensorFlow tensors as well
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32) # Convert y_train to a tensor
y_test = tf.convert_to_tensor(y_test, dtype=tf.int32) # Convert y_test to a tensor


# Fine-tune the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=3)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7d939d433050>

Advanced

In [10]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import tensorflow as tf

# Load GPT-2 pre-trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

# Encode input text and generate output
input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="tf")

# Generate text
output = model.generate(inputs['input_ids'], max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a


Expert

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Summarize the text
input_text = "The quick brown fox jumps over the lazy dog. The dog is very lazy and does not move much."

# Tokenize and encode input text
inputs = tokenizer("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate summary
summary_ids = model.generate(inputs['input_ids'], max_length=50, min_length=10, length_penalty=2.0)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(f"Summary: {summary}")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Summary: the quick brown fox jumps over the lazy dog.
