In [39]:
# Text generation using GPT-2 model
# Load Libraries
from transformers import pipeline

# Load text generation pipeline with GPT-2
generator = pipeline("text-generation", model="gpt2")

# Prompt for the LLM
prompt = "BERT model is used for Classification"

# Generate text
output = generator(prompt, max_length=60, num_return_sequences=1)

print("Generated Text:\n", output[0]["generated_text"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Text:
 BERT model is used for Classification of the Human Brain. These models are more sophisticated than the T.M.R. model, and can be used in large-scale, non-linear models, but can only be used in very small-scale, non-linear models.

The primary differences between the two models are their similarity to the recent T.M.R. model, which has been used in several studies (e.g., in the literature, the literature of the British Medical Journal, and the International Journal of Neurochemistry). In the T.M.R. model, a model of brain function has been proposed, but this model has not been updated to account for the present (e.g., the current model is based on the T.M.R. model). This model, however, is very similar to the T.M.R. model in many different ways.

The changes in the neural activity of the human brain in the T.M.R. model are most pronounced during adolescence (e.g., in the brainstem, hippocampus, and cerebellum) and during adulthood (e.g., in the cerebellum). The T.M.R. mo

In [41]:
# Fine tuning
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class SimpleTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

texts = [
    "I like this movie!",
    "This film was bad"]
labels = [1, 0]  # 1 = positive, 0 = negative

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

dataset = SimpleTextDataset(texts, labels, tokenizer)

training_args = TrainingArguments(
    output_dir="./bert-model",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    logging_dir="./logs",
    logging_steps=1,
    overwrite_output_dir=True
)

trainer = Trainer(model=model,args=training_args,train_dataset=dataset)

trainer.train()