In [None]:
from datasets import load_dataset
from os import environ

# Debugging aid: ask CUDA to launch kernels synchronously. Set up front so it
# is in place before any later cell touches the GPU.
environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Checkpoint used for both the tokenizer and the classifier in later cells.
# Swap in one of the commented alternatives to try a different base model.
model_name = "google-bert/bert-base-cased"
# model_name = "TinyLlama/TinyLlama_v1.1"
# model_name = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"

# Yelp review dataset; later cells read its "train" and "test" splits.
dataset = load_dataset("yelp_review_full")

In [None]:
from transformers import AutoTokenizer, LlamaTokenizer

# Load the tokenizer that matches the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Disabled alternative: load the TinyLlama tokenizer through LlamaTokenizer explicitly.
# tokenizer = LlamaTokenizer.from_pretrained("TinyLlama/TinyLlama_v1.1")

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Uses the module-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True``, so every encoded sequence comes back at the same
    fixed length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw review text.

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    review_texts = examples["text"]
    encoded = tokenizer(review_texts, truncation=True, padding="max_length")
    return encoded

# Tokenize every split, feeding `tokenize_function` 1000 examples per call.
# keep_in_memory=False is passed explicitly so the result is not forced into RAM.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1000,
    keep_in_memory=False,
)

In [None]:
# Take a small, repeatable sample from each split: shuffle with a fixed seed,
# then keep the first 1000 rows. "train" feeds training, "test" feeds evaluation.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the `model_name` checkpoint with a sequence-classification head sized
# for 5 labels.
# NOTE(review): 5 presumably matches the number of star ratings in
# yelp_review_full -- confirm against the dataset's label feature.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

In [None]:
from transformers import TrainingArguments

# Baseline training configuration: outputs go to "test.trainer" and evaluation
# runs once per epoch.
# NOTE(review): `training_args` is reassigned in a later cell (with explicit
# per-device batch sizes) before the Trainer is constructed, so this instance
# appears to be unused -- consider removing this cell.
training_args = TrainingArguments(output_dir="test.trainer", eval_strategy="epoch")

In [None]:
import numpy as np
import evaluate

# Accuracy metric object; `compute_metrics` below calls its `.compute(...)`.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-item pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` comparing the arg-max class of each
        row of ``logits`` (taken over the last axis) against ``labels``.
    """
    raw_scores, true_labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=true_labels)


In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration actually handed to the Trainer below:
#   - outputs are written under "test.trainer"
#   - evaluation runs once per epoch
#   - one example per device per step, for both training and evaluation
training_args = TrainingArguments(
    output_dir="test.trainer",
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)
# The per_gpu_train_batch_size / per_gpu_eval_batch_size spellings were tried
# here previously and are left disabled in favour of the per_device_* ones.

In [None]:
# Wire the model, the training configuration, the two 1000-example subsets and
# the accuracy callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# Fine-tune first, then run a final evaluation pass. The evaluate() call stays
# the last expression so the notebook displays the metrics it returns.
trainer.train()
trainer.evaluate()



In [None]:
# Persist the fine-tuned model and the trainer's bookkeeping state.
#
# The output directory is derived from the `model_name` selected in the first
# cell. It is deliberately NOT re-assigned here: hard-coding the checkpoint id
# again in this cell would label the saved weights as BERT even when a
# different base model had been selected and trained above.
#
# `model_name` contains a "/" (e.g. "google-bert/bert-base-cased"), so the
# model lands in a nested directory such as "google-bert/bert-base-cased.model".
trainer.save_model(f'{model_name}.model')

# NOTE(review): no tokenizer was passed to the Trainer, so presumably only the
# model is written by save_model -- call tokenizer.save_pretrained(...) as well
# if the saved directory must be loadable on its own.
trainer.save_state()