In [None]:
%%sh
pip -q install torch transformers datasets widgetsnbextension ipywidgets huggingface_hub --upgrade

In [None]:
import datasets
import transformers

# Report the installed library versions (transformers first, then datasets)
# so that a run can be traced back to the exact releases it used.
for _library in (transformers, datasets):
    print(_library.__version__)

In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [None]:
# ---- Model ----
# Checkpoint that is fine-tuned below.
base_model_id = "distilbert-base-uncased"
# NOTE(review): the original comment here said "Dataset has 5 classes", which
# contradicts the value 1. With Hugging Face sequence-classification heads a
# single label presumably means a one-output (regression-style) head — confirm
# which of the two is intended and align value and comment.
num_labels = 1

# ---- Optimisation ----
epochs, learning_rate = 3, 5e-5
train_batch_size, eval_batch_size = 24, 32

# ---- Checkpointing and logging cadence ----
save_strategy, save_steps = "no", 500
logging_steps = 100

# ---- Mixed precision ----
# Mixed-precision training flag; the original author reports it cut training
# time from 23 minutes to 9 minutes on a V100.
fp16 = True

# ---- Filesystem locations ----
model_dir = "./model"
output_data_dir = "./output"

In [None]:
# Authenticate with the Hugging Face Hub so that private datasets/models can be
# accessed later in the notebook.
# `notebook_login()` renders an in-notebook prompt and caches the credential,
# which avoids depending on interactive stdin inside a `!` shell escape.
# `huggingface_hub` is already installed by the first cell of this notebook, so
# no second pip install is needed here.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Option 1: load the dataset from the Hugging Face Hub.
# `token=True` reuses the credential cached by the login step; it replaces the
# deprecated `use_auth_token` argument, which recent `datasets` releases reject.
dataset = load_dataset("DepositorOP/masterstack", token=True)

# Option 2: load the dataset from local storage instead
# (`load_from_disk` is not imported at the top of the notebook, hence the import).
# from datasets import load_from_disk
# dataset = load_from_disk("./data")

# Show the available splits and their columns/row counts.
print(dataset)

In [None]:
# Peek at the first record of the "train" split to see which fields an example carries.
dataset["train"][0]

In [None]:
# Bind the two splits used below: "train" for fitting and "test" for validation.
train_dataset, valid_dataset = dataset["train"], dataset["test"]

In [None]:
# Load the tokenizer and the sequence-classification model from the same base
# checkpoint; the size of the classification head comes from `num_labels`.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_id,
    num_labels=num_labels,
)

In [None]:
def tokenize(batch):
    """Tokenize the "text" field of a batch with the notebook-level `tokenizer`.

    Args:
        batch: Mapping holding a "text" entry (a batch of examples as passed
            by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer's output for those texts, truncated and padded with the
        "max_length" padding strategy.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


def _tokenize_whole_split(split):
    """Apply `tokenize` to `split` in a single batch spanning the entire split."""
    return split.map(tokenize, batched=True, batch_size=len(split))


train_dataset = _tokenize_whole_split(train_dataset)
valid_dataset = _tokenize_whole_split(valid_dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# The raw "text" column is dropped from both splits now that they are tokenized.
train_dataset, valid_dataset = [
    split.remove_columns(["text"]) for split in (train_dataset, valid_dataset)
]

In [None]:
# Hub repository that would receive the model if pushing is enabled below.
hub_model_id = "DepositorOP/NewModel"

# Training configuration assembled from the values in the configuration cell.
training_args = TrainingArguments(
    hub_model_id=hub_model_id,  # Target Hub repo (only relevant when push_to_hub is enabled)
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument (renamed in transformers 4.41); the notebook upgrades to the
    # latest transformers, where the old spelling is no longer accepted.
    eval_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    fp16=fp16,  # honour the mixed-precision flag from the configuration cell
    # push_to_hub=True,  # uncomment to upload the model after training
    # Never paste an access token into this cell: pushing uses the credential
    # cached by the Hub login step earlier in the notebook.
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation predictions into a metrics dictionary.

    Args:
        eval_pred: Pair `(predictions, label_ids)` as handed over by `Trainer`
            during evaluation. `predictions` may be a tuple, in which case its
            first element is taken as the logits.

    Returns:
        `{"mse": ...}` when the model emits a single output per example
        (e.g. `num_labels == 1`), otherwise `{"accuracy": ...}` computed from
        the arg-max over the last axis of the logits.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    single_output = logits.ndim == 1 or logits.shape[-1] == 1
    if single_output:
        # One value per example: compare it directly with the target.
        errors = logits.reshape(-1) - labels.reshape(-1)
        return {"mse": float(np.mean(errors ** 2))}

    predicted_classes = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_classes == labels).mean())}


# Wire model, arguments, tokenizer, metrics and both splits into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

Using cuda_amp half precision backend


In [None]:
# Launch fine-tuning with the model, arguments and datasets handed to `trainer`.
trainer.train()

In [None]:
# Score the model on `valid_dataset` (taken from the dataset's "test" split).
trainer.evaluate(eval_dataset=valid_dataset)

In [None]:
# Save the trained model; the path resolves to
# /content/drive/MyDrive/ForMasterStack/_local.
# NOTE(review): presumably a mounted Google Drive folder — confirm it is mounted
# before this cell runs.
local_model_path = '/content/drive/MyDrive/ForMasterStack/' + "_local"
trainer.save_model(local_model_path)