In [26]:
#import libraries
!pip install transformers



In [27]:
#import libraries
import torch
import numpy as np
import random
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [28]:
#model hyperparameters
model_name = "bert-base-uncased"
max_length = 512
device = 'cpu'

#tokenizing the dataset
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a

In [29]:
#split the dataset
dataset = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
doc = dataset['data']
target = dataset['target']

(train_texts, valid_texts, train_labels, valid_labels), target_names = train_test_split(doc, target, test_size=0.2), dataset['target_names']

In [30]:
# encode the text
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

In [42]:
#predict the text output
#predict the text output
def get_prediction(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return target_names[probs.argmax()]

In [31]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)

In [32]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to(device)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19"
  },
  "initializer_range": 0.02,
  

In [33]:
#define metrics for training
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [34]:
epoch = 1

training_args = TrainingArguments(
    output_dir='./results',        num_train_epochs=epoch, per_device_train_batch_size=8, per_device_eval_batch_size=20,   
    warmup_steps=500, weight_decay=0.01, logging_dir='./logs', load_best_model_at_end=True, logging_steps=400, save_steps=400, evaluation_strategy="steps",  
)

using `logging_steps` to initialize `eval_steps` to 400
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [35]:
#train the model
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=valid_dataset, compute_metrics=compute_metrics )  
trainer.train()


***** Running training *****
  Num examples = 15076
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1885


Step,Training Loss,Validation Loss,Accuracy
400,2.424,1.402941,0.602918
800,1.2804,1.154187,0.660477


***** Running Evaluation *****
  Num examples = 3770
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3770
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin


Step,Training Loss,Validation Loss,Accuracy
400,2.424,1.402941,0.602918
800,1.2804,1.154187,0.660477
1200,1.0956,1.010735,0.694164
1600,0.9446,0.961439,0.701592


***** Running Evaluation *****
  Num examples = 3770
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3770
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-1600 (score: 0.9614388346672058).


TrainOutput(global_step=1885, training_loss=1.3509799623362897, metrics={'train_runtime': 3936.3011, 'train_samples_per_second': 3.83, 'train_steps_per_second': 0.479, 'total_flos': 3967303341686784.0, 'train_loss': 1.3509799623362897, 'epoch': 1.0})

In [40]:
# evaluate the model
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 3770
  Batch size = 20


{'epoch': 1.0,
 'eval_accuracy': 0.7015915119363395,
 'eval_loss': 0.9614388346672058,
 'eval_runtime': 259.5911,
 'eval_samples_per_second': 14.523,
 'eval_steps_per_second': 0.728}

In [41]:
# save trained model
model.save_pretrained('saved_trained_model')
tokenizer.save_pretrained('saved_trained_model')

Configuration saved in saved_trained_model/config.json
Model weights saved in saved_trained_model/pytorch_model.bin
tokenizer config file saved in saved_trained_model/tokenizer_config.json
Special tokens file saved in saved_trained_model/special_tokens_map.json


('saved_trained_model/tokenizer_config.json',
 'saved_trained_model/special_tokens_map.json',
 'saved_trained_model/vocab.txt',
 'saved_trained_model/added_tokens.json',
 'saved_trained_model/tokenizer.json')

In [44]:
text = """
I got back from my trip to discover that my email spool file got blown
away.  I am missing all the playoff pool entries sent between April 5
and April 17.  It looks like about 200 entries got lost.  *Sigh*.
Therefore, I would like to ask each person that sent me a team to resend
it ASAP.  I am relying on your honesty to not make changes after the
deadline today.

Thanks in advance, and I apologize for the problem.
"""
print(get_prediction(text))

rec.sport.hockey
