In [1]:
# We won't need TensorFlow here
#!pip uninstall -y tensorflow

!pip install accelerate -U
# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1
!pip install datasets
!pip install evaluate


Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-cvd6tpth
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-cvd6tpth
  Resolved https://github.com/huggingface/transformers to commit f26099e7b5cf579f99a42bab6ddd371bf2c8d548
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
tokenizers                       0.13.3
transformers                     4.33.0.dev0


In [23]:
from datasets import Dataset, Value, ClassLabel, Features
import pandas as pd

df = pd.read_csv('bbc-news-data.csv',index_col = False,sep="\t")
df = df.rename(columns={'category': 'labels'})

labels = df["labels"].unique()
id2label = {k: v for k, v in enumerate(labels)}
label2id = {v: k for k, v in id2label.items()}

features = Features({"content": Value("string"), "labels": ClassLabel(num_classes=len(labels), names=list(labels))})

dataset = Dataset.from_pandas(df[["labels", "content"]], features=features)
dataset = dataset.train_test_split(test_size=0.1)
dataset["test"]

Dataset({
    features: ['content', 'labels'],
    num_rows: 223
})

In [25]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
# tokenizer.padding = True

def tokenize_function(article):
    tokens = tokenizer(article["content"], padding=True, truncation=True)
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

In [4]:
!mkdir Classification

mkdir: cannot create directory ‘Classification’: File exists


In [5]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [6]:
import evaluate

accuracy = evaluate.load("accuracy")

In [7]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased", num_labels=len(labels), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args = TrainingArguments(
    output_dir="Classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.087858,0.973094
2,No log,0.052023,0.991031


TrainOutput(global_step=252, training_loss=0.3046756623283265, metrics={'train_runtime': 209.5332, 'train_samples_per_second': 19.109, 'train_steps_per_second': 1.203, 'total_flos': 530427840983040.0, 'train_loss': 0.3046756623283265, 'epoch': 2.0})

In [27]:
trainer.save_model("./Classification")

In [11]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
