In [1]:
from datasets import load_dataset, DatasetDict

In [2]:
import unicodedata

In [3]:
import html

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [5]:
ds = load_dataset("SetFit/bbc-news")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
ds.reset_format()

In [7]:
ds['train'].shuffle(seed=42)['text'][0]

'prince crowned  top music earner  prince earned more than any other pop star in 2004  beating artists such madonna and elton john in us magazine rolling stone s annual list.  the singer banked $56.5m (£30.4m) from concerts  album and publishing sales with his musicology tour and album. he kept madonna in second place  as she earned $54.9m (£29.5m) while embarking on her global re-invention tour. veterans simon and garfunkel were in 10th place  their comeback tour helping them earn $24.9m (£13.4m) last year.   prince returned to centre stage after a decade in the commercial wilderness   the magazine reported. the singer s 2004 tour took $90.3m (£48.5m) in ticket sales and he sold 1.9 million copies of his latest album musicology.  although she grossed more than prince last year  madonna remained in second place because of the  monumental  production costs of her tour. heavy metal band metallica s madly in anger with the world tour helped push their 2004 earnings up to $43.1m (£23.1m). 

In [8]:
def space_delitter(x):
    return {'text': [" ".join(html.unescape(i).split()).lower() for i in x['text']]}

In [9]:
ds = ds.map(space_delitter, batched=True)

In [10]:
ds['train']['text'][0]

'wales want rugby league training wales could follow england s lead by training with a rugby league club. england have already had a three-day session with leeds rhinos and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week while wales will play england in the opening six nations match on 5 february. we have had an approach from wales confirmed a saints spokesman. it s in the very early stages but it is something we are giving serious consideration to. st helens who are proud of their welsh connections are obvious partners for the welsh rugby union despite a spat in 2001 over the collapse of kieron cunningham s proposed £500 000 move to union side swansea. a similar cross-code deal that took iestyn harris from leeds to cardiff in 2001 did go through before the talented stand-off returned to the 13-man cod

In [11]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [12]:
import string

In [13]:
simbols = list(string.ascii_lowercase) + ['!', ',', '"', '.', '?', '-', '+', '=', '(', ')', '&', ' ']


In [14]:
itg = []
for i in ''.join(ds['train']['text']):
    if i not in simbols+itg:
        itg.append(i)

In [15]:
itg

['5',
 '2',
 '0',
 '1',
 '£',
 '3',
 '9',
 '6',
 '4',
 '$',
 '7',
 '8',
 '%',
 '[',
 ']',
 ':',
 'é',
 '/',
 ';',
 '<',
 '*',
 'ã',
 'ô',
 '>',
 'ó']

In [16]:
ds = ds.map(lambda x: {'text': unicodedata.normalize("NFC", x['text'])})

In [17]:
ds['train']['text'][0]

'wales want rugby league training wales could follow england s lead by training with a rugby league club. england have already had a three-day session with leeds rhinos and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week while wales will play england in the opening six nations match on 5 february. we have had an approach from wales confirmed a saints spokesman. it s in the very early stages but it is something we are giving serious consideration to. st helens who are proud of their welsh connections are obvious partners for the welsh rugby union despite a spat in 2001 over the collapse of kieron cunningham s proposed £500 000 move to union side swansea. a similar cross-code deal that took iestyn harris from leeds to cardiff in 2001 did go through before the talented stand-off returned to the 13-man cod

In [18]:
unicodedata.normalize("NFKD", 'é')

'é'

In [19]:
def tokenize_func(example):
    return {
        'tokenized_text':
        tokenizer(example['text'], truncation=True, max_length=512, padding=True)
    }


In [20]:
ds = ds.map(tokenize_func)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [21]:
tokens = ds['train']['tokenized_text'][0]
tokenizer.decode(tokens['input_ids'])

'[CLS] wales want rugby league training wales could follow england s lead by training with a rugby league club. england have already had a three - day session with leeds rhinos and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week while wales will play england in the opening six nations match on 5 february. we have had an approach from wales confirmed a saints spokesman. it s in the very early stages but it is something we are giving serious consideration to. st helens who are proud of their welsh connections are obvious partners for the welsh rugby union despite a spat in 2001 over the collapse of kieron cunningham s proposed £500 000 move to union side swansea. a similar cross - code deal that took iestyn harris from leeds to cardiff in 2001 did go through before the talented stand - off returned to th

In [22]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
try:
    ds['train']['label'].features.names
except AttributeError:
    print('Dataset labels has no encoded names')


Dataset labels has no encoded names


In [24]:
ds.set_format('pandas')

In [25]:
labels = ds['train']['label'].unique()

In [26]:
labels_encode = ds['train']['label_text'].unique()
labels_encode

array(['sport', 'business', 'entertainment', 'tech', 'politics'],
      dtype=object)

In [27]:
label_to_category = dict(zip(labels, labels_encode))

In [28]:
label_to_category

{np.int64(2): 'sport',
 np.int64(1): 'business',
 np.int64(3): 'entertainment',
 np.int64(0): 'tech',
 np.int64(4): 'politics'}

In [29]:
ds.reset_format()

In [30]:
def extract_tokens(example):
    example['input_ids'] = example['tokenized_text']['input_ids']
    example['attention_mask'] = example['tokenized_text']['attention_mask']
    return example

ds = ds.map(extract_tokens, remove_columns=['text','label_text', 'tokenized_text'])


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [31]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1225
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [32]:
args = TrainingArguments(
    output_dir ='./model',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    report_to=[]
)



In [33]:
from sklearn.metrics import accuracy_score
import numpy as np

In [34]:
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds)



    }






In [35]:

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='max_length',
    max_length=512
)

In [36]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds['train'],
    eval_dataset=ds['train'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator

)

  trainer = Trainer(


In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.209374,0.97551


KeyboardInterrupt: 

In [38]:
trainer.predict(ds['test'])

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.209374,0.97551


PredictionOutput(predictions=array([[-0.90396893, -1.1753968 , -0.6930947 ,  2.923195  , -1.2647469 ],
       [-0.8878729 , -0.72531116,  3.6257668 , -0.7184649 , -0.45259207],
       [-1.0117823 ,  3.508115  , -0.7791326 , -0.9882177 , -0.37977993],
       ...,
       [ 3.2771597 , -0.5951701 , -1.2391256 , -0.6775571 , -0.85896647],
       [-0.72113466,  3.5888658 , -1.0355138 , -0.9015393 , -0.8486399 ],
       [-0.5747604 , -0.49506333, -0.99390584, -1.1834903 ,  3.2129092 ]],
      dtype=float32), label_ids=array([3, 2, 1, 4, 2, 1, 2, 1, 4, 4, 2, 3, 3, 0, 3, 3, 3, 2, 1, 3, 3, 4,
       2, 1, 0, 1, 1, 1, 2, 3, 1, 2, 0, 3, 2, 2, 0, 0, 1, 3, 3, 4, 1, 4,
       0, 0, 2, 0, 2, 1, 2, 1, 2, 0, 1, 2, 1, 3, 3, 1, 2, 3, 3, 4, 2, 4,
       3, 2, 0, 3, 2, 4, 0, 2, 3, 0, 2, 1, 2, 4, 2, 3, 0, 3, 3, 0, 2, 0,
       0, 1, 2, 2, 4, 0, 1, 1, 2, 1, 3, 0, 1, 3, 0, 0, 4, 2, 4, 0, 0, 1,
       3, 2, 0, 0, 2, 1, 3, 0, 3, 0, 1, 3, 4, 4, 3, 1, 1, 0, 2, 2, 0, 0,
       2, 2, 0, 1, 1, 1, 4, 3, 1, 4, 4, 0, 1