In [58]:
from datasets import load_dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    TrainingArguments,
    Trainer,
    pipeline
)
from functools import partial
import numpy as np
import evaluate
import wandb

## Load dataset

In [6]:
dataset = load_dataset("GonzaloA/fake_news")

Downloading:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading and preparing dataset csv/default (download: 61.82 MiB, generated: 100.38 MiB, post-processed: Unknown size, total: 162.20 MiB) to /root/.cache/huggingface/datasets/parquet/GonzaloA--fake_news-1fe2b42e1fa111c8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/GonzaloA--fake_news-1fe2b42e1fa111c8/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
dataset["train"][100]

{'Unnamed: 0': 100,
 'title': 'Trump on Twitter (Feb 7) - Putin, Iran, Betsy Devos',
 'text': 'The following statements\xa0were posted to the verified Twitter accounts of U.S. President Donald Trump, @realDonaldTrump and @POTUS.  The opinions expressed are his own.\xa0Reuters has not edited the statements or confirmed their accuracy.  @realDonaldTrump : -I don’t know Putin, have no deals in Russia, and the haters are going crazy - yet Obama can make a deal with Iran, #1 in terror, no problem! [711 EST] -An honor having the National Sheriffs’ Assoc. join me at the @WhiteHouse. Incredible men & women who protect & serve 24/7/365. THANK YOU!! [1125 EST] -It is a disgrace that my full Cabinet is still not in place, the longest such delay in the history of our country. Obstruction by Democrats! [2004 EST] -Senate Dems protest to keep the failed status quo. Betsy DeVos is a reformer, and she is going to be a great Education Sec. for our kids! [814 EST] -An honor having the @NationalSheriff A

## Load model and tokenizer

In [8]:
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
def tokenize_function(tokenizer, examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [10]:
bert_tokenize_function = partial(tokenize_function, bert_tokenizer)

In [17]:
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(10000))

In [19]:
tokenized_datasets = small_train_dataset.map(bert_tokenize_function, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

## Training

In [20]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [21]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [30]:
train_dataset = tokenized_datasets.shuffle(seed=42).select(range(8000))
eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(2000))

In [31]:
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [32]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Accuracy
1,0.074,0.027787,0.9865
2,0.0326,0.01092,0.995
3,0.0101,0.002739,0.9995




TrainOutput(global_step=1500, training_loss=0.038901503562927243, metrics={'train_runtime': 1572.7041, 'train_samples_per_second': 15.26, 'train_steps_per_second': 0.954, 'total_flos': 6314665328640000.0, 'train_loss': 0.038901503562927243, 'epoch': 3.0})

In [42]:
PATH = "working/checkpoints"

In [49]:
trainer.save_model(PATH)
bert_tokenizer.save_pretrained(PATH)

('/kaggle/working/checkpoints/tokenizer_config.json',
 '/kaggle/working/checkpoints/special_tokens_map.json',
 '/kaggle/working/checkpoints/vocab.txt',
 '/kaggle/working/checkpoints/added_tokens.json')

## Usage

In [52]:
tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True)
model = BertForSequenceClassification.from_pretrained(PATH, local_files_only=True)


In [53]:
text = """
Liverpool struck twice late on to beat Arsenal at Emirates Stadium and reach the FA Cup fourth round.

Arsenal paid the price for missing a host of opportunities and were punished as Liverpool grew increasingly dangerous, the deadlock broken when Trent Alexander-Arnold's free-kick glanced in off Jakub Kiwior's head with seven minutes left.

Liverpool's triumph was completed in the closing seconds as a lethal break ended with Luis Diaz firing an emphatic finish high past Arsenal goalkeeper Aaron Ramsdale.

Mikel Arteta's side dominated the first half, with Martin Odegaard hitting the bar while Reiss Nelson and Kai Havertz also had chances to give Arsenal reward for their pressure.

Liverpool, despite missing captain Virgil van Dijk through illness and with Mohamed Salah at the Africa Cup of Nations, held firm and were always a threat. Alexander-Arnold hit the bar in the first half and as they grew into the game, Ramsdale saved well from Diaz and Diogo Jota headed against the woodwork.
"""

In [59]:
text_classification_pipeline = pipeline(
    "text-classification",
    model=PATH,
    tokenizer=PATH,
    return_all_scores=True,
    device=0,
)



In [60]:
 text_classification_pipeline(text)

[[{'label': 'LABEL_0', 'score': 0.9968350529670715},
  {'label': 'LABEL_1', 'score': 0.0031650131568312645}]]