In [1]:
from datasets import load_dataset

dataset = load_dataset("magnea/fake-news-formated")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dataset_id', 'title', 'content', 'classification'],
        num_rows: 149049
    })
    test: Dataset({
        features: ['id', 'dataset_id', 'title', 'content', 'classification'],
        num_rows: 16556
    })
})

In [2]:
dataset= dataset.filter(lambda d: d['title'] != None and d['content'] != None)
dataset = dataset.rename_column('classification', 'label')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dataset_id', 'title', 'content', 'label'],
        num_rows: 114522
    })
    test: Dataset({
        features: ['id', 'dataset_id', 'title', 'content', 'label'],
        num_rows: 12728
    })
})

In [None]:
id2label={0: 'fake', 1: 'real'}
label2id={'fake': 0, 'real': 1}

def combine_fields(d):
	return {'combined': d['title'] + '\n\n' + d['content']}

def transform_label(d):
    d['label'] = label2id[d['label']]
    return d

splits = dataset.keys()
for split in splits:
    dataset[split] = dataset[split].map(combine_fields)
    dataset[split] = dataset[split].map(transform_label)

dataset

Map:   0%|          | 0/114522 [00:00<?, ? examples/s]

Map:   0%|          | 0/12728 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dataset_id', 'title', 'content', 'label', 'combined'],
        num_rows: 114522
    })
    test: Dataset({
        features: ['id', 'dataset_id', 'title', 'content', 'label', 'combined'],
        num_rows: 12728
    })
})

In [5]:
dataset['train'][4711]

{'id': 'e52659cd236e805',
 'dataset_id': 1.0,
 'title': "South Africa's Dlamini-Zuma, ANC leadership contender, to become MP",
 'content': 'JOHANNESBURG (Reuters) - South African veteran politician and anti-apartheid activist Nkosazana Dlamini-Zuma, a leading contender to take over as head of the ruling ANC in December, will be sworn in as a member of parliament next week, a senior party official said on Friday. Dlamini-Zuma, the ex-wife of current ANC leader and South African President Jacob Zuma, does not hold a top position and could use a seat in parliament to raise her profile ahead of the party s December leadership conference.   She is going to be sworn in,  ANC Secretary General Gwede Mantashe was quoted as saying by the local EWN news network. The former health and foreign affairs minister s main opponent in the ANC leadership race is expected to be Deputy President Cyril Ramaphosa, a trade unionist-turned-business tycoon whom many investors would prefer to see running a count

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(d):
    return tokenizer(d['combined'], padding='max_length', truncation=True)


tokenized = {}
for split in splits:
    tokenized[split] = dataset[split].map(preprocess_data, batched=True)

tokenized

Map:   0%|          | 0/114522 [00:00<?, ? examples/s]

Map:   0%|          | 0/12728 [00:00<?, ? examples/s]

{'train': Dataset({
     features: ['id', 'dataset_id', 'title', 'content', 'label', 'combined', 'input_ids', 'attention_mask'],
     num_rows: 114522
 }),
 'test': Dataset({
     features: ['id', 'dataset_id', 'title', 'content', 'label', 'combined', 'input_ids', 'attention_mask'],
     num_rows: 12728
 })}

In [7]:
tokenized['test']

Dataset({
    features: ['id', 'dataset_id', 'title', 'content', 'label', 'combined', 'input_ids', 'attention_mask'],
    num_rows: 12728
})

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'gpt2',
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)
for param in model.parameters():
    param.requires_grad = False

model

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [None]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./benchmark",
        learning_rate=2e-3,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True
    ),
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

trainer.evaluate()

  trainer = Trainer(


{'eval_loss': 1.5774004459381104,
 'eval_model_preparation_time': 0.0007,
 'eval_accuracy': 0.494343180389692,
 'eval_runtime': 175.1342,
 'eval_samples_per_second': 72.676,
 'eval_steps_per_second': 1.136}