In [1]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
# Load the local JSON-lines file HC3/all.jsonl through the `datasets` "json" loader.
raw_data = load_dataset("json", data_files="HC3/all.jsonl")

Found cached dataset json (C:/Users/alpha/.cache/huggingface/datasets/json/default-fc01a04ef66caffc/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Size of the loaded object. raw_data is indexed by split name elsewhere in this
# notebook (raw_data["train"]), so this is presumably the number of splits rather
# than the number of rows — TODO confirm.
len(raw_data)

In [32]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    Truncation is requested; no padding argument is passed here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [20]:
def format_dataset(raw_data):
    """Flatten the "train" split of `raw_data` into one labelled row per answer.

    Every entry of an example's "human_answers" becomes a row with label 0 and
    every entry of its "chatgpt_answers" a row with label 1. Rows follow the
    example order, with an example's human answers ahead of its ChatGPT answers.

    Returns the rows wrapped with `Dataset.from_list`.
    """
    # (source field, label) pairs, in the order rows are emitted per example.
    labelled_fields = (("human_answers", 0), ("chatgpt_answers", 1))
    rows = []
    for record in raw_data["train"]:
        for field, label in labelled_fields:
            rows.extend({"label": label, "text": text} for text in record[field])
    return Dataset.from_list(rows)

In [21]:
# Build the flat, labelled answer dataset (label 0 = human, 1 = ChatGPT) from the raw data.
answers = format_dataset(raw_data)

In [33]:
# Run preprocess_function over the dataset; batched=True hands it a batch of rows per call.
tokenized_answers = answers.map(preprocess_function, batched=True)

                                                                   

In [35]:
# Collator built around the same tokenizer and later handed to the Trainer as data_collator.
# preprocess_function requests no padding, so padding is presumably applied per batch here.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [37]:
# roberta-base checkpoint loaded for sequence classification with two output labels
# (format_dataset uses 0 for human answers and 1 for ChatGPT answers).
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Downloading pytorch_model.bin: 100%|██████████| 501M/501M [00:59<00:00, 8.36MB/s] 
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoin

In [47]:
# Hold out a test split. A fixed seed makes the shuffle, and therefore the
# train/test membership, identical on every run; the split proportions are
# left at the library default.
answers_split = tokenized_answers.train_test_split(seed=42)

In [50]:
# Hyperparameters for a single-epoch fine-tuning run; outputs are written under 'results'.
training_args = TrainingArguments(
    output_dir='results',
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

# Wire the model, both splits, the tokenizer and the collator into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=answers_split['train'],
    eval_dataset=answers_split['test'],
)

In [51]:
# Start fine-tuning with the Trainer configured above.
# NOTE(review): the saved output of this cell stops at step 0 of 2003 with a
# KeyboardInterrupt, so no completed training run is recorded in this notebook.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 64086
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2003
  Number of trainable parameters = 124647170
  0%|          | 0/2003 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import json
import datasets
import pandas as pd

# Define the model name and number of labels
MODEL_NAME = "roberta-base"
NUM_LABELS = 2  # two classes: expand_reviews labels human text 0 and generated text 1

# Load the tokenizer and model
# NOTE: this rebinds the `tokenizer` and `model` names already created in earlier cells.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Import data
# Pass the encoding explicitly: without it open() decodes with the platform's
# locale default (e.g. cp1252 on Windows), which can garble or reject
# non-ASCII review text stored as UTF-8 JSON.
with open('amazon_v0.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Expand reviews into individual examples
def expand_reviews(raw_data, human='human', generated='chatgpt', from_pd=False):
  """Flatten rows holding lists of reviews into one labelled example per review.

  Args:
    raw_data: iterable of rows; each row is indexable by the `human` and
      `generated` keys, and each of those entries is an iterable of reviews.
    human: key under which a row stores its human-written reviews.
    generated: key under which a row stores its generated reviews.
    from_pd: when true, every item of `raw_data` is a 2-item pair and only
      its second element is used as the row.

  Returns:
    A list of {'text': ..., 'label': ...} dicts. Human reviews get label 0,
    generated reviews label 1; within each row the human reviews come first.
  """
  # (row key, label) pairs, in the order examples are emitted for a row.
  labelled_columns = ((human, 0), (generated, 1))
  examples = []
  for entry in raw_data:
    if from_pd:
      _, entry = entry
    for column, label in labelled_columns:
      examples.extend({'text': review, 'label': label} for review in entry[column])
  return examples

# Creates Trainer-ready dataset from data
def create_dataset(data):
  """Wrap `data` in a pandas DataFrame and convert that frame to a `datasets.Dataset`."""
  frame = pd.DataFrame(data=data)
  return datasets.Dataset.from_pandas(frame)

# Expand the loaded rows into labelled examples and wrap them as a dataset.
dataset = create_dataset(expand_reviews(data))
# TODO remove later, train on small sample to test code
# NOTE(review): this keeps rows 0-9 in their existing order. expand_reviews emits a
# row's human reviews before its generated ones, so the sample may be dominated by
# a single label — confirm this is acceptable for the smoke test.
dataset = dataset.select(range(10))

# Tokenize the dataset
def tokenize_function(example):
    """Tokenize the 'text' field of `example` with the notebook-level tokenizer.

    Requests truncation and padding='max_length'.
    """
    texts = example['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Tokenize every row; batched=True hands tokenize_function a batch of rows per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Select the input and target columns
# (output format is set to torch, restricted to the three columns listed)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Split the dataset into training and test sets
# train_test_split returns a dict-like object keyed by 'train' and 'test'.
# Tuple-unpacking it iterates over the keys and would bind the strings
# 'train' and 'test', so the splits are looked up by key instead.
split_datasets = tokenized_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

In [None]:
# NOTE(review): `.keys()` is a mapping method. If train_dataset is a single dataset
# split (or a plain string), this call raises AttributeError; `column_names` is
# probably what was intended — confirm.
print(train_dataset.keys())