In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no later cell visible here reads from or writes to the mounted
# path - confirm the mount is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 0. Set up

In [None]:
! pip install transformers datasets evaluate scikit-learn
! pip install accelerate -U

In [3]:
import torch

# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

## 1. Load dataset
I will use the HC3 (Human ChatGPT Comparison Corpus) dataset from the 🤗 Datasets library.

In [4]:
from datasets import load_dataset

# Fetch the complete HC3 corpus ("all.jsonl"); it is loaded as a single "train" split.
ds = load_dataset("hello-simpleai/hc3", data_files=["all.jsonl"])
train_size = len(ds["train"])
print(f"Train dataset size: {train_size}")

Train dataset size: 24322


There are five fields in this dataset:
  - question
  - human_answers (several)
  - chatgpt_answers (several)
  - index
  - source

**Note:** The dataset ships with only a train split, so a test set has to be separated from it manually.

In [5]:
# Show the available splits, their columns and their row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['question', 'human_answers', 'chatgpt_answers', 'index', 'source'],
        num_rows: 24322
    })
})

In [6]:
# Peek at the first human answer of the row at index 3 (raw, untokenized text).
ds['train'][3]['human_answers'][0]

"You ca n't just go around assassinating the leaders of countries you do nt like ! The international condemnation would be brutal . Even though noone likes Kim Jong - Un , and everyone thinks North Korea is pretty shitty to its citizens , if say the US were to send agents over ( and do n't think they are n't capable of it ) and they got caught .... every country , every world leader would be a potential target . Who 's next ... Castro ? Angela Merkel ? Anyways , rumour has it that he 's ultra paranoid about exactly that and travels around in tanks and armoured trains that make Limo 1 look like a tonka toy ."

In [7]:
# Count how many human / ChatGPT answers each question carries.
# The distinct counts are printed so it is visible whether any question has
# no answer at all for one of the two classes (such rows cannot be tokenized
# by taking "the first answer").
# Reading each column once avoids decoding every full row twice through
# integer indexing.
train_split = ds['train']
num_human_answers = [len(answers) for answers in train_split['human_answers']]
num_chatgpt_answers = [len(answers) for answers in train_split['chatgpt_answers']]

print(set(num_human_answers))
print(set(num_chatgpt_answers))

{1, 3}
{0, 1, 2, 3}


## 2. Preprocess
The next step is to load a tokenizer to preprocess the text field. A tokenizer converts text into a sequence of tokens and creates a numerical representation of them.

In [8]:
# Smoke-test the tokenizer: print (token id, token string) pairs for a toy
# sentence and for one real human answer from the corpus.
from transformers import AutoTokenizer

roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
text = "Hello everyone!"
# Inverse vocabulary: token id -> token string.
vocab = {token_id: token for token, token_id in roberta_tokenizer.vocab.items()}

for sample_text in (text, ds['train'][3]['human_answers'][0]):
    encoded_ids = roberta_tokenizer(sample_text)['input_ids']
    print([(token_id, vocab[token_id]) for token_id in encoded_ids])

[(0, '<s>'), (31414, 'Hello'), (961, 'Ġeveryone'), (328, '!'), (2, '</s>')]
[(0, '<s>'), (1185, 'You'), (6056, 'Ġca'), (295, 'Ġn'), (75, "'t"), (95, 'Ġjust'), (213, 'Ġgo'), (198, 'Ġaround'), (39257, 'Ġassass'), (15647, 'inating'), (5, 'Ġthe'), (917, 'Ġleaders'), (9, 'Ġof'), (749, 'Ġcountries'), (47, 'Ġyou'), (109, 'Ġdo'), (295, 'Ġn'), (90, 't'), (101, 'Ġlike'), (27785, 'Ġ!'), (20, 'ĠThe'), (758, 'Ġinternational'), (19973, 'Ġcondemnation'), (74, 'Ġwould'), (28, 'Ġbe'), (8513, 'Ġbrutal'), (479, 'Ġ.'), (1648, 'ĠEven'), (600, 'Ġthough'), (117, 'Ġno'), (1264, 'one'), (3829, 'Ġlikes'), (1636, 'ĠKim'), (6465, 'ĠJong'), (111, 'Ġ-'), (1890, 'ĠUn'), (2156, 'Ġ,'), (8, 'Ġand'), (961, 'Ġeveryone'), (4265, 'Ġthinks'), (369, 'ĠNorth'), (1101, 'ĠKorea'), (16, 'Ġis'), (1256, 'Ġpretty'), (43816, 'Ġshitty'), (7, 'Ġto'), (63, 'Ġits'), (2286, 'Ġcitizens'), (2156, 'Ġ,'), (114, 'Ġif'), (224, 'Ġsay'), (5, 'Ġthe'), (382, 'ĠUS'), (58, 'Ġwere'), (7, 'Ġto'), (2142, 'Ġsend'), (3525, 'Ġagents'), (81, 'Ġover'), (36,

In [9]:
def preprocess_function(example, answer_type, label=0):
    """Tokenize the first answer of one dataset row and attach a class label.

    Args:
        example: One dataset row (a mapping). ``example[answer_type]`` must be
            a non-empty sequence of answer strings.
        answer_type: Name of the column holding the answers, e.g.
            ``'human_answers'`` or ``'chatgpt_answers'``.
        label: Class id stored under the ``'label'`` key. Defaults to 0, which
            keeps the previous behaviour; pass 1 for ChatGPT answers.

    Returns:
        The tokenizer output for the first answer (truncated to the
        tokenizer's maximum length) with an extra ``'label'`` entry.

    Raises:
        IndexError: If the row has no answer in ``answer_type``.
    """
    encoded = roberta_tokenizer(example[answer_type][0], truncation=True)
    encoded['label'] = label
    return encoded

In [10]:
# Tokenize the first human answer of every row, one example at a time.
tokenized_human_answers = ds['train'].map(
    preprocess_function,
    fn_kwargs={'answer_type': 'human_answers'},
    batched=False,
)


Map:   0%|          | 0/24322 [00:00<?, ? examples/s]

In [11]:
# Drop the ChatGPT column and give the human column the neutral name "answer".
# NOTE: this cell rebinds its own input, so it can only be run once per kernel.
tokenized_human_answers = (
    tokenized_human_answers
    .remove_columns(['chatgpt_answers'])
    .rename_column('human_answers', 'answer')
)


In [12]:
import pandas as pd

# Materialise the tokenized human rows as a pandas DataFrame so they can be
# concatenated with the ChatGPT rows later on.
tokenized_human_answers_df = pd.DataFrame(tokenized_human_answers)

In [13]:
# Preview the first rows of the human-answer frame.
tokenized_human_answers_df.head()

Unnamed: 0,question,answer,index,source,input_ids,attention_mask,label
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",,reddit_eli5,"[0, 34480, 89, 32, 171, 6363, 9, 22, 2700, 447...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,,reddit_eli5,"[0, 29, 3967, 16, 205, 13, 45, 8180, 11, 512, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,,reddit_eli5,"[0, 133, 169, 24, 1364, 16, 14, 793, 1012, 449...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,,reddit_eli5,"[0, 1185, 6056, 295, 75, 95, 213, 198, 39257, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,,reddit_eli5,"[0, 38576, 154, 7, 3549, 5, 15328, 66, 9, 1841...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0


In [14]:
# Build the ChatGPT half of the dataset (label 1) as one Python list per column.
question_list = []
chatgpt_answer_list = []
index_list = []
source_list = []
input_ids_list = []
attention_mask_list = []
label_list = []

for sample in ds['train']:
    chatgpt_answers = sample['chatgpt_answers']

    # Some questions have no ChatGPT answer at all; skip those rows.
    if not chatgpt_answers:
        continue

    one_chatgpt_answer = chatgpt_answers[0]
    # Tokenize once; input_ids and attention_mask both come from this call.
    encoded = roberta_tokenizer(one_chatgpt_answer, truncation=True)

    question_list.append(sample['question'])
    # Wrapped in a list so the column holds lists, like the human "answer" column.
    chatgpt_answer_list.append([one_chatgpt_answer])
    index_list.append(sample['index'])
    source_list.append(sample['source'])
    input_ids_list.append(encoded['input_ids'])
    attention_mask_list.append(encoded['attention_mask'])
    label_list.append(1)

In [15]:

# Assemble the per-column lists into a frame with the same columns as the
# human-answer frame.
chatgpt_columns = {
    'question': question_list,
    'answer': chatgpt_answer_list,
    'index': index_list,
    'source': source_list,
    'input_ids': input_ids_list,
    'attention_mask': attention_mask_list,
    'label': label_list,
}
tokenized_chatgpt_answers_df = pd.DataFrame(chatgpt_columns)

In [16]:
# Preview the first rows of the ChatGPT-answer frame.
tokenized_chatgpt_answers_df.head()

Unnamed: 0,question,answer,index,source,input_ids,attention_mask,label
0,"Why is every book I hear about a "" NY Times # ...",[There are many different best seller lists th...,,reddit_eli5,"[0, 970, 32, 171, 430, 275, 15689, 8204, 14, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"If salt is so bad for cars , why do we use it ...",[Salt is used on roads to help melt ice and sn...,,reddit_eli5,"[0, 44095, 16, 341, 15, 3197, 7, 244, 20147, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,Why do we still have SD TV channels when HD lo...,[There are a few reasons why we still have SD ...,,reddit_eli5,"[0, 970, 32, 10, 367, 2188, 596, 52, 202, 33, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,Why has nobody assassinated Kim Jong - un He i...,[It is generally not acceptable or ethical to ...,,reddit_eli5,"[0, 243, 16, 3489, 45, 9796, 50, 13557, 7, 715...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,How was airplane technology able to advance so...,[After the Wright Brothers made the first powe...,,reddit_eli5,"[0, 4993, 5, 5825, 10144, 156, 5, 78, 8852, 25...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [17]:
# Stack the human rows (label 0) on top of the ChatGPT rows (label 1) and
# renumber the index from 0.
frames_to_stack = [tokenized_human_answers_df, tokenized_chatgpt_answers_df]
combined_df = pd.concat(frames_to_stack, ignore_index=True)


In [18]:
# Preview the combined frame; the human rows come first because they were
# concatenated first.
combined_df.head()

Unnamed: 0,question,answer,index,source,input_ids,attention_mask,label
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",,reddit_eli5,"[0, 34480, 89, 32, 171, 6363, 9, 22, 2700, 447...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,,reddit_eli5,"[0, 29, 3967, 16, 205, 13, 45, 8180, 11, 512, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,,reddit_eli5,"[0, 133, 169, 24, 1364, 16, 14, 793, 1012, 449...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,,reddit_eli5,"[0, 1185, 6056, 295, 75, 95, 213, 198, 39257, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,,reddit_eli5,"[0, 38576, 154, 7, 3549, 5, 15328, 66, 9, 1841...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0


In [19]:
# Sanity check: the combined row count should equal the sum of both parts.
human_shape = tokenized_human_answers_df.shape
chatgpt_shape = tokenized_chatgpt_answers_df.shape
print(human_shape, chatgpt_shape)
print(combined_df.shape)
combined_df.head()

(24322, 7) (23867, 7)
(48189, 7)


Unnamed: 0,question,answer,index,source,input_ids,attention_mask,label
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",,reddit_eli5,"[0, 34480, 89, 32, 171, 6363, 9, 22, 2700, 447...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,,reddit_eli5,"[0, 29, 3967, 16, 205, 13, 45, 8180, 11, 512, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,,reddit_eli5,"[0, 133, 169, 24, 1364, 16, 14, 793, 1012, 449...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,,reddit_eli5,"[0, 1185, 6056, 295, 75, 95, 213, 198, 39257, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,,reddit_eli5,"[0, 38576, 154, 7, 3549, 5, 15328, 66, 9, 1841...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0


In [20]:
from datasets import Dataset

# Convert the combined pandas frame back into a Hugging Face Dataset.
Tokenized_dataset = Dataset.from_pandas(combined_df)


In [21]:
# Show the columns and row count of the combined dataset.
Tokenized_dataset

Dataset({
    features: ['question', 'answer', 'index', 'source', 'input_ids', 'attention_mask', 'label'],
    num_rows: 48189
})

In [22]:
# Drop the metadata columns and rename "answer" to "text".
# NOTE: this cell rebinds its own input, so it can only be run once per kernel.
Tokenized_dataset = (
    Tokenized_dataset
    .remove_columns(['question', 'index', 'source'])
    .rename_column('answer', 'text')
)

In [23]:
# Hold out 20% of the rows as a test split. The fixed seed makes the shuffle,
# and therefore every metric reported below, reproducible across reruns.
# NOTE(review): rows are split individually, so the human and ChatGPT answers
# to the same question can land on opposite sides of the split - confirm this
# is acceptable for the evaluation.
Tokenized_dataset = Tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [24]:
# Show the resulting train/test splits with their columns and row counts.
Tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'label'],
        num_rows: 38551
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'label'],
        num_rows: 9638
    })
})

In [25]:
from transformers import DataCollatorWithPadding

# Sequences were tokenized with truncation but without padding, so padding is
# left to this collator, which pads the examples when a batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=roberta_tokenizer)

In [26]:
import evaluate

# Accuracy metric from the `evaluate` library: the proportion of correct
# predictions among the total number of cases processed.
accuracy = evaluate.load("accuracy")

In [27]:
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw model outputs into the accuracy metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids
        compared against ``labels``.
    """
    class_scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [28]:
# Class ids used throughout the notebook, plus the reverse lookup derived
# from the same table so the two can never disagree.
id2label = {0: "human", 1: "chatgpt"}
label2id = {name: class_id for class_id, name in id2label.items()}

In [29]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained RoBERTa weights plus a sequence-classification head with two
# outputs (human vs. chatgpt). The head's weights are newly initialised, so
# the model has to be fine-tuned before it is used for predictions.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
import inspect

# The keyword that sets the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# Dependencies are installed unpinned, so use whichever name the installed
# version actually accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Fine-tuning hyperparameters. Evaluation and checkpointing both happen once
# per epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=14,
    per_device_eval_batch_size=14,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},
)

In [39]:
# Wire model, data, padding collator and metric together, then fine-tune.
# NOTE(review): the "test" split doubles as the per-epoch evaluation set, and
# the training arguments reload the best checkpoint at the end - confirm the
# reported accuracy is not meant to be a fully held-out estimate.
train_split = Tokenized_dataset["train"]
eval_split = Tokenized_dataset["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=roberta_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0136,0.058737,0.991284


KeyboardInterrupt: ignored