## 0. Set up

In [None]:
! pip install transformers datasets evaluate scikit-learn
! pip install accelerate -U

In [39]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [71]:
import pandas as pd
from transformers import AutoTokenizer
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import numpy as np

import re
import string

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from tqdm.auto import tqdm
from transformers import get_scheduler
from torch.optim import AdamW
import math

## 1. Load dataset
I will use the HC3 (Human ChatGPT Comparison Corpus) dataset from the 🤗 Datasets library.

In [41]:
ds = load_dataset("hello-simpleai/hc3", data_files=['all.jsonl' ])
print(f"Train dataset size: {len(ds['train'])}")

Train dataset size: 24322


There are five fields in this dataset:
  - question
  - human_answers (several)
  - chatgpt_answers (several)
  - index
  - source

*** There is only train set, therefore test set should be seperated from the train set manually.

In [42]:
ds

DatasetDict({
    train: Dataset({
        features: ['question', 'human_answers', 'chatgpt_answers', 'index', 'source'],
        num_rows: 24322
    })
})

In [43]:
ds['train'][5]['human_answers'][0]

'Melanin ! Many of the the first known humans existed in the fertile crescent - modern day Iraq and surrounding areas , and it was just as sunny and hot as it is today . Melanin causes skin and eyes to have a darker color , and as a benefit reduced the amount of UV radiation absorbed into the skin . Eventually humans expanded into less hot and sunlit areas allowing for the survival and procreation of people who developed lighter colored eyes and skin because of the lack of need of melanin for survival .'

In [44]:
ds['train'][5]['chatgpt_answers'][0]

'The color of your eyes is determined by the amount and type of pigments in your iris, which is the colored part of your eye, and by the way that the iris scatters light. The iris contains two types of pigment: one called melanin, which gives your skin, hair, and eyes their color, and another called lipochrome, which is a yellowish pigment. The combination of these pigments, along with the structure of the iris, determines the color of your eyes. \nThere are many different shades of eye color, ranging from dark brown to light blue, and the most common eye colors are brown, blue, and green. The color of your eyes is determined by your genes, which are the instructions that you inherit from your parents that tell your body how to grow and function. \nThere is no scientific evidence to suggest that the color of your eyes has any special meaning or significance. However, many people believe that the color of your eyes can reveal certain things about your personality or your health, but the

## 2. Preprocess
The next step is to load a tokenizer to preprocess the text field. A tokenizer converts text to a sequence of tokens and creates numerical representation.

In [45]:
# tokenizer
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# tokenizer test
text = "Hello everyone!"
vocab = {v:k for k,v in roberta_tokenizer.vocab.items()}
print([(id, vocab[id]) for id in roberta_tokenizer(text)['input_ids']])
print([(id, vocab[id]) for id in roberta_tokenizer(ds['train'][3]['human_answers'][0])['input_ids']])

[(0, '<s>'), (31414, 'Hello'), (961, 'Ġeveryone'), (328, '!'), (2, '</s>')]
[(0, '<s>'), (1185, 'You'), (6056, 'Ġca'), (295, 'Ġn'), (75, "'t"), (95, 'Ġjust'), (213, 'Ġgo'), (198, 'Ġaround'), (39257, 'Ġassass'), (15647, 'inating'), (5, 'Ġthe'), (917, 'Ġleaders'), (9, 'Ġof'), (749, 'Ġcountries'), (47, 'Ġyou'), (109, 'Ġdo'), (295, 'Ġn'), (90, 't'), (101, 'Ġlike'), (27785, 'Ġ!'), (20, 'ĠThe'), (758, 'Ġinternational'), (19973, 'Ġcondemnation'), (74, 'Ġwould'), (28, 'Ġbe'), (8513, 'Ġbrutal'), (479, 'Ġ.'), (1648, 'ĠEven'), (600, 'Ġthough'), (117, 'Ġno'), (1264, 'one'), (3829, 'Ġlikes'), (1636, 'ĠKim'), (6465, 'ĠJong'), (111, 'Ġ-'), (1890, 'ĠUn'), (2156, 'Ġ,'), (8, 'Ġand'), (961, 'Ġeveryone'), (4265, 'Ġthinks'), (369, 'ĠNorth'), (1101, 'ĠKorea'), (16, 'Ġis'), (1256, 'Ġpretty'), (43816, 'Ġshitty'), (7, 'Ġto'), (63, 'Ġits'), (2286, 'Ġcitizens'), (2156, 'Ġ,'), (114, 'Ġif'), (224, 'Ġsay'), (5, 'Ġthe'), (382, 'ĠUS'), (58, 'Ġwere'), (7, 'Ġto'), (2142, 'Ġsend'), (3525, 'Ġagents'), (81, 'Ġover'), (36,

In [46]:
#Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [51]:
# making dataframe include human answers
question_list = []
human_answer_list = []
index_list = []
source_list = []
input_ids_list = []
attention_mask_list = []
label_list = []

for sample in ds['train']:

  # some of the question does not have answer.
  if len(sample['human_answers'])!=0:
    one_human_answer = sample['human_answers'][0]

    question_list.append(sample['question'])
    human_answer_list.append([clean_text(one_human_answer)])
    index_list.append(sample['index'])
    source_list.append(sample['source'])
    input_ids_list.append(roberta_tokenizer(one_human_answer, truncation=True)['input_ids'])
    attention_mask_list.append(roberta_tokenizer(one_human_answer, truncation=True)['attention_mask'])
    label_list.append(0)

tokenized_human_answers_df = pd.DataFrame.from_dict({'question':question_list, 'answer':human_answer_list, 'index':index_list, \
                                                       'source':source_list, 'input_ids':input_ids_list, 'attention_mask':attention_mask_list, \
                                                      'label':label_list})

In [52]:
# making dataframe include chatgpt answers
question_list = []
chatgpt_answer_list = []
index_list = []
source_list = []
input_ids_list = []
attention_mask_list = []
label_list = []

for sample in ds['train']:

  # some of the question does not have answer.
  if len(sample['chatgpt_answers'])!=0:
    one_chatgpt_answer = sample['chatgpt_answers'][0]

    question_list.append(sample['question'])
    chatgpt_answer_list.append([clean_text(one_chatgpt_answer)])
    index_list.append(sample['index'])
    source_list.append(sample['source'])
    input_ids_list.append(roberta_tokenizer(one_chatgpt_answer, truncation=True)['input_ids'])
    attention_mask_list.append(roberta_tokenizer(one_chatgpt_answer, truncation=True)['attention_mask'])
    label_list.append(1)


tokenized_chatgpt_answers_df = pd.DataFrame.from_dict({'question':question_list, 'answer':chatgpt_answer_list, 'index':index_list, \
                                                       'source':source_list, 'input_ids':input_ids_list, 'attention_mask':attention_mask_list, \
                                                      'label':label_list})

In [61]:
# shows clean texts
tokenized_human_answers_df.loc[5]['answer']

['melanin  many of the the first known humans existed in the fertile crescent  modern day iraq and surrounding areas  and it was just as sunny and hot as it is today  melanin causes skin and eyes to have a darker color  and as a benefit reduced the amount of uv radiation absorbed into the skin  eventually humans expanded into less hot and sunlit areas allowing for the survival and procreation of people who developed lighter colored eyes and skin because of the lack of need of melanin for survival ']

In [62]:
# shows clean texts
tokenized_chatgpt_answers_df.loc[5]['answer']

['the color of your eyes is determined by the amount and type of pigments in your iris which is the colored part of your eye and by the way that the iris scatters light the iris contains two types of pigment one called melanin which gives your skin hair and eyes their color and another called lipochrome which is a yellowish pigment the combination of these pigments along with the structure of the iris determines the color of your eyes there are many different shades of eye color ranging from dark brown to light blue and the most common eye colors are brown blue and green the color of your eyes is determined by your genes which are the instructions that you inherit from your parents that tell your body how to grow and function there is no scientific evidence to suggest that the color of your eyes has any special meaning or significance however many people believe that the color of your eyes can reveal certain things about your personality or your health but these beliefs are not support

In [53]:
def add_token_padding(sample):

  if len(sample)<512:

    token_paddings = [0 for i in range(512-len(sample))]
    return sample + token_paddings

  else:
    return sample

def add_mask_padding(sample):

  if len(sample)<512:

    mask_paddings = [1 for i in range(512-len(sample))]
    return sample + mask_paddings

  else:
    return sample

In [54]:
tokenized_chatgpt_answers_df['input_ids'] = tokenized_chatgpt_answers_df['input_ids'].map(lambda x: add_token_padding(x))
tokenized_chatgpt_answers_df['attention_mask'] = tokenized_chatgpt_answers_df['attention_mask'].map(lambda x: add_mask_padding(x))

In [55]:
tokenized_human_answers_df['input_ids'] = tokenized_human_answers_df['input_ids'].map(lambda x: add_token_padding(x))
tokenized_human_answers_df['attention_mask'] = tokenized_human_answers_df['attention_mask'].map(lambda x: add_mask_padding(x))

In [56]:
combined_df = pd.concat([tokenized_human_answers_df, tokenized_chatgpt_answers_df], ignore_index=True)

In [57]:
combined_df.head()

Unnamed: 0,question,answer,index,source,input_ids,attention_mask,label
0,"Why is every book I hear about a "" NY Times # ...",[basically there are many categories of best ...,,reddit_eli5,"[0, 34480, 89, 32, 171, 6363, 9, 22, 2700, 447...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,,reddit_eli5,"[0, 29, 3967, 16, 205, 13, 45, 8180, 11, 512, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,Why do we still have SD TV channels when HD lo...,[the way it works is that old tv stations got ...,,reddit_eli5,"[0, 133, 169, 24, 1364, 16, 14, 793, 1012, 449...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3,Why has nobody assassinated Kim Jong - un He i...,[you ca nt just go around assassinating the le...,,reddit_eli5,"[0, 1185, 6056, 295, 75, 95, 213, 198, 39257, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,How was airplane technology able to advance so...,[wanting to kill the shit out of germans drive...,,reddit_eli5,"[0, 38576, 154, 7, 3549, 5, 15328, 66, 9, 1841...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [65]:
print(tokenized_human_answers_df.shape, tokenized_chatgpt_answers_df.shape)
print(combined_df.shape)

(24322, 7) (23867, 7)
(48189, 7)


In [67]:
# convert dataframe to dataset
tokenized_dataset = Dataset.from_pandas(combined_df)

In [68]:
tokenized_dataset = tokenized_dataset.remove_columns(["question", "answer", "index", "source"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

## 3. Experiments

In [69]:
# sample some data for experiments
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.99)['train'].train_test_split(test_size=0.2)

In [73]:
id2label = {0: "human", 1: "chatgpt"}
label2id = {"human": 0, "chatgpt": 1}


# This automodel class gives us the model with pretrained weights + a sequence classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2, id2label=id2label, label2id=label2id).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# arguments and objects required for trainer
training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate = 2e-5
)

data_collator = DataCollatorWithPadding(tokenizer=roberta_tokenizer)

In [102]:
from sklearn.metrics import confusion_matrix

class CustomTrainer(Trainer):

def _inner_training_loop(
      self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, \
      ignore_keys_for_eval=None):

    number_of_epochs = args.num_train_epochs

    train_loss = []
    train_acc = []
    eval_acc = []

    criterion = torch.nn.CrossEntropyLoss().to(device)
    self.optimizer = AdamW(model.parameters(), lr=5e-5)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma = 0.9)

    train_dataloader = self.get_train_dataloader()
    eval_dataloader = self.get_eval_dataloader()

    max_steps = math.ceil(len(train_dataloader) * args.num_train_epochs)

    for epoch in range(number_of_epochs):

      self.model.train()
      self.model.zero_grad()


      train_loss_per_epoch = 0
      train_acc_per_epoch = 0

      with tqdm(train_dataloader, unit="batch") as training_epoch:
        training_epoch.set_description(f"Training Epoch {epoch}")

        for step, inputs in enumerate(training_epoch):

            inputs = inputs.to(device)
            labels = inputs['labels']

            self.optimizer.zero_grad()

            output = model(**inputs)
            loss = criterion(output.logits, labels)
            train_loss_per_epoch+=loss.item()

            loss.backward()

            self.optimizer.step()
            train_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()


        self.scheduler.step()

        train_loss_per_epoch /= len(train_dataloader)
        train_acc_per_epoch /= (len(train_dataloader)*batch_size)


        eval_loss_per_epoch = 0
        eval_acc_per_epoch = 0


        with tqdm(eval_dataloader, unit="batch") as eval_epoch:
          eval_epoch.set_description(f"Evaluation Epoch {epoch}")

          for step, inputs in enumerate(eval_epoch):

            inputs = inputs.to(device)
            labels = inputs['labels']

            self.optimizer.zero_grad()

            output = model(**inputs)
            loss = criterion(output.logits, labels)
            eval_loss_per_epoch+=loss.item()


            loss.backward()


            self.optimizer.step()
            eval_acc_per_epoch += (output['logits'].argmax(1) == labels).sum().item()

            # print(confusion_matrix(output['logits'].argmax(1).cpu(), labels.cpu()).ravel())


        self.scheduler.step()

        eval_loss_per_epoch /= len(eval_dataloader)
        eval_acc_per_epoch /= (len(eval_dataloader)*batch_size)

        print(f'\tTrain Loss: {train_loss_per_epoch} |  Train Acc: {train_acc_per_epoch*100.0}%')
        print(f'\tEval Loss: {eval_loss_per_epoch} |  eval Acc: {eval_acc_per_epoch*100.0}%')



In [103]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator = data_collator)


In [104]:
trainer.train()

  0%|          | 0/39 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?batch/s]

[5 0 0 5]
[3 0 1 6]
[6 0 0 4]
[4 0 0 6]
[3 0 0 7]
[3 0 0 7]
[4 0 0 6]
[5 0 0 5]
[7 0 0 3]
[4 0 0 3]
	Train Loss: 0.06505106416406282 |  Train Acc: 96.92307692307692%
	Eval Loss: 0.0631826430791989 |  eval Acc: 96.0%


  0%|          | 0/39 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?batch/s]

[4 0 1 5]
[3 0 1 6]
[6 0 0 4]
[3 0 1 6]
[3 0 0 7]
[3 0 0 7]
[4 0 0 6]
[5 0 0 5]
[7 0 0 3]
[4 0 0 3]
	Train Loss: 0.0673476666591178 |  Train Acc: 97.43589743589743%
	Eval Loss: 0.07773658046498895 |  eval Acc: 94.0%


  0%|          | 0/39 [00:00<?, ?batch/s]

KeyboardInterrupt: ignored