In [1]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from collections import Counter
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


2023-12-17 19:08:51.175339: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Loading the data

In [2]:
data = pd.read_csv('train_v2_drcat_02.csv', usecols=['text','label'])
data.head(), np.sum(data['label']==1), np.sum(data['label']==0), len(data)

(                                                text  label
 0  Phones\n\nModern humans today are always on th...      0
 1  This essay will explain if drivers should or s...      0
 2  Driving while the use of cellular devices\n\nT...      0
 3  Phones & Driving\n\nDrivers should not be able...      0
 4  Cell Phone Operation While Driving\n\nThe abil...      0,
 17497,
 27371,
 44868)

In [None]:
import re
def rem_punc(text):
    return re.sub(r'[^\w\s]','',text)

data['text'] = data['text'].apply(rem_punc)

In [21]:
subset_size = 500
dfsub = data.groupby('label').apply(lambda x: x.sample(n=subset_size, random_state=42)).reset_index(drop=True)
len(dfsub), np.sum(dfsub['label']==1),np.sum(dfsub['label']==0)

(1000, 500, 500)

In [22]:
train, df_temp = train_test_split(dfsub, test_size=0.2, stratify=dfsub['label'], random_state=42)
val, test = train_test_split(df_temp, test_size=0.5, stratify=df_temp['label'], random_state=42)
len(train), len(val), len(test)

(800, 100, 100)

In [23]:
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

### Tokenization

In [24]:
train = Dataset.from_pandas(train[['text','label']])
val = Dataset.from_pandas(val[['text','label']])
test = Dataset.from_pandas(test[['text','label']])
train,test,val

(Dataset({
     features: ['text', 'label'],
     num_rows: 800
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 100
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 100
 }))

In [7]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

def compute_accuracy(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)  # Assuming predictions are logits; use argmax to get class indices
    labels = np.array(labels, dtype=int)  # Ensure labels are integers

    correct_predictions = np.sum(predictions == labels)
    accuracy = correct_predictions / len(labels)
    
    return {"accuracy": accuracy}

### DistilBERT

In [26]:
# model_name = 'distilbert-base-uncased'
# output_dir='chatgpt-or-human'
# model = AutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

In [25]:
def preprocess_function1(examples):
    return tokenizer1(examples["text"], truncation=True)

In [27]:
model_name1 = 'distilbert-base-uncased'
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
# tokens = tokenizer.tokenize(ds[0]['text'])
train_token1 = train.map(preprocess_function1, batched=True)
val_token1 = val.map(preprocess_function1, batched=True)
test_token1 = test.map(preprocess_function1, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
data_collator1 = DataCollatorWithPadding(tokenizer=tokenizer1)

In [11]:
model1 = AutoModelForSequenceClassification.from_pretrained(
    model_name1, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
training_args1 = TrainingArguments(
    output_dir="chatgpt-or-human",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer1 = Trainer(
    model=model1,
    args=training_args1,
    train_dataset=train_token1,
    eval_dataset=val_token1,
    tokenizer=tokenizer1,
    data_collator=data_collator1,
    compute_metrics=compute_accuracy,
)

trainer1.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.236951,0.95
2,No log,0.156122,0.98


TrainOutput(global_step=50, training_loss=0.35275760650634763, metrics={'train_runtime': 400.484, 'train_samples_per_second': 3.995, 'train_steps_per_second': 0.125, 'total_flos': 211947837849600.0, 'train_loss': 0.35275760650634763, 'epoch': 2.0})

In [45]:
outputs1 = trainer1.predict(test_token1)
# outputs1, id2label[np.argmax(outputs1.predictions[0])]

In [48]:
logits1 = outputs1.predictions
# Apply softmax to get probabilities
probabilities1 = np.exp(logits1) / np.sum(np.exp(logits1), axis=1, keepdims=True)
# print(probabilities1)

# The second column (index 1) corresponds to class B - ai generated text
generated1 = probabilities1[:, 1]

# print(generated1)

In [19]:
def acc(probs,labs):
    preds = np.argmax(probs,axis=1)
    return np.sum(preds==labs)/len(labs)

In [59]:
print('Accuracy of DistilBERT:',100*acc(probabilities1,test['label']),'%')

Accuracy of DistilBERT: 93.0 %


### ALBERT

In [55]:
def preprocess_function2(examples):
    return tokenizer2(examples["text"], truncation=True)

model_name2 = 'albert-base-v2'
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
# tokens = tokenizer.tokenize(ds[0]['text'])
train_token2 = train.map(preprocess_function2, batched=True)
val_token2 = val.map(preprocess_function2, batched=True)
test_token2 = test.map(preprocess_function2, batched=True)

data_collator2 = DataCollatorWithPadding(tokenizer=tokenizer2)

model2 = AutoModelForSequenceClassification.from_pretrained(
    model_name2, num_labels=2, id2label=id2label, label2id=label2id
)

training_args2 = TrainingArguments(
    output_dir="chatgpt-or-human",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer2 = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=train_token2,
    eval_dataset=val_token2,
    tokenizer=tokenizer2,
    data_collator=data_collator2,
    compute_metrics=compute_accuracy,
)

trainer2.train()

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.067966,0.99
2,No log,0.027002,1.0


TrainOutput(global_step=50, training_loss=0.21963270187377928, metrics={'train_runtime': 830.2404, 'train_samples_per_second': 1.927, 'train_steps_per_second': 0.06, 'total_flos': 38236962816000.0, 'train_loss': 0.21963270187377928, 'epoch': 2.0})

In [61]:
output_dir='chatgpt-or-human'
model = AutoModelForSequenceClassification.from_pretrained(model_name2)
tokenizer = AutoTokenizer.from_pretrained(model_name2)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('who_wrote_it/tokenizer_config.json',
 'who_wrote_it/special_tokens_map.json',
 'who_wrote_it/spiece.model',
 'who_wrote_it/added_tokens.json',
 'who_wrote_it/tokenizer.json')

In [56]:
outputs2 = trainer2.predict(test_token2)

logits2 = outputs2.predictions
# Apply softmax to get probabilities
probabilities2 = np.exp(logits2) / np.sum(np.exp(logits2), axis=1, keepdims=True)
# print(probabilities2)

# The second column (index 1) corresponds to class B - generated by AI
generated2 = probabilities2[:, 1]

# print(generated2)

In [58]:
print('Accuracy of ALBERT:',100*acc(probabilities2,test['label']),'%')

Accuracy of ALBERT: 100.0 %


### Mean predictions

In [60]:
print(generated1[:5], generated2[:5])
preds = generated1 + generated2
preds = preds/2
preds

[0.90566695 0.10269861 0.12681969 0.9154422  0.86411685] [0.97029305 0.02136991 0.02398213 0.9716846  0.970585  ]


array([0.93798   , 0.06203426, 0.0754009 , 0.9435634 , 0.9173509 ,
       0.94923806, 0.877828  , 0.95114285, 0.05777133, 0.9499601 ,
       0.07287563, 0.9489278 , 0.8813256 , 0.91945404, 0.93826026,
       0.07940765, 0.06568179, 0.57515156, 0.9278771 , 0.3820201 ,
       0.9491663 , 0.06348974, 0.07036015, 0.9514024 , 0.05843329,
       0.05698265, 0.07256859, 0.9408345 , 0.31512663, 0.38028312,
       0.90207136, 0.9510464 , 0.08037234, 0.6948719 , 0.93919784,
       0.94969714, 0.9275706 , 0.05721666, 0.38312206, 0.05785446,
       0.94533485, 0.9456029 , 0.9308734 , 0.12795922, 0.9486919 ,
       0.94227797, 0.94118786, 0.0627152 , 0.05777857, 0.9290557 ,
       0.23128168, 0.93966514, 0.05843047, 0.8823756 , 0.9504529 ,
       0.19577684, 0.9360187 , 0.92589974, 0.08815014, 0.31746042,
       0.07736894, 0.0599874 , 0.05982019, 0.87717307, 0.05774756,
       0.05591822, 0.92473865, 0.94784695, 0.8683653 , 0.9351584 ,
       0.07020098, 0.05780216, 0.08680337, 0.05820052, 0.94478

### Hyper parameter tuning using Optuna

In [62]:
!pip install optuna

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[K     |████████████████████████████████| 413 kB 15.6 MB/s eta 0:00:01
Collecting alembic>=1.5.0
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[K     |████████████████████████████████| 230 kB 61.5 MB/s eta 0:00:01
Collecting sqlalchemy>=1.3.0
  Downloading SQLAlchemy-2.0.23-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 66.7 MB/s eta 0:00:01
[?25hCollecting colorlog
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 18.9 MB/s  eta 0:00:01
Collecting greenlet!=0.4.17; platform_machine == "aarch64" or (platform_machine == "ppc64le" or (platform_machine == "x86_64" or (platform_machine == "amd64" or (platform_machine == "AMD64" or (platform_machine == "win32" or platform_machine == "WIN32")))))
  Downloading greenlet-3.0.2-

In [15]:
import optuna
from sklearn.metrics import accuracy_score
def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 5e-5, 1e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32])

    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Define Trainer arguments
    training_args = TrainingArguments(
        output_dir='chatgpt-or-human',
        num_train_epochs=2,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        evaluation_strategy='epoch',
        logging_dir='./logs',
        logging_strategy='epoch'
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_token1,
        eval_dataset=val_token1,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_accuracy
    )

    # Train the model
    trainer.train()

    predictions = trainer.predict(trainer.eval_dataset)
    y_true = predictions.label_ids
    y_pred = predictions.predictions.argmax(axis=1)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

# Create an Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

[I 2023-12-17 19:50:03,794] A new study created in memory with name: no-name-e877dcca-841f-4938-a0c5-b1531bdc2be2
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3663,0.066419,0.98
2,0.1021,0.051831,0.99


[I 2023-12-17 19:56:49,433] Trial 0 finished with value: 0.99 and parameters: {'learning_rate': 5.113189892146903e-05, 'batch_size': 16}. Best is trial 0 with value: 0.99.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3971,0.043671,0.99
2,0.1195,0.219415,0.95


[I 2023-12-17 20:03:36,208] Trial 1 finished with value: 0.95 and parameters: {'learning_rate': 9.935729571831524e-05, 'batch_size': 16}. Best is trial 0 with value: 0.99.


Best hyperparameters: {'learning_rate': 5.113189892146903e-05, 'batch_size': 16}


In [34]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define Trainer arguments for the best hyperparameters
best_training_args = TrainingArguments(
    output_dir='chatgpt-or-human',
    num_train_epochs=2,
    per_device_train_batch_size=best_params['batch_size'],
    per_device_eval_batch_size=best_params['batch_size'],
    learning_rate=best_params['learning_rate'],
    evaluation_strategy='epoch',
    logging_dir='./logs',
    logging_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=best_training_args,
    train_dataset=train_token1,
    eval_dataset=test_token1,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_accuracy
)

# Evaluate the model on the new test set
results = trainer.evaluate()
print("Results of best hyperparameters using Optuna on DistilBERT:", results)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Results of best hyperparameters using Optuna on DistilBERT: {'eval_loss': 0.7063475251197815, 'eval_accuracy': 0.5, 'eval_runtime': 7.6745, 'eval_samples_per_second': 13.03, 'eval_steps_per_second': 0.521}


In [33]:
outputs = trainer.predict(test_token1)

logits = outputs.predictions

probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
# generated2 = probabilities2[:, 1]
print('Accuracy of DistilBERT optimized with Optuna:',100*acc(probabilities,test['label']),'%')

Accuracy of DistilBERT optimized with Optuna: 67.0 %


#### Increase hyper parameter search space and n_trials to get better performance