In [2]:
%load_ext autoreload
%autoreload 2

# Model Initialization

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

tokenizer = AutoTokenizer.from_pretrained("Hello-SimpleAI/chatgpt-detector-roberta")
model = AutoModelForSequenceClassification.from_pretrained("Hello-SimpleAI/chatgpt-detector-roberta")
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# text generated with chatGPT - almost 100% certain it's AI generated
pipe(
    """Alright, so when you hear "New York Times Bestseller," it's not literally saying there's only one book at the number one spot. Think of it more like a list where multiple books can be at the top, but they're still all considered "bestsellers." The New York Times updates this list regularly, so different books can take the top spot at different times. It's kind of like when you have a bunch of favorite toys, but you can only play with one at a time. So, yeah, when you hear about a bunch of books being "number one," they're all just sharing that top spot at different times."""
)

[[{'label': 'Human', 'score': 0.0007840888574719429},
  {'label': 'ChatGPT', 'score': 0.999215841293335}]]

In [5]:
# text generated with Claude - LLM which is not ChatGPT (which was used to create the HC3 dataset)
# even though, I would say the text isn't all too different
pipe(
    """Alright, so here's the deal with those "New York Times #1 Bestseller" books. It's not like there's just one single bestselling book that gets the top spot. Nah, they've got different lists for different types of books - fiction, non-fiction, paperbacks, you name it. So a bunch of books can be #1 at the same time, just in their own category. Plus, the list changes every week, so lots of books get a turn at being the top dog. It's all just a big marketing thing, really."""
)

[[{'label': 'Human', 'score': 0.7533059120178223},
  {'label': 'ChatGPT', 'score': 0.2466941475868225}]]

the big issue we have here is that the pretrained model is trained only on data from one single source - it doesn't do well with data generated (with prompts) from other sources

In [6]:
def get_prediction(scores: dict):
    return max(scores, key=lambda x: x["score"])

In [7]:
[
    get_prediction(x)["label"] for x in pipe([
        """How do you do fellow kids?""",
        """Alright, so when you hear "New York Times Bestseller," it's not literally saying there's only one book at the number one spot. Think of it more like a list where multiple books can be at the top, but they're still all considered "bestsellers." The New York Times updates this list regularly, so different books can take the top spot at different times. It's kind of like when you have a bunch of favorite toys, but you can only play with one at a time. So, yeah, when you hear about a bunch of books being "number one," they're all just sharing that top spot at different times.""",
        """Alright, so here's the deal with those "New York Times #1 Bestseller" books. It's not like there's just one single bestselling book that gets the top spot. Nah, they've got different lists for different types of books - fiction, non-fiction, paperbacks, you name it. So a bunch of books can be #1 at the same time, just in their own category. Plus, the list changes every week, so lots of books get a turn at being the top dog. It's all just a big marketing thing, really.""",
    ])
]

['Human', 'ChatGPT', 'Human']

# Out of the Box Evaluation on TweepFake

In [8]:
import pandas as pd

class DataHandler():
    def __init__(self):
        ...
    
    def read_csv_file(self, file_path, sep=";"):
        df = pd.read_csv(file_path, sep=sep)
        return df

In [9]:
dh = DataHandler()
tweepf_train = dh.read_csv_file("data/tweepfake/train.csv")
tweepf_train

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA now that note GOOD,bot,others
1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn
...,...,...,...,...
20707,AINarendraModi,Met on the Abversion of our science for the co...,bot,rnn
20708,AINarendraModi,Land for their during the opportunity to the p...,bot,rnn
20709,DeepDrumpf,@TayandYou doesn't have a clue. You're right. ...,bot,rnn
20710,jaden,Me And My Bestie https://t.co/vPq2iDkWZm,human,human


In [10]:
tweepf_train[tweepf_train.class_type.isnull()]

Unnamed: 0,screen_name,text,account.type,class_type


In [11]:
tweepf_train[tweepf_train["account.type"].isnull()]

Unnamed: 0,screen_name,text,account.type,class_type


In [12]:
human_count = tweepf_train[tweepf_train.class_type=="human"].text.count()
rnn_count = tweepf_train[tweepf_train.class_type=="rnn"].text.count()
oth_count = tweepf_train[tweepf_train.class_type=="others"].text.count()
gpt2_count = tweepf_train[tweepf_train.class_type=="gpt2"].text.count()
pd.DataFrame({"type": ["human", "rnn", "markov", "gpt-2"], "count": [human_count, rnn_count, oth_count, gpt2_count]})

Unnamed: 0,type,count
0,human,10358
1,rnn,3325
2,markov,3920
3,gpt-2,3109


In [13]:
tweepf_valid = dh.read_csv_file("data/tweepfake/validation.csv")
tweepf_valid

Unnamed: 0,screen_name,text,account.type,class_type
0,ahadsheriff,"TIGHT, TIGHT, TIGHT, YEAH!!! https://t.co/wj3n...",human,human
1,narendramodi,India has millennia old relations with Oman. W...,human,human
2,jaden,Anxious Teenagers,human,human
3,JustinTrudeau,Our top priority is keeping Canadians safe. Wi...,human,human
4,imranyebot,nah bro You’re taking sis so much I’m just a g...,bot,others
...,...,...,...,...
2297,DeepDrumpf,You're going to be even prouder when we don't ...,bot,rnn
2298,jaden,https://t.co/10XkzXDBCf https://t.co/cIUIYWEB45,human,human
2299,ahadsheriff,2. “Once you take the place of the people who ...,human,human
2300,imranyebot,black will be like a company with them need so...,bot,others


In [14]:
human_count = tweepf_valid[tweepf_valid.class_type=="human"].text.count()
rnn_count = tweepf_valid[tweepf_valid.class_type=="rnn"].text.count()
oth_count = tweepf_valid[tweepf_valid.class_type=="others"].text.count()
gpt2_count = tweepf_valid[tweepf_valid.class_type=="gpt2"].text.count()
pd.DataFrame({"type": ["human", "rnn", "markov", "gpt-2"], "count": [human_count, rnn_count, oth_count, gpt2_count]})

Unnamed: 0,type,count
0,human,1150
1,rnn,370
2,markov,436
3,gpt-2,346


In [15]:
tweepf_test = dh.read_csv_file("data/tweepfake/test.csv")
tweepf_test

Unnamed: 0,screen_name,text,account.type,class_type
0,zawvrk,justin timberlake really one of the goats if y...,human,human
1,narendramodi,Thank you @PMBhutan for your gracious prayers ...,human,human
2,ahadsheriff,Theory: the number of red lights you will hit ...,human,human
3,AINarendraModi,Respects on the Upt of the I good with the peo...,bot,rnn
4,kevinhooke,Might give the BASIC #10Liner game contest ano...,human,human
...,...,...,...,...
2553,ahadsheriffbot,“The best kept secret,bot,others
2554,kevinhooke,Love the Choose your own adventure style of th...,human,human
2555,dril_gpt2,JOIN OUR TEAM: Sneezing,bot,gpt2
2556,kevinhooke,These deeply discounted 256GB SanDisk flash dr...,human,human


In [16]:
valid_text = list(tweepf_valid[tweepf_valid.text.str.len() < 512].text)
valid_label = list(tweepf_valid[tweepf_valid.text.str.len() < 512]["account.type"])

In [17]:
import time

start = time.time()

predictions = [
    # get_prediction(x) for x in pipe(valid_text)
]
# predictions[:5]

end = time.time()
print(f"elapsed time: {end - start:.2f}")

elapsed time: 0.00


In [18]:
pred_edit = ["human" if x["label"] == "Human" else "bot" for x in predictions]

In [19]:
num_correct = sum(x == y for x, y in zip(valid_label, pred_edit))
accuracy = num_correct / len(valid_text)
print(f"total out of the box accuracy: {accuracy * 100: .2f}%")

total out of the box accuracy:  0.00%


In [20]:
indices = set([x if z[0] == z[1] else None for x, z in enumerate(zip(valid_label, pred_edit))])

In [21]:
num_gpt2 = tweepf_valid[
    (tweepf_valid.index.isin(indices)) & (tweepf_valid.text.str.len() < 512) & (tweepf_valid.class_type == "gpt2")
].text.count()

tot_gpt2 = tweepf_valid[tweepf_valid.class_type == "gpt2"].text.count()
print(f"accuracy for just GPT-2 samples: {num_gpt2*100/tot_gpt2:.2f}%")

accuracy for just GPT-2 samples: 0.00%


out of the box - completely random at 50%

and surprisingly, does really poorly on GPT2 as well even though it was trained on a chatGPT corpus

# Finetuning on TweepFake

In [22]:
import datasets

# since we need 1's and 0's for training instead of text - this mapping needs to occur
tweepf_train["account.type"] = tweepf_train["account.type"].map({"human": 0, "bot": 1})
tweepf_valid["account.type"] = tweepf_valid["account.type"].map({"human": 0, "bot": 1})
tweepf_test["account.type"] = tweepf_test["account.type"].map({"human": 0, "bot": 1})

# reformat the pandas dfs into a Dataset for training
# if you need the types account type - you can left join it back in on the text
train_dataset = datasets.Dataset.from_dict(tweepf_train.drop(["screen_name", "class_type"], axis=1).rename(columns={"account.type": "labels"}))
valid_dataset = datasets.Dataset.from_dict(tweepf_valid.drop(["screen_name", "class_type"], axis=1).rename(columns={"account.type": "labels"}))
test_dataset = datasets.Dataset.from_dict(tweepf_test.drop(["screen_name", "class_type"], axis=1).rename(columns={"account.type": "labels"}))
datasets = datasets.DatasetDict({"train": train_dataset, "valid": valid_dataset, "test": test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 20712
    })
    valid: Dataset({
        features: ['text', 'labels'],
        num_rows: 2302
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2558
    })
})

In [25]:
list(datasets["train"]["labels"])

[1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,


In [23]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_ds = datasets.map(tokenize_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 20712/20712 [00:05<00:00, 3861.96 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████| 2302/2302 [00:00<00:00, 3820.29 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████| 2558/2558 [00:00<00:00, 3413.06 examples/s]


In [24]:
tokenized_ds = tokenized_ds.remove_columns(["text"])
small_train_dataset = tokenized_ds["train"].shuffle(seed=42).select(range(100))
small_valid_dataset = tokenized_ds["valid"].shuffle(seed=42).select(range(100))

In [25]:
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [26]:
tokenized_ds.set_format("torch")

In [27]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=512)
valid_dataloader = DataLoader(small_valid_dataset, batch_size=512)

In [28]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [29]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [30]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
import evaluate
import numpy as np
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: torch.from_numpy(np.asarray(v)).to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


    metric = evaluate.load("accuracy")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: torch.from_numpy(np.asarray(v)).to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    
    metric.compute()

  0%|                                                                                                                | 0/3 [00:00<?, ?it/s]

# Calculating Tweet Parse Metrics

# Calculating Tweet Perplexity
you need to do this with a model trained on twitter data - not a LLM