In [2]:
import transformers
import torch
import pandas as pd
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained("roberta-base")
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [4]:
data = pd.read_csv("train_essay_combined.csv")

data

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
2757,1379,1,"Dear Senator,\n\nI am writing to you today to ...",1
2758,1380,1,"Dear Senator,\n\nI am writing to you today to ...",1
2759,1381,1,"Dear Senator,\n\nI am writing to you today to ...",1
2760,1382,1,"Dear Senator,\n\nI am writing to you today to ...",1


In [5]:
from sklearn.model_selection import train_test_split

train_data, dev_data = train_test_split(data, test_size=0.2)

train_dataset = datasets.Dataset.from_pandas(train_data)
dev_dataset = datasets.Dataset.from_pandas(dev_data)

train_dataset =train_dataset.rename_column("generated", "labels")
dev_dataset =dev_dataset.rename_column("generated", "labels")


In [6]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["text", "id", "prompt_id"])
dev_dataset = dev_dataset.map(preprocess_function, batched=True, remove_columns=["text", "id", "prompt_id"])

train_dataset.set_format("torch")
dev_dataset.set_format("torch")

Map: 100%|██████████| 2209/2209 [00:00<00:00, 2787.74 examples/s]
Map: 100%|██████████| 553/553 [00:00<00:00, 3452.24 examples/s]


In [7]:
train_dataset

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 2209
})

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os

num_labels = 2
device_1 = "cuda" if torch.cuda.is_available() else "cpu"
device = "mps" if torch.backends.mps.is_available() else device_1

model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels).to(device)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = (preds == labels).mean()
    print(acc)
    return {"accuracy": acc}


training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


In [13]:
torch.cuda.empty_cache()
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("./model_tokenizer/model").to(device)
probability_of_llm = []
for idx, example in dev_data.iterrows():
    tokenized = tokenizer(example["text"], padding="max_length", truncation=True, return_tensors="pt").to(device)
    prediction = fine_tuned_model(**tokenized)
    probability_of_llm.append(torch.sigmoid(prediction.logits).cpu().detach().numpy()[0][1])

# tokenized = tokenizer(test_data["text"].tolist(), padding="max_length", truncation=True, return_tensors="pt").to(device)
# prediction = fine_tuned_model(**tokenized)
# probability_of_llm = torch.sigmoid(prediction.logits).cpu().detach().numpy()[:, 1]

print(probability_of_llm)
    

[0.9941678, 0.99416465, 0.011102008, 0.9939995, 0.99404377, 0.010408482, 0.9939546, 0.010858155, 0.010413138, 0.9939454, 0.9939465, 0.99409443, 0.017544586, 0.994034, 0.0118556395, 0.010506605, 0.011019036, 0.010840873, 0.010840198, 0.9940446, 0.011236898, 0.9939912, 0.011318469, 0.99404967, 0.010805301, 0.010732793, 0.9940189, 0.011200643, 0.011148697, 0.99398106, 0.9941351, 0.010868115, 0.9941454, 0.010984311, 0.0119889295, 0.99396896, 0.01138982, 0.0105857495, 0.012543207, 0.9941281, 0.012005337, 0.011649064, 0.99417865, 0.010440464, 0.99400914, 0.0109012835, 0.9939442, 0.011973163, 0.012388779, 0.99399287, 0.99409395, 0.010447975, 0.012077607, 0.99396944, 0.012422177, 0.9939335, 0.0139307715, 0.011120906, 0.9942609, 0.99419564, 0.010489112, 0.994083, 0.010950722, 0.014956329, 0.99412894, 0.012284744, 0.9939063, 0.99402976, 0.99398756, 0.010585871, 0.0126536675, 0.9939262, 0.993982, 0.9941684, 0.011479874, 0.010854023, 0.9942075, 0.9940481, 0.011563696, 0.9940959, 0.01173727, 0.0138

: 