**Credit** 

1. Notebook : https://www.kaggle.com/code/thedrcat/detectai-transformers-baseline
2. Dataset  : https://www.kaggle.com/datasets/thedrcat/daigt-v2-train-dataset
3. Model    : https://huggingface.co/microsoft/deberta-v3-xsmall

In [1]:
# !pip install wandb
# import wandb

In [2]:
# !wandb login

In [3]:
# %env WANDB_LOG_MODEL=true

In [4]:
import transformers
import datasets
import pandas as pd
import numpy as np

from datasets import Dataset
from collections import Counter

from sklearn.metrics import roc_auc_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer





In [5]:
# Load the Huggingface "microsoft/deberta-v3-xsmall"
model_checkpoint = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-xsmall"

In [6]:
# Loading the Dataset with LLM Generated Data
df = pd.read_csv('/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv')

In [7]:
print(" Train DF Shape : ", df.shape)
print("\n Train DF Labels : ", df.label.value_counts())
print("\n Train DF Prompt : ", df.prompt_name.value_counts())

 Train DF Shape :  (44868, 5)

 Train DF Labels :  label
0    27371
1    17497
Name: count, dtype: int64

 Train DF Prompt :  prompt_name
Distance learning                        5554
Seeking multiple opinions                5176
Car-free cities                          4717
Does the electoral college work?         4434
Facial action coding system              3084
Mandatory extracurricular activities     3077
Summer projects                          2701
Driverless cars                          2250
Exploring Venus                          2176
Cell phones at school                    2119
Grades for extracurricular activities    2116
Community service                        2092
"A Cowboy Who Rode the Waves"            1896
The Face on Mars                         1893
Phones and driving                       1583
Name: count, dtype: int64


In [8]:
# Splitting the Dataset into Train / Val on the basis of a "prompt_name"
train = df[df.prompt_name != 'Car-free cities'].reset_index(drop=True)
valid = df[df.prompt_name == 'Car-free cities'].reset_index(drop=True)
print(" Train DF Shape : ", train.shape)
print(" Val DF Shape : ", valid.shape)

train.head(2)

 Train DF Shape :  (40151, 5)
 Val DF Shape :  (4717, 5)


Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False


In [9]:
# Let's try to undersample the persuade_corpus
# https://www.kaggle.com/competitions/llm-detect-ai-generated-text/discussion/452127

# In total, the PERSUADE 2.0 corpus comprises over 25,000 argumentative essays produced by 6th-12th grade students 
# in the United States for 15 prompts on two writing tasks: independent and source-based writing.

not_persuade_df = train[train['source'] != 'persuade_corpus']
persuade_df = train[train['source'] == 'persuade_corpus']
sampled_persuade_df = persuade_df.sample(n=6000, random_state=42)

In [10]:
# Testing idea from discussion with @nbroad about limited characters in human essays
all_human = set(list(''.join(sampled_persuade_df.text.to_list())))
other = set(list(''.join(not_persuade_df.text.to_list())))

In [11]:
chars_to_remove = ''.join([x for x in other if x not in all_human])
print(chars_to_remove)

📱👧🤯が🧐🧠💚👥😋🍭🐶有🐟🚣👍🛋🏔🥜“🌃💥🔧ā📸🕺🌸🌨🎮🌅​🤦🙌🚴📝💆👯😈🏫😌と🥨👇🕰💃💪💦🎹す❄🎩👀🧹ã🤗🥳🌿路🎢


In [12]:
translation_table = str.maketrans('', '', chars_to_remove)
def remove_chars(s):
    return s.translate(translation_table)
not_persuade_df['text'] = not_persuade_df['text'].apply(remove_chars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_persuade_df['text'] = not_persuade_df['text'].apply(remove_chars)


In [13]:
train = pd.concat([not_persuade_df, sampled_persuade_df]).sample(frac=1, random_state=42).reset_index(drop=True)
train.source.value_counts()

source
persuade_corpus                       6000
llama2_chat                           2411
chat_gpt_moth                         2409
mistral7binstruct_v1                  2408
mistral7binstruct_v2                  2406
llama_70b_v1                           984
darragh_claude_v6                      952
darragh_claude_v7                      951
falcon_180b_v1                         899
kingki19_palm                          672
train_essays                           670
cohere-command                         301
palm-text-bison1                       300
radek_500                              250
mistralai/Mistral-7B-Instruct-v0.1     201
NousResearch/Llama-2-7b-chat-hf        200
radekgpt4                              100
Name: count, dtype: int64

In [14]:
ds_train = Dataset.from_pandas(train)
ds_valid = Dataset.from_pandas(valid)

In [15]:
print(" ### Train ###  \n",ds_train)
print("\n ### Val ###  \n",ds_valid)

 ### Train ###  
 Dataset({
    features: ['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven'],
    num_rows: 22114
})

 ### Val ###  
 Dataset({
    features: ['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven'],
    num_rows: 4717
})


In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
def preprocess_function(examples):
    return tokenizer(examples['text'], max_length=128, padding=True, truncation=True)

In [18]:
ds_train_enc = ds_train.map(preprocess_function, batched=True)

  0%|          | 0/23 [00:00<?, ?ba/s]

In [19]:
print(" ### Train Dataset ###  \n", ds_train_enc)
print("\n Unique Prompts :", set(ds_train_enc['prompt_name']))
print("\n Value Counts Label : ", Counter(ds_train_enc['label']))

 ### Train Dataset ###  
 Dataset({
    features: ['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 22114
})

 Unique Prompts : {'Grades for extracurricular activities', 'Seeking multiple opinions', 'Cell phones at school', 'Summer projects', 'The Face on Mars', 'Mandatory extracurricular activities', 'Exploring Venus', '"A Cowboy Who Rode the Waves"', 'Facial action coding system', 'Phones and driving', 'Distance learning', 'Does the electoral college work?', 'Community service', 'Driverless cars'}

 Value Counts Label :  Counter({1: 15446, 0: 6668})


In [20]:
ds_valid_enc = ds_valid.map(preprocess_function, batched=True)

  0%|          | 0/5 [00:00<?, ?ba/s]

In [21]:
print(" ### Val Dataset ###  \n", ds_valid_enc)
print("\n Unique Prompts :", set(ds_valid_enc['prompt_name']))
print("\n Value Counts Label : ", Counter(ds_valid_enc['label']))


 ### Val Dataset ###  
 Dataset({
    features: ['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4717
})

 Unique Prompts : {'Car-free cities'}

 Value Counts Label :  Counter({0: 2666, 1: 2051})


In [22]:
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-xsmall and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
metric_name = "roc_auc"
model_name = "deberta-xsmall"
train_batch_size = 4
eval_batch_size = 32
grad_acc = 4

In [24]:
num_steps = len(train) // (train_batch_size * grad_acc)
print("Total Training Steps : ", num_steps)

Total Training Steps :  1382


In [25]:
args = TrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "steps",
    save_strategy = "steps",
    eval_steps = num_steps // 3,
    save_steps = num_steps // 3,
    learning_rate=2e-5,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=grad_acc,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=False,
    metric_for_best_model=metric_name,
    report_to='none',
    run_name = 'LLM_Detect_AI_Text'
)

In [26]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
    auc = roc_auc_score(labels, probs[:,1], multi_class='ovr')
    return {"roc_auc": auc}

In [27]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds_train_enc,
    eval_dataset=ds_valid_enc,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [28]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Roc Auc
460,No log,0.300964,0.991946
920,0.143100,0.506143,0.987818
1380,0.049900,0.255214,0.997117


TrainOutput(global_step=1382, training_loss=0.0793227307530802, metrics={'train_runtime': 384.1025, 'train_samples_per_second': 57.573, 'train_steps_per_second': 3.598, 'total_flos': 364157494247424.0, 'train_loss': 0.0793227307530802, 'epoch': 1.0})

In [29]:
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
test_preds = trainer.predict(test_ds_enc)

In [31]:
print("Test Predictions : ", test_preds)

Test Predictions :  PredictionOutput(predictions=array([[-0.27906865,  0.37921295],
       [-2.7127442 ,  2.5009239 ],
       [-3.045803  ,  2.7482803 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.0249, 'test_samples_per_second': 120.626, 'test_steps_per_second': 40.209})


In [32]:
logits = test_preds.predictions
print("Logits :", logits)

Logits : [[-0.27906865  0.37921295]
 [-2.7127442   2.5009239 ]
 [-3.045803    2.7482803 ]]


In [33]:
probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
print("Probs :", probs)

Probs : [[0.3411258  0.6588743 ]
 [0.00541223 0.9945878 ]
 [0.00303627 0.99696374]]


In [34]:
sub = pd.DataFrame()
sub['id'] = test['id']
sub['generated'] = probs[:,1]
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,generated
0,0000aaaa,0.658874
1,1111bbbb,0.994588
2,2222cccc,0.996964


In [35]:
# res = []
# for src in valid.source.unique():
#     if src in ['train_essays', 'persuade_corpus', 'original_moth']: 
#         continue
#     test2  = valid[valid['source'].isin([src, 'train_essays'])]
#     test_ds2 = Dataset.from_pandas(test2)
#     test_ds_enc2 = test_ds2.map(preprocess_function, batched=True)
#     eval_result = trainer.evaluate(test_ds_enc2)
#     score = eval_result['eval_roc_auc']
#     res.append(f'{src}: {score}')
    
# for r in res: 
#     print(r)