In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# List contents of the My Drive
print(os.listdir('/content/drive/My Drive'))

# Then list the contents of the AI_Detector folder
print(os.listdir('/content/drive/My Drive/assets'))


['Colab Notebooks', 'xyzabc', 'AI_Detector']
['Training_Essay_Data.csv', 'saved_model', 'templates', 'saved_modelv2', 'static']


In [4]:
print(os.listdir('/content/drive/My Drive'))


['Colab Notebooks', 'xyzabc', 'AI_Detector']


In [None]:
# Load dataset
dataset_path = "/content/drive/My Drive/assets/Training_Essay_Data.csv"
df = pd.read_csv(dataset_path)
print(df.head())

                                                text  generated
0   Car-free cities have become a subject of incr...          1
1   Car Free Cities  Car-free cities, a concept g...          1
2    A Sustainable Urban Future  Car-free cities ...          1
3    Pioneering Sustainable Urban Living  In an e...          1
4    The Path to Sustainable Urban Living  In an ...          1


In [6]:
print(df['generated'].value_counts())

generated
0    17508
1    11637
Name: count, dtype: int64


In [7]:
print(df['generated'].unique())  # Check the unique values in the column

[1 0]


In [8]:
# Step 1: Check the distribution of the 'generated' column
print("Original Distribution:")
print(df['generated'].value_counts())  # Check counts of 'Human' (0) and 'AI' (1)

# Step 2: Split the dataset into 'Human' and 'AI' DataFrames
human_df = df[df['generated'] == 0]  # Filter human data (0 for Human)
ai_df = df[df['generated'] == 1]     # Filter AI data (1 for AI)

Original Distribution:
generated
0    17508
1    11637
Name: count, dtype: int64


In [9]:
# Step 3: Print the shapes of the filtered data to verify that the filtering is correct
print("Shape of Human Data:", human_df.shape)  # Should print (17508, 2)
print("Shape of AI Data:", ai_df.shape)       # Should print (11637, 2)

Shape of Human Data: (17508, 2)
Shape of AI Data: (11637, 2)


In [10]:
# Step 4: Under-sample Human data to match the number of AI samples
human_df = human_df.sample(n=len(ai_df), random_state=42)   # Sample Human data to match the number of AI samples

# Step 5: Concatenate Human and AI DataFrames to create the balanced dataset
balanced_df = pd.concat([human_df, ai_df])
print("Shape of Balanced Dataset:", balanced_df.shape)  # Should print (11637 * 2, 2)

Shape of Balanced Dataset: (23274, 2)


In [11]:
# Step 6: Verify the distribution of the 'generated' column in the balanced dataset
print("Balanced Dataset Distribution:")
print(balanced_df['generated'].value_counts())  # Should show equal counts for 0 and 1 (11637 each)

Balanced Dataset Distribution:
generated
0    11637
1    11637
Name: count, dtype: int64


In [12]:
# Split into train and test sets
train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

print(train_df.shape)  # Print the size of the training set
print(test_df.shape)  # Print the size of the testing set


(18619, 2)
(4655, 2)


In [13]:
print(train_df['generated'].value_counts())
print(train_df[['text', 'generated']].head(10))


generated
0    9328
1    9291
Name: count, dtype: int64
                                                    text  generated
24548  Life is a precious gift that we are given, and...          1
11311  There are lots of arguments and opinions on th...          0
24450  Many students believe that it is more importan...          1
22050  In my opinion, I agree that the government sho...          1
2794   Dear State Senator,\n\nI hope this letter find...          1
23191  It is a common belief that it is better to spe...          1
27811  Have you ever wondered what will happen if we ...          0
965    Cars are useful machines that get you around. ...          0
16375  I think it is a good idea that people should a...          0
4687   Cars  are starting to beome more and more expe...          0


In [14]:
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import torch
from torch.optim import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
from torch.cuda.amp import autocast, GradScaler

# Load DeBERTa Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

# Tokenize Text
def tokenize_texts(texts, max_length=512):
    return tokenizer(
        texts.tolist(),
        padding="max_length",  # Pad to max length
        truncation=True,        # Truncate to max length
        max_length=max_length,
        return_tensors="pt"    # Return pytorch tensors
    )

# Tokenize Train & Test Data
train_encodings = tokenize_texts(train_df['text'])
test_encodings = tokenize_texts(test_df['text'])

# Convert labels to tensors
train_labels = torch.tensor(train_df['generated'].values)
test_labels = torch.tensor(test_df['generated'].values)

# Create Dataloaders
batch_size = 16
train_data = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

test_data = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)

# Load DeBERTa Model (with dropout modifications)
model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=2)
model.config.hidden_dropout_prob = 0.3
model.config.attention_probs_dropout_prob = 0.3

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer with a lower learning rate
optimizer = AdamW(model.parameters(), lr=1e-5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [15]:
# Initialize fp16 GradScaler
scaler = GradScaler()

# Training Loop
epochs = 4  # Reduce epochs to prevent overfitting
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()

        with autocast():  # Mixed precision enabled here
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss

        # Backward pass with fp16
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # Unscale before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        if step %100 == 0:
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item()}")

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Loss: {avg_train_loss}")


  scaler = GradScaler()
  with autocast():  # Mixed precision enabled here


Epoch 1, Step 0, Loss: 0.7376623153686523
Epoch 1, Step 100, Loss: 0.005493347067385912
Epoch 1, Step 200, Loss: 0.0020848000422120094
Epoch 1, Step 300, Loss: 0.0016756481491029263
Epoch 1, Step 400, Loss: 0.0016620291862636805
Epoch 1, Step 500, Loss: 0.000761581992264837
Epoch 1, Step 600, Loss: 0.019467616453766823
Epoch 1, Step 700, Loss: 0.003804797073826194
Epoch 1, Step 800, Loss: 0.00029041265952400863
Epoch 1, Step 900, Loss: 0.0001655570522416383
Epoch 1, Step 1000, Loss: 0.00019268639152869582
Epoch 1, Step 1100, Loss: 0.0004061425570398569
Epoch 1, Loss: 0.038143197103855116
Epoch 2, Step 0, Loss: 0.00026405180688016117
Epoch 2, Step 100, Loss: 0.00015829404583200812
Epoch 2, Step 200, Loss: 0.00022077801986597478
Epoch 2, Step 300, Loss: 0.00014023773837834597
Epoch 2, Step 400, Loss: 7.310946966754273e-05
Epoch 2, Step 500, Loss: 0.5423246026039124
Epoch 2, Step 600, Loss: 0.6968365907669067
Epoch 2, Step 700, Loss: 0.0002931429189629853
Epoch 2, Step 800, Loss: 0.000223

In [18]:
# Evaluation
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(b_labels.cpu().numpy())

# Metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')  # Use 'micro', 'macro', or 'weighted' depending on your needs
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
conf_matrix = confusion_matrix(true_labels, predictions)

# Classification Report
report = classification_report(true_labels, predictions, target_names=['Human', 'AI-Generated'])

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"F1 Score (Weighted): {f1}")
print(f"Precision (Weighted): {precision}")
print(f"Recall (Weighted): {recall}")
print("Confusion Matrix:")
print(conf_matrix)
print(report)


Accuracy: 0.9984962406015038
F1 Score (Weighted): 0.9984962260270047
Precision (Weighted): 0.9984985156581194
Recall (Weighted): 0.9984962406015038
Confusion Matrix:
[[2303    6]
 [   1 2345]]
              precision    recall  f1-score   support

       Human       1.00      1.00      1.00      2309
AI-Generated       1.00      1.00      1.00      2346

    accuracy                           1.00      4655
   macro avg       1.00      1.00      1.00      4655
weighted avg       1.00      1.00      1.00      4655



In [None]:
# Save the model
model.save_pretrained("/content/drive/MyDrive/assets/models/saved_modelv3")
tokenizer.save_pretrained("/content/drive/MyDrive/assets/models/saved_modelv3")

('/content/drive/MyDrive/AI_Detector/saved_modelv3/tokenizer_config.json',
 '/content/drive/MyDrive/AI_Detector/saved_modelv3/special_tokens_map.json',
 '/content/drive/MyDrive/AI_Detector/saved_modelv3/spm.model',
 '/content/drive/MyDrive/AI_Detector/saved_modelv3/added_tokens.json')