In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from peft import PeftModel
import time
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# # openai-community/roberta-large-openai-detector

# class DetectingLLM(nn.Module):
#     def __init__(self, model_name="Jahid05/llama-3.2-1b-text-classification"):
#         super(DetectingLLM, self).__init__()

#         # Load tokenizer
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)

#         # Load base model (without classification head)
#         self.encoder = AutoModel.from_pretrained(model_name)
#         hidden_size = self.encoder.config.hidden_size

#         # Classification head (you can customize dimensions)
#         self.classifier = nn.Sequential(
#             nn.Linear(hidden_size, 128),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(128, 2)  # For binary classification
#         )

#     def forward(self, text_input, device):
#         # Tokenize
#         inputs = self.tokenizer(
#             text_input,
#             return_tensors="pt",
#             padding=True,
#             truncation=True,
#             max_length=256
#         ).to(device)

#         # Forward pass through LLM
#         outputs = self.encoder(**inputs)
#         cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token

#         # Classification
#         logits = self.classifier(cls_embedding)
#         return logits

In [4]:
class DetectingLLM(nn.Module):
    def __init__(self, model_name="FacebookAI/roberta-large-mnli"):
        super(DetectingLLM, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            ignore_mismatched_sizes=True  # <--- This avoids the size mismatch error
        )
        self.model.resize_token_embeddings(len(self.tokenizer))  # <--- Ensures vocab sizes match

        for param in self.model.base_model.parameters():
            param.requires_grad = False

    def forward(self, text_input, device):
        inputs = self.tokenizer(
            text_input,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = self.model(**inputs)
        return outputs.logits

In [5]:
# Initialize Model, Optimizer, and Loss Function
model = DetectingLLM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassific

In [6]:
# Custom Dataset for Loading Text Data
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [7]:
# Load and preprocess the dataset
df = pd.read_csv("dataset/gemma_marked_7k.csv")

In [8]:
df.shape

(14000, 2)

In [9]:
# Clean and format the text
def clean_text(raw_text):
    # Remove extra newlines and whitespace
    cleaned_text = re.sub(r'\s*\n\s*', '\n', raw_text.strip())
    
    # Remove unnecessary quotes
    cleaned_text = cleaned_text.strip("'\"")
    
    # Replace escaped newline characters with actual newlines
    cleaned_text = cleaned_text.replace("\\n", "\n")
    cleaned_text = cleaned_text.replace("**", "")
    cleaned_text = cleaned_text.replace("\n", ". ")
    cleaned_text = cleaned_text.replace("## ", "")
    
    # Remove excessive blank lines
    cleaned_text = re.sub(r'\n{2,}', '\n\n', cleaned_text)
    
    # Ensure proper formatting
    cleaned_text = cleaned_text.strip()
    
    return cleaned_text

In [10]:
df['text'] = df['text'].apply(clean_text)

In [11]:
df.head()

Unnamed: 0,text,label
0,"Sure, here's an explanation of the importance ...",0
1,Academic writing can help to solve issues in c...,0
2,Vocational training can help to solve issues i...,0
3,Strategies for Improving Adult Education in Ed...,0
4,Inclusive Education and Digital Textbooks. 1. ...,0


In [12]:
df["label"] = df["label"].astype(int)

In [13]:
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
shuffled_df.head()

Unnamed: 0,text,label
0,Unique Perspectives on Allegory in Satire:. 1....,0
1,Cultural Relativism and Theories of Happiness....,0
2,Embedding Watermarks in Biomedical Engineering...,1
3,Argument for Thought Experiments in Philosophy...,0
4,Nanotechnology in Particle Physics. Nanotechno...,1


In [18]:
# Split into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    shuffled_df['text'], shuffled_df['label'], test_size=0.2, random_state=42
)

# Create Datasets and DataLoaders
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist())
test_dataset = TextDataset(test_texts.tolist(), test_labels.tolist())
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [19]:
train_texts.shape

(11200,)

In [20]:
# Evaluation Loop
def evaluate_model(model, test_loader, device):
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for texts, labels in test_loader:
            # Move labels to GPU
            labels = torch.tensor(labels).to(device)
            # Forward pass with inputs on GPU
            outputs = model(texts, device=device)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            actuals.extend(labels.cpu().numpy())
    acc = accuracy_score(actuals, predictions)
    print(f"Accuracy: {acc * 100:.2f}%")

In [21]:
# Training Loop
def train_model(model, train_loader, optimizer, criterion, device, epochs):
    model.train()
    for epoch in range(epochs):  # Train for `epochs`
        start = time.time()
        total_loss = 0
        for texts, labels in train_loader:
            # Move labels to GPU
            labels = torch.tensor(labels).to(device)
            optimizer.zero_grad()
            # Forward pass with inputs on GPU
            outputs = model(texts, device=device)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        end = time.time()
        print(f"Epoch {epoch + 1}, Time: {end-start:.4f}, Loss: {total_loss / len(train_loader):.4f}")
        #evaluate_model(model, train_loader, device)
        if (epoch+1)%10==0:
            evaluate_model(model, test_loader, device)

In [22]:
train_model(model, train_loader, optimizer, criterion, device, epochs=200)

  labels = torch.tensor(labels).to(device)


Epoch 1, Time: 43.9376, Loss: 0.6529
Epoch 2, Time: 42.2463, Loss: 0.6029
Epoch 3, Time: 42.6121, Loss: 0.5924
Epoch 4, Time: 43.5618, Loss: 0.5865
Epoch 5, Time: 42.0985, Loss: 0.5796
Epoch 6, Time: 42.4137, Loss: 0.5714
Epoch 7, Time: 41.3255, Loss: 0.5703
Epoch 8, Time: 41.3691, Loss: 0.5652
Epoch 9, Time: 42.0793, Loss: 0.5632
Epoch 10, Time: 42.6261, Loss: 0.5611


  labels = torch.tensor(labels).to(device)


Accuracy: 74.29%


  labels = torch.tensor(labels).to(device)


Epoch 11, Time: 41.7365, Loss: 0.5190
Epoch 12, Time: 41.3605, Loss: 0.5055
Epoch 13, Time: 40.2669, Loss: 0.4970
Epoch 14, Time: 42.1281, Loss: 0.4948
Epoch 15, Time: 41.6786, Loss: 0.4874
Epoch 16, Time: 40.5075, Loss: 0.4903
Epoch 17, Time: 41.8403, Loss: 0.4817
Epoch 18, Time: 40.1611, Loss: 0.4800
Epoch 19, Time: 39.2477, Loss: 0.4757
Epoch 20, Time: 40.2645, Loss: 0.4683


  labels = torch.tensor(labels).to(device)


Accuracy: 77.79%


  labels = torch.tensor(labels).to(device)


Epoch 21, Time: 40.9298, Loss: 0.4663
Epoch 22, Time: 40.5066, Loss: 0.4680
Epoch 23, Time: 40.9411, Loss: 0.4660
Epoch 24, Time: 41.0315, Loss: 0.4602
Epoch 25, Time: 41.0744, Loss: 0.4579
Epoch 26, Time: 41.3617, Loss: 0.4563
Epoch 27, Time: 41.3289, Loss: 0.4569
Epoch 28, Time: 40.6314, Loss: 0.4532
Epoch 29, Time: 40.9418, Loss: 0.4481
Epoch 30, Time: 39.6093, Loss: 0.4470


  labels = torch.tensor(labels).to(device)


Accuracy: 77.86%


  labels = torch.tensor(labels).to(device)


Epoch 31, Time: 41.2749, Loss: 0.4476
Epoch 32, Time: 41.0842, Loss: 0.4443
Epoch 33, Time: 39.8064, Loss: 0.4444
Epoch 34, Time: 41.4061, Loss: 0.4409
Epoch 35, Time: 41.4852, Loss: 0.4373
Epoch 36, Time: 40.0306, Loss: 0.4380
Epoch 37, Time: 41.7035, Loss: 0.4348
Epoch 38, Time: 42.1893, Loss: 0.4351
Epoch 39, Time: 40.6227, Loss: 0.4290
Epoch 40, Time: 41.3355, Loss: 0.4269


  labels = torch.tensor(labels).to(device)


Accuracy: 78.36%


  labels = torch.tensor(labels).to(device)


Epoch 41, Time: 39.7949, Loss: 0.4288
Epoch 42, Time: 39.8817, Loss: 0.4259
Epoch 43, Time: 40.8523, Loss: 0.4256
Epoch 44, Time: 41.0418, Loss: 0.4227
Epoch 45, Time: 40.7309, Loss: 0.4192
Epoch 46, Time: 41.2349, Loss: 0.4222
Epoch 47, Time: 41.3768, Loss: 0.4185
Epoch 48, Time: 40.5309, Loss: 0.4152
Epoch 49, Time: 41.6863, Loss: 0.4144
Epoch 50, Time: 41.5530, Loss: 0.4088


  labels = torch.tensor(labels).to(device)


Accuracy: 76.18%


  labels = torch.tensor(labels).to(device)


Epoch 51, Time: 41.1256, Loss: 0.4098
Epoch 52, Time: 40.1198, Loss: 0.4054
Epoch 53, Time: 39.0677, Loss: 0.4050
Epoch 54, Time: 40.0780, Loss: 0.4056
Epoch 55, Time: 40.5022, Loss: 0.4051
Epoch 56, Time: 39.9900, Loss: 0.4012
Epoch 57, Time: 41.5749, Loss: 0.4034
Epoch 58, Time: 40.9983, Loss: 0.3981
Epoch 59, Time: 40.2383, Loss: 0.3982
Epoch 60, Time: 41.6488, Loss: 0.3962


  labels = torch.tensor(labels).to(device)


Accuracy: 77.21%


  labels = torch.tensor(labels).to(device)


Epoch 61, Time: 42.2135, Loss: 0.3941
Epoch 62, Time: 41.0372, Loss: 0.3900
Epoch 63, Time: 41.0948, Loss: 0.3924
Epoch 64, Time: 40.3720, Loss: 0.3886
Epoch 65, Time: 39.9404, Loss: 0.3851
Epoch 66, Time: 40.5769, Loss: 0.3854
Epoch 67, Time: 40.9979, Loss: 0.3821
Epoch 68, Time: 40.2149, Loss: 0.3828
Epoch 69, Time: 41.3802, Loss: 0.3791
Epoch 70, Time: 41.5638, Loss: 0.3754


  labels = torch.tensor(labels).to(device)


Accuracy: 77.46%


  labels = torch.tensor(labels).to(device)


Epoch 71, Time: 40.2564, Loss: 0.3749
Epoch 72, Time: 41.3844, Loss: 0.3814
Epoch 73, Time: 40.9952, Loss: 0.3731
Epoch 74, Time: 40.8480, Loss: 0.3715
Epoch 75, Time: 40.4219, Loss: 0.3681
Epoch 76, Time: 39.3509, Loss: 0.3678
Epoch 77, Time: 40.3238, Loss: 0.3645
Epoch 78, Time: 41.0594, Loss: 0.3634
Epoch 79, Time: 40.5918, Loss: 0.3598
Epoch 80, Time: 41.1947, Loss: 0.3637


  labels = torch.tensor(labels).to(device)


Accuracy: 78.32%


  labels = torch.tensor(labels).to(device)


Epoch 81, Time: 41.0637, Loss: 0.3532
Epoch 82, Time: 40.5627, Loss: 0.3552
Epoch 83, Time: 42.2169, Loss: 0.3528
Epoch 84, Time: 42.1979, Loss: 0.3546
Epoch 85, Time: 40.5756, Loss: 0.3492
Epoch 86, Time: 41.3118, Loss: 0.3480
Epoch 87, Time: 40.3256, Loss: 0.3473
Epoch 88, Time: 39.6018, Loss: 0.3463
Epoch 89, Time: 40.6435, Loss: 0.3441
Epoch 90, Time: 41.2540, Loss: 0.3397


  labels = torch.tensor(labels).to(device)


Accuracy: 76.68%


  labels = torch.tensor(labels).to(device)


Epoch 91, Time: 40.7305, Loss: 0.3388
Epoch 92, Time: 41.1168, Loss: 0.3382
Epoch 93, Time: 41.5992, Loss: 0.3382
Epoch 94, Time: 40.6362, Loss: 0.3337
Epoch 95, Time: 41.9107, Loss: 0.3328
Epoch 96, Time: 41.9964, Loss: 0.3320
Epoch 97, Time: 41.0953, Loss: 0.3267
Epoch 98, Time: 41.1548, Loss: 0.3249
Epoch 99, Time: 39.4991, Loss: 0.3225
Epoch 100, Time: 41.3443, Loss: 0.3244


  labels = torch.tensor(labels).to(device)


Accuracy: 75.89%


  labels = torch.tensor(labels).to(device)


Epoch 101, Time: 41.4111, Loss: 0.3195
Epoch 102, Time: 40.7165, Loss: 0.3162
Epoch 103, Time: 41.6467, Loss: 0.3153
Epoch 104, Time: 41.9463, Loss: 0.3157
Epoch 105, Time: 41.0332, Loss: 0.3133
Epoch 106, Time: 41.3502, Loss: 0.3135
Epoch 107, Time: 42.2377, Loss: 0.3078
Epoch 108, Time: 40.8176, Loss: 0.3093
Epoch 109, Time: 40.8225, Loss: 0.3054
Epoch 110, Time: 39.9750, Loss: 0.3043


  labels = torch.tensor(labels).to(device)


Accuracy: 78.21%


  labels = torch.tensor(labels).to(device)


Epoch 111, Time: 40.1252, Loss: 0.3061
Epoch 112, Time: 40.9861, Loss: 0.3007
Epoch 113, Time: 41.1733, Loss: 0.2964
Epoch 114, Time: 40.9491, Loss: 0.2958
Epoch 115, Time: 41.4389, Loss: 0.2909
Epoch 116, Time: 41.1697, Loss: 0.2904
Epoch 117, Time: 41.2669, Loss: 0.2892
Epoch 118, Time: 42.0308, Loss: 0.2928
Epoch 119, Time: 40.6368, Loss: 0.2860
Epoch 120, Time: 41.8343, Loss: 0.2829


  labels = torch.tensor(labels).to(device)


Accuracy: 75.57%


  labels = torch.tensor(labels).to(device)


Epoch 121, Time: 40.4966, Loss: 0.2791
Epoch 122, Time: 40.0359, Loss: 0.2830
Epoch 123, Time: 40.7982, Loss: 0.2776
Epoch 124, Time: 40.9428, Loss: 0.2762
Epoch 125, Time: 40.6982, Loss: 0.2706
Epoch 126, Time: 41.6686, Loss: 0.2718
Epoch 127, Time: 41.2876, Loss: 0.2732
Epoch 128, Time: 40.8738, Loss: 0.2697
Epoch 129, Time: 41.2375, Loss: 0.2645
Epoch 130, Time: 41.2629, Loss: 0.2637


  labels = torch.tensor(labels).to(device)


Accuracy: 77.75%


  labels = torch.tensor(labels).to(device)


Epoch 131, Time: 40.8103, Loss: 0.2607
Epoch 132, Time: 40.4833, Loss: 0.2596
Epoch 133, Time: 39.7670, Loss: 0.2537
Epoch 134, Time: 40.0703, Loss: 0.2558
Epoch 135, Time: 40.1336, Loss: 0.2531
Epoch 136, Time: 40.9417, Loss: 0.2534
Epoch 137, Time: 40.4899, Loss: 0.2469
Epoch 138, Time: 41.3189, Loss: 0.2482
Epoch 139, Time: 41.1139, Loss: 0.2475
Epoch 140, Time: 38.6954, Loss: 0.2417


  labels = torch.tensor(labels).to(device)


Accuracy: 74.29%


  labels = torch.tensor(labels).to(device)


Epoch 141, Time: 41.5741, Loss: 0.2407
Epoch 142, Time: 40.2661, Loss: 0.2402
Epoch 143, Time: 41.7960, Loss: 0.2403
Epoch 144, Time: 40.2040, Loss: 0.2361
Epoch 145, Time: 38.6086, Loss: 0.2358
Epoch 146, Time: 41.5329, Loss: 0.2317
Epoch 147, Time: 41.0126, Loss: 0.2303
Epoch 148, Time: 39.7201, Loss: 0.2246
Epoch 149, Time: 42.2592, Loss: 0.2251
Epoch 150, Time: 40.9774, Loss: 0.2233


  labels = torch.tensor(labels).to(device)


Accuracy: 76.61%


  labels = torch.tensor(labels).to(device)


Epoch 151, Time: 40.9936, Loss: 0.2207
Epoch 152, Time: 40.8930, Loss: 0.2221
Epoch 153, Time: 41.7338, Loss: 0.2197
Epoch 154, Time: 40.6207, Loss: 0.2142
Epoch 155, Time: 40.4146, Loss: 0.2155
Epoch 156, Time: 39.2531, Loss: 0.2120
Epoch 157, Time: 39.5401, Loss: 0.2088
Epoch 158, Time: 40.5178, Loss: 0.2082
Epoch 159, Time: 40.6989, Loss: 0.2084
Epoch 160, Time: 40.3163, Loss: 0.2019


  labels = torch.tensor(labels).to(device)


Accuracy: 74.64%


  labels = torch.tensor(labels).to(device)


Epoch 161, Time: 41.1390, Loss: 0.2013
Epoch 162, Time: 41.0576, Loss: 0.2031
Epoch 163, Time: 40.3261, Loss: 0.1997
Epoch 164, Time: 40.6159, Loss: 0.1945
Epoch 165, Time: 41.5105, Loss: 0.1957
Epoch 166, Time: 40.6093, Loss: 0.1935
Epoch 167, Time: 40.3066, Loss: 0.1923
Epoch 168, Time: 39.4017, Loss: 0.1902
Epoch 169, Time: 40.3386, Loss: 0.1876
Epoch 170, Time: 40.6493, Loss: 0.1904


  labels = torch.tensor(labels).to(device)


Accuracy: 76.57%


  labels = torch.tensor(labels).to(device)


Epoch 171, Time: 39.8418, Loss: 0.1829
Epoch 172, Time: 41.2273, Loss: 0.1817
Epoch 173, Time: 41.2821, Loss: 0.1787
Epoch 174, Time: 39.9816, Loss: 0.1739
Epoch 175, Time: 41.2954, Loss: 0.1770
Epoch 176, Time: 41.3141, Loss: 0.1726
Epoch 177, Time: 39.9322, Loss: 0.1704
Epoch 178, Time: 41.3172, Loss: 0.1682
Epoch 179, Time: 39.5866, Loss: 0.1735
Epoch 180, Time: 39.2926, Loss: 0.1665


  labels = torch.tensor(labels).to(device)


Accuracy: 77.04%


  labels = torch.tensor(labels).to(device)


Epoch 181, Time: 40.7774, Loss: 0.1675
Epoch 182, Time: 40.7065, Loss: 0.1652
Epoch 183, Time: 40.6167, Loss: 0.1626
Epoch 184, Time: 41.4731, Loss: 0.1584
Epoch 185, Time: 40.8454, Loss: 0.1587
Epoch 186, Time: 40.5396, Loss: 0.1559
Epoch 187, Time: 41.7262, Loss: 0.1551
Epoch 188, Time: 41.4474, Loss: 0.1537
Epoch 189, Time: 40.2763, Loss: 0.1510
Epoch 190, Time: 40.1089, Loss: 0.1493


  labels = torch.tensor(labels).to(device)


Accuracy: 75.79%


  labels = torch.tensor(labels).to(device)


Epoch 191, Time: 39.7570, Loss: 0.1482
Epoch 192, Time: 39.8716, Loss: 0.1494
Epoch 193, Time: 40.7772, Loss: 0.1482
Epoch 194, Time: 41.1884, Loss: 0.1398
Epoch 195, Time: 40.6372, Loss: 0.1418
Epoch 196, Time: 40.7364, Loss: 0.1355
Epoch 197, Time: 41.5181, Loss: 0.1395
Epoch 198, Time: 40.6588, Loss: 0.1341
Epoch 199, Time: 41.4072, Loss: 0.1358
Epoch 200, Time: 41.4012, Loss: 0.1321


  labels = torch.tensor(labels).to(device)


Accuracy: 72.25%


In [None]:
# Evaluation Loop
def evaluate_model(model, test_loader, device):
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for texts, labels in test_loader:
            # Move labels to GPU
            labels = torch.tensor(labels).to(device)
            # Forward pass with inputs on GPU
            outputs = model(texts, device=device)
            #print(f"outputs: {outputs}")
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            actuals.extend(labels.cpu().numpy())
    # print(f"Actual: {actuals}")
    # print(f"Predictions: {predictions}")
    acc = accuracy_score(actuals, predictions)
    print(f"Test Accuracy: {acc * 100:.2f}%")

In [None]:
evaluate_model(model, train_loader, device)

In [None]:
# Get one batch from the test loader
texts, labels = next(iter(test_loader))

# Print the data
print("Texts:", texts)
print("Labels:", labels)

In [None]:
outputs = model(texts, device=device)

In [None]:
outputs

In [None]:
preds = torch.argmax(outputs, dim=1).cpu().numpy()
preds

In [None]:
evaluate_model(model, train_loader, device)

In [None]:
# Save the entire model
torch.save(model, "detecting_llm_full_v2.pth")
print("Entire model saved successfully!")

In [None]:
# Load the entire model
model = torch.load("detecting_llm_full_v2.pth")
model.to(device)  # Move model to the appropriate device
print("Entire model loaded successfully!")