In [5]:
pip install transformers

Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.46.2-py3-none-any.whl (10.0 MB)


In [3]:
import pandas as pd
import re

# Load the CSV file
df = pd.read_csv("cleaned_file_with_text_length.csv")
df


Unnamed: 0,text,label,text_length
0,there are many companies that sell gold becaus...,1,588
1,"instead of "" grower "" and "" shower "", i'm a fa...",0,676
2,most malwares tend to hide their malicious act...,1,1021
3,fast food chains often place their restaurants...,1,535
4,sure! when a man is in the stage of sleep call...,1,700
...,...,...,...
82992,"the tags on mattresses and pillows often say ""...",1,689
82993,recent studies have shown great interest in id...,1,1210
82994,"way back in the day, there were computers that...",0,3789
82995,the radio spectrum is a range of frequencies t...,1,869


In [8]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification
import torch

model_name = "google/electra-base-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Binary classification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


  return self.fget.__get__(instance, owner)()
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [9]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import ElectraTokenizer

max_length = 512
tokens = tokenizer(
    df['text'].tolist(),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt"
)

labels = torch.tensor(df['label'].values)
dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)

from torch.utils.data import random_split

train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [10]:
from transformers import AdamW
from tqdm import tqdm

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

epochs = 10
model.train()

for epoch in range(epochs):
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")


Training Epoch 1/10: 100%|██████████| 4150/4150 [14:13<00:00,  4.86it/s]


Epoch 1/10 - Loss: 0.0926, Accuracy: 0.9656


Training Epoch 2/10: 100%|██████████| 4150/4150 [14:11<00:00,  4.87it/s]


Epoch 2/10 - Loss: 0.0417, Accuracy: 0.9851


Training Epoch 3/10: 100%|██████████| 4150/4150 [13:41<00:00,  5.05it/s]


Epoch 3/10 - Loss: 0.0232, Accuracy: 0.9924


Training Epoch 4/10: 100%|██████████| 4150/4150 [13:54<00:00,  4.97it/s]


Epoch 4/10 - Loss: 0.0153, Accuracy: 0.9948


Training Epoch 5/10: 100%|██████████| 4150/4150 [13:45<00:00,  5.03it/s]


Epoch 5/10 - Loss: 0.0113, Accuracy: 0.9960


Training Epoch 6/10: 100%|██████████| 4150/4150 [13:45<00:00,  5.03it/s]


Epoch 6/10 - Loss: 0.0082, Accuracy: 0.9970


Training Epoch 7/10: 100%|██████████| 4150/4150 [13:40<00:00,  5.06it/s]


Epoch 7/10 - Loss: 0.0068, Accuracy: 0.9976


Training Epoch 8/10: 100%|██████████| 4150/4150 [13:41<00:00,  5.05it/s]


Epoch 8/10 - Loss: 0.0065, Accuracy: 0.9977


Training Epoch 9/10: 100%|██████████| 4150/4150 [13:49<00:00,  5.01it/s]


Epoch 9/10 - Loss: 0.0051, Accuracy: 0.9983


Training Epoch 10/10: 100%|██████████| 4150/4150 [13:50<00:00,  5.00it/s]

Epoch 10/10 - Loss: 0.0050, Accuracy: 0.9981





In [13]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average="binary")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


Test Accuracy: 0.9700
Precision: 0.9488, Recall: 0.9937, F1 Score: 0.9707


In [16]:
pip install protobuf

Collecting protobuf
  Downloading protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
Successfully installed protobuf-5.28.3
[0mNote: you may need to restart the kernel to use updated packages.


In [19]:
# Specify the path where you want to save the model
save_directory = "modelElectra_ai_human"

# Save the model weights and configuration
model.save_pretrained(save_directory)

# Save the tokenizer (if you used a custom tokenizer, you should save it too)
tokenizer = ElectraTokenizer.from_pretrained(model_name)  # Load the tokenizer
tokenizer.save_pretrained(save_directory)

('modelElectra_ai_human/tokenizer_config.json',
 'modelElectra_ai_human/special_tokens_map.json',
 'modelElectra_ai_human/vocab.txt',
 'modelElectra_ai_human/added_tokens.json')