In [None]:
import torch_xla
import torch_xla.core.xla_model as xm

# Ask torch_xla for its XLA device handle and report which one was returned.
device = xm.xla_device()
print(f"Using device: {device}")


Using device: xla:0


In [None]:
from google.colab import drive

# Mount the user's Google Drive at /content/drive.
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
!pip install transformers torch scikit-learn pandas




In [None]:
import pandas as pd

# Locations of the two source CSV files on the mounted Google Drive.
human_dataset_path = "/content/drive/MyDrive/mini project/HumanDataset.csv"
ai_dataset_path = "/content/drive/MyDrive/mini project/AiDataset.csv"

# Load both datasets.
human_texts = pd.read_csv(human_dataset_path)
ai_texts = pd.read_csv(ai_dataset_path)

# Strip stray whitespace from header names before looking columns up.
human_texts.columns = human_texts.columns.str.strip()
ai_texts.columns = ai_texts.columns.str.strip()

# Fail early if either file lacks the column everything downstream relies on.
if "text" not in human_texts.columns or "text" not in ai_texts.columns:
    raise KeyError("Column 'text' not found in dataset. Check CSV files.")

# Rows without any text carry no signal for the classifier; drop them here so
# they are not labelled and fed to training as if they were real samples.
human_texts = human_texts.dropna(subset=["text"]).copy()
ai_texts = ai_texts.dropna(subset=["text"]).copy()

# Class labels: human-written = 0, AI-generated = 1.
human_texts["label"] = 0
ai_texts["label"] = 1

# Merge and shuffle with a fixed seed so re-running the cell reproduces the
# same row order.
df = (
    pd.concat([human_texts, ai_texts])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Show the first rows as a sanity check.
print(df.head())


                                                text  \
0                                                NaN   
1  As cyber threats continue to intersect with em...   
2  Heterogeneous data are the most commonly used ...   
3  Broadly, IoT architectures and paradigms consi...   
4  This paper aims to provide valuable insights f...   

                           DOI  Label  label  
0                          NaN    1.0      1  
1  10.1109/ACCESS.2024.3493957    0.0      0  
2   10.1109/TNNLS.2022.3229161    0.0      0  
3   10.1109/COMST.2019.2910750    0.0      0  
4  10.1109/ACCESS.2024.3502628    0.0      0  


In [None]:
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Replace missing values BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
df["text"] = df["text"].fillna("").astype(str)

# 80/20 train/test split with a fixed seed.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# The tokenizer is handed these lists directly, so every entry must be a str.
assert all(isinstance(text, str) for text in train_texts), "❌ Error: train_texts contains non-string values!"
assert all(isinstance(text, str) for text in test_texts), "❌ Error: test_texts contains non-string values!"

# ✅ Create Dataset Class
class TextDataset(Dataset):
    """Dataset that tokenizes all texts once and serves per-sample tensors.

    Tokenization is done in ``__init__`` with the module-level ``tokenizer``
    (padding and truncation enabled, PyTorch tensors returned).

    Args:
        texts: Texts passed to the tokenizer in a single call.
        labels: Labels, converted to a ``torch.long`` tensor.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            512, the value that was previously hard-coded.
    """

    def __init__(self, texts, labels, max_length=512):
        self.encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        # Stored as a long tensor, as the original code required.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of labels held by the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field indexed at ``idx`` plus a ``"labels"`` entry."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Wrap each split in a TextDataset (train first, then test).
train_dataset, test_dataset = (
    TextDataset(train_texts, train_labels),
    TextDataset(test_texts, test_labels),
)

# Batches of 8 with pinned host memory; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train_dataset, batch_size=8, shuffle=True, pin_memory=True
)
test_loader = DataLoader(
    dataset=test_dataset, batch_size=8, shuffle=False, pin_memory=True
)

print("✅ Data preprocessing completed successfully!")


✅ Data preprocessing completed successfully!


In [None]:
pip install opencv-python




In [None]:
import os
from transformers import BertForSequenceClassification

# The training cell saves the model and tokenizer to this Drive folder, so the
# existence check must look in the same place. The previous path omitted the
# "mini project" folder, so a saved model was never found.
saved_model_dir = "/content/drive/My Drive/mini project/saved_model"

if os.path.exists(saved_model_dir):
    print("✅ Loading pre-trained model...")
    model = BertForSequenceClassification.from_pretrained(saved_model_dir)
else:
    print("🚀 No trained model found! Initializing new model.")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Use CUDA when available, otherwise the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("✅ Model is ready!")


🚀 No trained model found! Initializing new model.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model is ready!


In [None]:
import torch
from torch.amp import GradScaler, autocast

# ✅ Mount Google Drive path
drive_path = "/content/drive/My Drive/mini project/saved_model"

# ✅ Initialize GradScaler correctly for PyTorch 2.0+
scaler = torch.amp.GradScaler("cuda")

def train(model, train_loader, optimizer, criterion, epochs=3):
    """Fine-tune ``model`` with mixed precision and save it to ``drive_path``.

    Args:
        model: Model whose forward pass returns an object with a ``.loss``
            attribute when the batch contains labels.
        train_loader: Iterable of dict batches mapping names to tensors.
        optimizer: Optimizer stepped once per batch through the module-level
            ``scaler``.
        criterion: Accepted for interface compatibility; unused, because the
            loss is taken from ``outputs.loss``.
        epochs: Number of passes over ``train_loader``.

    Side effects:
        Prints the mean loss per epoch, then saves the model and the
        module-level ``tokenizer`` to ``drive_path``.
    """
    # Follow the model's own device instead of assuming CUDA, so the function
    # no longer crashes with a hard-coded "cuda" on a CPU-only runtime.
    device = next(model.parameters()).device
    use_amp = device.type == "cuda"

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for batch in train_loader:
            batch = {key: val.to(device) for key, val in batch.items()}

            optimizer.zero_grad()

            # Autocast is switched off when the model is not on a CUDA device.
            with torch.amp.autocast(device_type="cuda", enabled=use_amp):
                outputs = model(**batch)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        # Guard against an empty loader so the average cannot divide by zero.
        num_batches = max(len(train_loader), 1)
        print(f"✅ Epoch {epoch+1} completed | Loss: {total_loss / num_batches:.4f}")

    # Save the trained model and tokenizer to Google Drive.
    model.save_pretrained(drive_path)
    tokenizer.save_pretrained(drive_path)
    print(f"✅ Model training completed and saved in: {drive_path}")


In [None]:
from huggingface_hub import login
from google.colab import userdata

# 1) Read the Hugging Face token from the Colab secrets store. On any failure
#    print a hint and re-raise the same exception.
try:
    hf_token = userdata.get("HF_TOKEN")
    print("✅ Hugging Face token found!")
except Exception as lookup_error:
    print("❌ Hugging Face token not found. Please add it to Colab secrets.")
    raise lookup_error

# 2) Authenticate against the Hugging Face Hub with that token.
login(token=hf_token)

# 3) Load a model and tokenizer from the Hub.
# NOTE(review): this rebinds the notebook-level names `model` and `tokenizer`
# to the AutoModel / AutoTokenizer objects created here.
from transformers import AutoModel, AutoTokenizer

model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("✅ Hugging Face model and tokenizer loaded successfully!")

✅ Hugging Face token found!
✅ Hugging Face model and tokenizer loaded successfully!


In [None]:
pip install torch torchvision torchaudio




In [None]:
import torch

# Use the CUDA device when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"✅ Using device: {device}")


✅ Using device: cuda


In [None]:
# Report CUDA availability and the number of visible GPUs.
print(torch.cuda.is_available())  # True when a CUDA GPU is usable
print(torch.cuda.device_count())  # Number of visible GPUs

# get_device_name(0) raises when there is no GPU, so only ask for the name
# when CUDA is actually available.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Name of the first GPU
else:
    print("No CUDA GPU detected")


True
1
Tesla T4


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the DistilBERT tokenizer, then a sequence-classification model with two
# output labels from the same checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased"
)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Import Required Libraries
import os
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import transformers

# Limit transformers logging to errors only.
transformers.logging.set_verbosity_error()

# Use the CUDA device when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"✅ Using device: {device}")

# Tokenizer matching the DistilBERT checkpoint used for the classifier.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# The datasets are read from Google Drive, so mount it first.
from google.colab import drive
drive.mount("/content/drive")

# Both CSV files sit side by side in the project folder.
folder_path = "/content/drive/My Drive/mini project"
human_file_path, ai_file_path = (
    os.path.join(folder_path, csv_name)
    for csv_name in ("HumanDataset.csv", "AiDataset.csv")
)

# Load both datasets. On failure, print a hint and re-raise: calling exit()
# inside a notebook shuts the kernel down instead of just stopping this cell,
# and it hides the traceback that explains what went wrong.
try:
    human_texts = pd.read_csv(human_file_path)
    ai_texts = pd.read_csv(ai_file_path)
    print("✅ Datasets loaded successfully!")
except FileNotFoundError:
    print("❌ File not found. Please check the file paths in Google Drive.")
    raise
except Exception as e:
    print(f"❌ An error occurred: {e}")
    raise

# Strip stray whitespace from header names.
human_texts.columns = human_texts.columns.str.strip()
ai_texts.columns = ai_texts.columns.str.strip()

# Fail early if either file lacks the "text" column.
if "text" not in human_texts.columns or "text" not in ai_texts.columns:
    raise KeyError("❌ Column 'text' not found in dataset. Check CSV files.")

# Fill missing values BEFORE casting to str: astype(str) would turn NaN into
# the literal string "nan", leaving fillna("") with nothing to replace.
human_texts["text"] = human_texts["text"].fillna("").astype(str)
ai_texts["text"] = ai_texts["text"].fillna("").astype(str)

# Class labels: 0 = human, 1 = AI.
human_texts["label"] = 0
ai_texts["label"] = 1

# Combine both sources and shuffle with a fixed seed.
df = pd.concat([human_texts, ai_texts]).sample(frac=1, random_state=42).reset_index(drop=True)

# 80% train / 20% test, also seeded.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

#  Create PyTorch dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes every text once and serves per-sample tensors.

    The module-level ``tokenizer`` is called once in ``__init__`` with padding
    and truncation enabled; ``max_length`` (default 256) is the truncation
    limit. Labels are stored as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, max_length=256):
        tokenizer_options = dict(
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.encodings = tokenizer(texts, **tokenizer_options)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of stored labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return each encoding field indexed at ``idx`` plus a ``"labels"`` entry."""
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample["labels"] = self.labels[idx]
        return sample

# Tokenize each split with a 256-token truncation limit (train first, then test).
train_dataset, test_dataset = (
    TextDataset(train_texts, train_labels, max_length=256),
    TextDataset(test_texts, test_labels, max_length=256),
)

# Batches of 4; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=False)

# Start from pretrained DistilBERT weights unless something already exists at
# model_path, in which case load the model from there.
model_path = "/content/saved_model"
if not os.path.exists(model_path):
    print("🚀 No trained model found! Initializing a new model.")
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )
else:
    print("✅ Loading pre-trained model...")
    model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Move the model to the selected device.
model.to(device)

# AdamW optimizer and a cross-entropy criterion for the training call.
optimizer = AdamW(params=model.parameters(), lr=2e-5, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()

#  Training function
def train(model, train_loader, optimizer, criterion, epochs=3):
    """Run ``epochs`` passes over ``train_loader`` and save the result.

    Each batch is moved to the module-level ``device``. The loss comes from
    ``outputs.loss``; ``criterion`` is accepted but not used. After the last
    epoch the model and the module-level ``tokenizer`` are saved to
    ``model_path``.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for raw_batch in train_loader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            optimizer.zero_grad()
            loss = model(**inputs).loss
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        mean_loss = running_loss / len(train_loader)
        print(f"✅ Epoch {epoch_number} completed | Loss: {mean_loss:.4f}")

    # Save the model and tokenizer once training has finished.
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    print("✅ Model training completed and saved successfully!")

#  Evaluation function
def evaluate(model, test_loader):
    """Print accuracy and a classification report for ``test_loader``.

    Batches are moved to the module-level ``device``. Predictions are the
    argmax of the logits along dimension 1; true labels are read from each
    batch's ``"labels"`` entry.
    """
    model.eval()
    predictions = []
    true_labels = []

    # Inference only: gradients are not needed.
    with torch.no_grad():
        for raw_batch in test_loader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**inputs).logits

            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(inputs["labels"].cpu().numpy())

    print("✅ Model Accuracy:", accuracy_score(true_labels, predictions))
    print("📊 Classification Report:\n", classification_report(true_labels, predictions))

# Fine-tune only when nothing exists at model_path yet.
if not os.path.exists(model_path):
    train(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )

# Evaluation runs regardless of whether training just happened.
evaluate(model=model, test_loader=test_loader)

#  Prediction function
def predict_text(text):
    """Classify ``text`` with the module-level model and tokenizer.

    Returns ``"AI-generated"`` when the argmax class is 1, otherwise
    ``"Human-generated"``. Input is truncated to 256 tokens.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    ).to(device)

    with torch.no_grad():
        logits = model(**encoded).logits
        predicted_class = torch.argmax(logits, dim=1).item()

    if predicted_class == 1:
        return "AI-generated"
    return "Human-generated"

# Interactive prompt: classify whatever the user types until they enter "exit".
while True:
    try:
        user_input = input("\nEnter a text to check (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the loop without a traceback.
        break
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Blank input has nothing to classify; ask again.
        continue
    print("📝 Prediction:", predict_text(user_input))


✅ Using device: cpu
Mounted at /content/drive
✅ Datasets loaded successfully!
🚀 No trained model found! Initializing a new model.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

KeyboardInterrupt: 