In [None]:
# TODO: need code for an embedding model on top
# NOTE(review): placeholder stub, not valid Python as written -- the `if` has
# no trailing colon or body, `f{...}` is not an f-string, the parameter name
# `int` shadows the builtin, and `parse` is not defined in the visible code.
# Intent is unclear; confirm with the author before implementing or removing.
def arguement(int):
  if parse.args == f{string:"print samn"}

In [None]:

!pip install datasets        # notebook syntax; from a plain shell run:  pip install datasets
!pip install -U "gcsfs==2024.12.0"

import sys, json, random, argparse, os
import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# ---------- helpers --------------------------------------------------------- #
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to every generator (defaults to 42).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # Nothing more to do on machines without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def build_dataset(csv_path: str, test_size: float = 0.4, seed: int = 42) -> DatasetDict:
    """Load a labelled CSV and return a stratified train/test split.

    Args:
        csv_path: Path to a CSV file with ``text`` and ``label`` columns.
        test_size: Fraction of rows held out for the test split
            (default 0.4, i.e. a 60 / 40 train/test split).
        seed: Seed for the shuffle, so the split is reproducible.

    Returns:
        A ``DatasetDict`` with ``train`` and ``test`` splits.
    """
    data = load_dataset("csv", data_files=csv_path)["train"]
    # Columns read from a CSV are plain ``Value`` features, but
    # ``train_test_split(stratify_by_column=...)`` only accepts a
    # ``ClassLabel`` column and raises ValueError otherwise, so encode first.
    # NOTE(review): class ids follow the sorted string form of the distinct
    # labels, so integer 0/1 labels should keep their values -- confirm for
    # other label schemes.
    data = data.class_encode_column("label")
    splits = data.train_test_split(
        test_size=test_size, seed=seed, stratify_by_column="label"
    )
    return DatasetDict(train=splits["train"], test=splits["test"])


def tokenize_function(examples, tokenizer):
    """Run ``tokenizer`` over the ``"text"`` entry of ``examples`` with truncation on.

    Args:
        examples: Mapping that holds the raw text under the key ``"text"``.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from an evaluation result.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/F1 use ``average="binary"``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average="binary"
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f_score,
    }


# ---------- training -------------------------------------------------------- #
def train(csv_path: str, out_dir: str, epochs: int = 3, batch: int = 8,
          model_name: str = "bert-base-uncased"):
    """Fine-tune a two-class sequence classifier on a labelled CSV and save it.

    The CSV is split by ``build_dataset``, tokenized, and trained with the
    Hugging Face ``Trainer``. Evaluation and checkpointing run once per epoch.
    Final evaluation metrics are printed as JSON, then the model and the
    tokenizer are written to ``out_dir``.

    Args:
        csv_path: Path to the CSV file with ``text`` and ``label`` columns.
        out_dir: Directory for checkpoints and the final model/tokenizer.
        epochs: Number of training epochs.
        batch: Per-device batch size for both training and evaluation.
        model_name: Pretrained checkpoint used for the tokenizer and the model.
    """
    import inspect  # local import: only needed for the signature probes below

    set_seed()
    ds = build_dataset(csv_path)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ds = ds.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        remove_columns=["text"],
    )

    data_collator = DataCollatorWithPadding(tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    )

    # transformers renamed `evaluation_strategy` -> `eval_strategy` and
    # `Trainer(tokenizer=...)` -> `Trainer(processing_class=...)`; the legacy
    # names are deprecated/removed in newer releases. Probe the installed
    # signatures so the same code runs on both sides of the rename.
    training_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = (
        "eval_strategy" if "eval_strategy" in training_params
        else "evaluation_strategy"
    )
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = (
        "processing_class" if "processing_class" in trainer_params
        else "tokenizer"
    )

    args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch,
        num_train_epochs=epochs,
        save_strategy="epoch",
        weight_decay=0.01,
        report_to="none",
        **{eval_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    print("\nFINAL METRICS:", json.dumps(trainer.evaluate(), indent=2))
    trainer.save_model(out_dir)
    tokenizer.save_pretrained(out_dir)


# ---------- inference ------------------------------------------------------- #
def predict_one(text: str, model_dir: str):
    """Classify one passage with a saved model and print the verdict.

    Loads the tokenizer and the classifier from ``model_dir``, runs a single
    forward pass without gradient tracking, and prints the label together
    with the softmax probability of class index 0.

    NOTE(review): class index 0 is treated as the "AI" class -- confirm this
    matches the label encoding of the training CSV.

    Args:
        text: Passage to classify.
        model_dir: Directory holding the saved model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = model(**encoded)

    probabilities = torch.softmax(output.logits, dim=-1)
    prob_ai = probabilities[0, 0].item()  # first row, class 0
    if prob_ai >= 0.5:
        label = "AI‑GENERATED"
    else:
        label = "HUMAN"
    print(f"\nResult: {label}   (prob_AI = {prob_ai:.3f})")


# ---------- CLI ------------------------------------------------------------- #
if __name__ == "__main__":
    # Command-line entry point with two sub-commands: `train` and `predict`.
    cli = argparse.ArgumentParser(description="BERT AI‑text cheat detector")
    commands = cli.add_subparsers(dest="cmd", required=True)

    # train <csv_path> <out_dir> [--epochs N] [--batch N]
    train_cmd = commands.add_parser("train", help="fine‑tune on a labelled CSV")
    train_cmd.add_argument("csv_path")
    train_cmd.add_argument("out_dir")
    train_cmd.add_argument("--epochs", type=int, default=3)
    train_cmd.add_argument("--batch", type=int, default=8)

    # predict <text> <model_dir>
    predict_cmd = commands.add_parser("predict", help="classify a single passage")
    predict_cmd.add_argument("text")
    predict_cmd.add_argument("model_dir")

    options = cli.parse_args()

    # Anything other than `train` is dispatched to prediction.
    if options.cmd == "train":
        train(options.csv_path, options.out_dir, options.epochs, options.batch)
    else:
        predict_one(options.text, options.model_dir)
