In [None]:
!pip install textattack

Collecting textattack
  Downloading textattack-0.3.10-py3-none-any.whl.metadata (38 kB)
Collecting bert-score>=0.3.5 (from textattack)
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting flair (from textattack)
  Downloading flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting language-tool-python (from textattack)
  Downloading language_tool_python-2.9.4-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lemminflect (from textattack)
  Downloading lemminflect-0.2.3-py3-none-any.whl.metadata (7.0 kB)
Collecting lru-dict (from textattack)
  Downloading lru_dict-1.3.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting terminaltables (from textattack)
  Downloading terminaltables-3.1.10-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting word2number (from textattack)
 

In [None]:
from textattack.datasets import HuggingFaceDataset

# Wrap the "test" split of the Hugging Face "imdb" dataset in TextAttack's
# dataset class so it can be passed to an Attacker.
dataset = HuggingFaceDataset("imdb", split="test")  # Example with IMDb


In [None]:
# Show the first example as a quick sanity check of what the dataset yields.
dataset[0]

In [None]:
!pip install nltk
import nltk
nltk.download('averaged_perceptron_tagger')
# Download the specific English language data for the averaged_perceptron_tagger
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from textattack.datasets import HuggingFaceDataset
from textattack.attack_recipes import TextFoolerJin2019
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack import Attacker, AttackArgs
import pandas as pd
from tqdm import tqdm
from textattack.attack_results import SuccessfulAttackResult, FailedAttackResult, SkippedAttackResult

# 1. IMDb test split, wrapped for TextAttack
dataset = HuggingFaceDataset("imdb", split="test")

# 2. Pre-trained classifier and its tokenizer
model_name = "textattack/bert-base-uncased-imdb"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the weights and place the model on the chosen device in one step.
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# 3. TextFooler attack recipe built against the wrapped model
attack = TextFoolerJin2019.build(model_wrapper)

# 4. Attack configuration: attack 20 examples and keep per-example output off stdout
attack_args = AttackArgs(
    num_examples=20,  # adjust number as needed
    disable_stdout=True,
)

# 5. Run the attack over the dataset and collect the results
attacker = Attacker(attack, dataset, attack_args)
attack_results = attacker.attack_dataset()

# 6. Turn every attack result into one flat record
results = []

for result in tqdm(attack_results):
    perturbed = result.perturbed_result
    if perturbed is None:
        continue

    adversarial_text = perturbed.attacked_text.text

    # Outcome code: 1 = successful attack, 0 = failed attack, -1 = anything else (skipped).
    if isinstance(result, SuccessfulAttackResult):
        outcome_code = 1
    elif isinstance(result, FailedAttackResult):
        outcome_code = 0
    else:
        outcome_code = -1

    # Score the perturbed text with the model, keeping tensors on the model's device.
    encoded = tokenizer(
        adversarial_text, return_tensors="pt", truncation=True, padding=True
    ).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    class_probs = torch.softmax(logits, dim=-1).squeeze()
    adv_prob = float(class_probs[1])  # Assuming label '1' is positive sentiment

    results.append({
        "original_text": result.original_result.attacked_text.text,
        "adversarial_text": adversarial_text,
        "attack_type": "TextFoolerJin2019",
        "is_adversarial": outcome_code,
        "adversarial_probability": round(adv_prob, 4)
    })

# 7. Write the records to disk
df = pd.DataFrame(results)
df.to_csv("adversarial_output.csv", index=False)

print("✅ Done. Results saved to 'adversarial_output.csv'")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Load the attack results written by the attack cell
df = pd.read_csv("adversarial_output.csv")

# Use 'adversarial_text' if available, otherwise fallback to original
df['text'] = df['adversarial_text'].fillna(df['original_text'])

# Skipped attacks are stored as -1. Drop them so the target is strictly binary
# (0 = attack failed, 1 = attack succeeded) instead of silently becoming a
# third class for the classifier.
df = df[df['is_adversarial'].isin([0, 1])]

# Features and label
X = df['text']
y = df['is_adversarial']

# Split for training/testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text using TF-IDF; the vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train logistic regression classifier
clf = LogisticRegression()
clf.fit(X_train_vec, y_train)

#


In [None]:
import joblib

# Persist the fitted classifier and vectorizer, then reload them so the
# prediction below runs against the saved artifacts.
joblib.dump(clf, "adv_detector_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# NOTE: joblib.load unpickles arbitrary objects; only load files you created yourself.
clf = joblib.load("adv_detector_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Get user input
user_input = input("Enter your sentence: ")

# Vectorize and predict
user_vec = vectorizer.transform([user_input])
label = clf.predict(user_vec)[0]

# predict_proba columns follow clf.classes_, so look up the column that belongs
# to class 1 rather than assuming it is at index 1 (it is not when the model
# was also trained on the -1 "skipped" label).
known_classes = list(clf.classes_)
if 1 in known_classes:
    proba = clf.predict_proba(user_vec)[0][known_classes.index(1)]
else:
    proba = 0.0  # the model never saw an example labelled 1

print("\n🔎 Prediction:")
print(f"→ Is adversarial? {'Yes' if label == 1 else 'No'}")
print(f"→ Adversarial probability: {round(proba, 4)}")

In [None]:
!pip install --upgrade transformers

In [None]:
!pip install --upgrade transformers accelerate

**FINAL OUTPUT**

In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline

# Disable WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Load and preprocess dataset
df = pd.read_csv("adversarial_output.csv")
df['text'] = df['adversarial_text'].fillna(df['original_text'])
df['is_adversarial'] = df['is_adversarial'].astype(int)
df = df[df['is_adversarial'].isin([0, 1])]  # keep only 0/1 labels (drops the -1 "skipped" rows)
df = df[['text', 'is_adversarial']]

# Convert to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(df)

# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize(example):
    """Tokenize one example's 'text', padding/truncating to the tokenizer's max length."""
    return tokenizer(example['text'], padding="max_length", truncation=True)

hf_dataset = hf_dataset.map(tokenize)
hf_dataset = hf_dataset.rename_column("is_adversarial", "label")
hf_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Train/Test split, seeded so the same rows land in each split on every run
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

# Load model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# TrainingArguments (compatible with old transformers)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=100,
    save_steps=100,
    do_eval=True,
    do_train=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"]
)

# Train the model
trainer.train()

# Save model and tokenizer side by side so they can be reloaded from one directory
model.save_pretrained("adv-bert-model")
tokenizer.save_pretrained("adv-bert-model")

# Inference
clf = pipeline("text-classification", model="adv-bert-model", tokenizer="adv-bert-model")

# User input
while True:
    text = input("\nEnter a sentence to classify (or type 'exit' to quit): ")
    if text.strip().lower() == "exit":
        break
    # Truncate long inputs the same way the training data was truncated.
    result = clf(text, truncation=True)[0]
    label = result['label']
    score = result['score']

    # The pipeline returns the confidence of the *predicted* label. For this
    # two-label model, convert it to the probability of LABEL_1 so the number
    # printed below really is the adversarial probability.
    is_adversarial = label == 'LABEL_1'
    adversarial_probability = score if is_adversarial else 1 - score

    print("\n🔍 Result:")
    print(f"→ Is adversarial? {'Yes' if is_adversarial else 'No'}")
    print(f"→ Adversarial probability: {round(adversarial_probability, 4)}")


In [None]:
pip install evaluate matplotlib


In [None]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        eval_pred: A (logits, labels) pair; logits is indexed as
            (examples, classes) and labels holds the reference class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    # Accuracy is computed directly so this function does not depend on a
    # module-level `accuracy_metric` object having been created beforehand.
    acc = float((predictions == labels).mean())
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [None]:
# Build a Trainer from the existing model, training_args and hf_dataset splits,
# passing compute_metrics so evaluation reports whatever that function returns.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    compute_metrics=compute_metrics
)


In [None]:
!pip install -U evaluate
import evaluate

# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
!pip install --upgrade datasets

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy metric's result for argmax predictions.

    `eval_pred` unpacks into (logits, labels); the predicted class of each
    example is the index of its largest logit along the last axis.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return accuracy.compute(references=labels, predictions=predicted_classes)


In [None]:
# Install required packages (if not already installed)
!pip install -U evaluate

# Import the library
import evaluate

# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

# Example compute_metrics function for use in Trainer or evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the IMDb dataset and the tokenizer for bert-base-uncased
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" field, truncating and padding to the maximum length."""
    return tokenizer(examples["text"], truncation=True, padding="max_length")

# batched=True hands tokenize_function a batch of examples per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch: expose only the columns listed here, as torch tensors
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load bert-base-uncased with a two-label classification head
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training arguments (using eval_steps for older versions)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    eval_steps=500,  # NOTE(review): no evaluation strategy is set here, so this interval may have no effect — confirm against the installed transformers version
)

# Accuracy-only metric function handed to the Trainer
def compute_metrics(p):
    """Return {"accuracy": ...} for a (predictions, labels) pair.

    The predicted class of each example is the argmax of its scores along the
    last axis.
    """
    scores, labels = p
    predicted_labels = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_labels)}

# Initialize the Trainer on 1000-example subsets of each split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(1000)),
    # Shuffle before subsetting, exactly as for the training subset: taking the
    # first 1000 rows of an unshuffled split risks an evaluation set made up of
    # a single label, which would make the reported accuracy meaningless.
    eval_dataset=tokenized_datasets["test"].shuffle(seed=42).select(range(1000)),
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# The Trainer keeps its logged entries in `state.log_history`
train_loss = trainer.state.log_history

# Keep only the entries that carry a "loss" value
loss_values = [log_entry["loss"] for log_entry in train_loss if "loss" in log_entry]

# Draw the loss curve
plt.plot(loss_values)
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.title("Training Loss Over Time")
plt.show()

# Run evaluation on the Trainer's eval dataset and report the accuracy
eval_results = trainer.evaluate()
print("Test Accuracy: {:.4f}".format(eval_results['eval_accuracy']))


In [None]:
# Save the trained model and tokenizer to a directory
# NOTE(review): when the notebook is run top to bottom, `model` here is the
# classifier fine-tuned on IMDb labels in the preceding training cell, not the
# adversarial detector saved earlier as "adv-bert-model" — confirm that this is
# the model the Streamlit app is meant to serve.
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")


In [None]:
!pip install streamlit

In [None]:
!pip install streamlit
!pip install pyngrok


In [None]:
import streamlit as st
from transformers import pipeline

# Load your trained model from 'saved_model'
@st.cache_resource
def load_model():
    """Build the text-classification pipeline from the 'saved_model' directory.

    Decorated with st.cache_resource so the pipeline is created once and reused.
    """
    return pipeline("text-classification", model="saved_model", tokenizer="saved_model")

clf = load_model()

# Streamlit Interface
st.title("🛡️ Adversarial Text Detector")
st.write("Enter a sentence to check if it's adversarial.")

user_input = st.text_area("Enter a sentence:", height=150)

if st.button("Analyze"):
    if user_input.strip():
        result = clf(user_input)[0]
        label = result['label']
        score = result['score']

        # `score` is the confidence of the predicted label; convert it to the
        # probability of LABEL_1 so the figure shown matches its caption.
        is_adversarial = label == 'LABEL_1'
        adversarial_probability = score if is_adversarial else 1 - score

        st.subheader("🔍 Result")
        st.write(f"**→ Is adversarial?** {'Yes' if is_adversarial else 'No'}")
        st.write(f"**→ Adversarial Probability:** {round(adversarial_probability, 4)}")
    else:
        st.warning("Please enter some text.")


In [None]:
!pip install streamlit transformers pyngrok --quiet


In [None]:
from google.colab import files
# Open Colab's upload dialog. The next cell unzips "saved_model.zip", so upload
# the zipped saved_model folder here.
uploaded = files.upload()  # Zip your saved_model folder first


In [None]:
!unzip saved_model.zip


In [None]:
%%writefile app.py
import streamlit as st
from transformers import pipeline

# Load your trained model
@st.cache_resource
def load_model():
    return pipeline("text-classification", model="saved_model", tokenizer="saved_model")

clf = load_model()

st.title("🛡️ Adversarial Text Detector")
st.write("Enter a sentence to check if it's adversarial.")

user_input = st.text_area("Enter a sentence:", height=150)

if st.button("Analyze"):
    if user_input.strip():
        result = clf(user_input)[0]
        label = result['label']
        score = result['score']

        st.subheader("🔍 Result")
        st.write(f"**→ Adversarial Probability:** {round(score, 4)}")
    else:
        st.warning("Please enter some text.")


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never store the ngrok authtoken in the notebook. Read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# NOTE: a token that was ever pasted into this cell should be treated as leaked
# and rotated in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)


In [None]:
from pyngrok import ngrok
# Start the Streamlit app in the background, sending its output to /content/log.txt.
!streamlit run app.py &>/content/log.txt &
# Open an ngrok tunnel to port 8501 and print the resulting public URL.
# NOTE(review): 8501 is presumably the port Streamlit listens on — check log.txt if the URL does not load.
public_url = ngrok.connect(8501)
print("Streamlit app URL:", public_url)
