In [1]:
import sys
from pathlib import Path
# Make the project's local modules (data_loader, vectorizer, ...) importable.
# The path is resolved relative to the current working directory, so the
# notebook is expected to be launched from a sibling directory of `src`.
src_path = Path.cwd().parent / 'src'

# Guard the append so re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

import numpy as np
import pandas as pd


In [2]:
from data_loader import load_data

# Locate the AG News CSV files relative to the current working directory.
data_path = Path.cwd().parent / "data" / "ag_news"
train_path = data_path / "train.csv"
test_path = data_path / "test.csv"

# Training split: each model input is the title and description joined by a
# single space; the target is the "Class Index" column.
df_train = load_data(train_path)
X_train = (df_train["Title"] + " " + df_train["Description"]).values
y_train = df_train["Class Index"].values

# Test split, built exactly the same way as the training split.
df_test = load_data(test_path)
X_test = (df_test["Title"] + " " + df_test["Description"]).values
y_test = df_test["Class Index"].values

# Preview both splits. A notebook cell only renders its *last* bare
# expression, so a bare `df_train.head()` in the middle of the cell would be
# silently discarded. `display` (provided by the IPython kernel) renders each
# frame explicitly.
display(df_train.head(), df_test.head())

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizer import vectorize_data

# Turn the raw train/test texts into TF-IDF feature matrices using a
# default-configured vectorizer.
vectorizer = TfidfVectorizer()
X_train_vect, X_test_vect = vectorize_data(
    vectorizer,
    X_train,
    X_test,
    True,  # NOTE(review): meaning of this positional flag is defined in vectorizer.py — confirm
)

In [None]:
from model_trainer import train_model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Logistic regression is the only model that is actually fitted in this cell;
# its trained instance is what the later robustness evaluation consumes.
lg_model = LogisticRegression()
lg_trained_model = train_model(lg_model, X_train_vect, y_train)

# Naive Bayes: the estimator is instantiated, but training is currently
# disabled. Re-enable the line below to fit it.
nb_model = MultinomialNB()
#nb_trained_model = train_model(nb_model, X_train_vect, y_train)

# Random forest: training is currently disabled as well. The estimator is
# seeded so that, once re-enabled, repeated runs build the same forest and
# produce reproducible results.
rf_model = RandomForestClassifier(random_state=42)
#rf_trained_model = train_model(rf_model, X_train_vect, y_train)

In [31]:
from perturbation import apply_perturbation

# Use the second test document as a small before/after demonstration.
sample_text = X_test[1]

# The helper is given a one-element list and a level of 1.0.
# NOTE(review): presumably 1.0 is the strongest perturbation level — confirm
# against perturbation.py.
perturbed_data = apply_perturbation(
    [sample_text],
    1.0,
)

# Original text first, perturbed version second.
print(sample_text)
print(perturbed_data[0])


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\t440p\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.
The race is on: s private squad Set launch see for homo spacefaring (blank.com) blank.com - Toronto, Canada -- amp endorse\squad of rocketeers compete for the  #thirty-six;x 1000000 Ansari Adam swag, a competition for\privately fund subocular place trajectory, has formally proclaimed the beginning\plunge appointment for its man projectile.


In [None]:
from evaluator import evaluate_robustness

# Ten evenly spaced perturbation levels covering 0.1 through 1.0 inclusive.
perturbation_levels = np.linspace(0.1, 1.0, 10)

# NOTE(review): despite the variable name, these strings look like metric
# names rather than models — confirm against evaluate_robustness's signature.
models = [
    "base_accuracy",
    "mce",
    "rce",
    "robustness_score",
    "effective_robustness",
]

# Evaluate the trained logistic-regression model on the raw test texts at
# every perturbation level.
results, metrics_summary = evaluate_robustness(
    lg_trained_model, vectorizer, X_test, y_test, perturbation_levels, models
)