In [1]:
import sys
from pathlib import Path
# Make the project's local modules (the `src` directory next to the notebook's
# parent working directory) importable from this notebook.
src_path = Path.cwd().parent / 'src'
# Guard keeps the cell idempotent: re-running it does not pile up duplicate
# entries in sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

import numpy as np
import pandas as pd


In [2]:
from data_loader import load_data

# Set up data paths
data_path = Path.cwd().parent / "data" / "ag_news"
train_path = data_path / "train.csv"
test_path = data_path / "test.csv"


def _load_split(csv_path):
    """Load one dataset split and derive model inputs from it.

    Args:
        csv_path: Path of the CSV file, passed straight to ``load_data``.

    Returns:
        A ``(frame, texts, labels)`` tuple where ``frame`` is whatever
        ``load_data`` returns, ``texts`` is the "Title" and "Description"
        columns joined by a single space, and ``labels`` is the
        "Class Index" column; both are taken via ``.values``.

    NOTE(review): ``load_data`` lives in the project's ``data_loader`` module;
    it presumably returns a pandas DataFrame — confirm.
    """
    frame = load_data(csv_path)
    texts = (frame["Title"] + " " + frame["Description"]).values
    labels = frame["Class Index"].values
    return frame, texts, labels


# Load training data
df_train, X_train, y_train = _load_split(train_path)

# Load test data
df_test, X_test, y_test = _load_split(test_path)

# Show training data
# df_train.head()

# Show test data
# df_test.head()


In [3]:
from vectorizer import vectorize_data_fit
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize data
# Default-configured TF-IDF vectorizer. The same `vectorizer` object is passed
# to evaluate_robustness later in the notebook.
vectorizer = TfidfVectorizer()
# NOTE(review): vectorize_data_fit is a project helper (vectorizer module);
# presumably it fits `vectorizer` on X_train and returns the transformed
# features — confirm. The result is used as the training input below.
X_train_vect = vectorize_data_fit(vectorizer, X_train)

In [4]:
from model_trainer import train_model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the (stochastic) Random Forest gives reproducible results once
# its training line below is re-enabled.
RANDOM_SEED = 42

# Train Logistic Regression Model
# This is the only model actually trained here; it is the one evaluated later.
lg_model = LogisticRegression()
lg_trained_model = train_model(lg_model, X_train_vect, y_train)

# Train Naive Bayes Model
# The estimator is constructed, but training is currently disabled.
nb_model = MultinomialNB()
#nb_trained_model = train_model(nb_model, X_train_vect, y_train)

# Train Random Forest Model
# The estimator is constructed, but training is currently disabled.
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)
#rf_trained_model = train_model(rf_model, X_train_vect, y_train)

In [5]:
# from perturbation import apply_perturbation

# #Testing of perturbation module

# X_data = X_test[:5]
# level = 0.6

# perturbed_data = apply_perturbation(
#     X_data,
#     level,
#     save_path=f"perturbed_data/charswap/perturbed_data_{level:.2f}.pkl"
# )



In [6]:
from evaluator import evaluate_robustness

# Testing of evaluator module
# Evaluate on a small leading slice of the test set. Set N_EVAL_SAMPLES to None
# to use the full test set (a `[:None]` slice keeps every element).
N_EVAL_SAMPLES = 10
X_sample = X_test[:N_EVAL_SAMPLES]
y_sample = y_test[:N_EVAL_SAMPLES]

perturbation_levels = [0.0, 0.3, 0.6, 1.0]
metrics = ["base_accuracy", "robustness_score", "effective_robustness"]
# NOTE(review): looks like a filename prefix (not a complete file name) that
# the evaluator extends per perturbation level — confirm against the evaluator
# module. It is relative to the current working directory.
file_path = Path("perturbed_data/perturbed_data_")

# evaluate_robustness returns a pair: per-level results and a mapping of
# summary metric name -> numeric value.
results, metrics_summary = evaluate_robustness(
    lg_trained_model,
    vectorizer,
    X_sample,
    y_sample,
    perturbation_levels,
    metrics,
    file_path
)

print("*** Metrics Summary ***")
for metric, value in metrics_summary.items():
    # The space flag in " .4f" reserves a sign column, so positive and
    # negative values line up.
    print(f"{metric}: {value: .4f}")

results_df = pd.DataFrame(results)
# Header now carries the closing "***", matching the summary header above.
print("*** Results Per Perturbation Level ***")
display(results_df)

  from .autonotebook import tqdm as notebook_tqdm



Perturbation level: 0.0
ORIGINAL: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
PERTURBED: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
ORIGINAL: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.
PERTURBED: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\

Unnamed: 0,perturbation level,accuracy
0,0.0,0.9
1,0.3,0.7
2,0.6,0.7
3,1.0,0.4
