In [None]:
import sys
from pathlib import Path

# Make the project's sibling ``src`` directory importable from this notebook.
src_path = Path.cwd().parent / 'src'

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

import numpy as np
import pandas as pd


In [None]:
from data_loader import load_data

# Locate the AG News CSV files relative to the notebook's parent directory.
data_path = Path.cwd().parent / "data" / "ag_news"
train_path = data_path / "train.csv"
test_path = data_path / "test.csv"

# Load the training split and pull out the columns used downstream.
# NOTE(review): load_data is assumed to return a pandas DataFrame (it is
# indexed by column name and .head() is called on it) -- confirm in data_loader.
df_train = load_data(train_path)
X_title_train = df_train["Title"].values
X_desc_train = df_train["Description"].values
y_train = df_train["Class Index"].values

# Load the test split the same way.
df_test = load_data(test_path)
X_title_test = df_test["Title"].values
X_desc_test = df_test["Description"].values
y_test = df_test["Class Index"].values

# A notebook cell only auto-renders its *final* expression, so a bare
# `df_train.head()` in the middle of the cell is silently discarded.
# Render both previews explicitly instead.

# Show training data
display(df_train.head())

# Show test data
display(df_test.head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from vectorizer import vectorize_data_fit

# One independent TF-IDF vectorizer per text field (title and description),
# both left at scikit-learn's default settings.
title_vectorizer, desc_vectorizer = TfidfVectorizer(), TfidfVectorizer()

# Build the training feature matrix from both fields.
# NOTE(review): presumably vectorize_data_fit fits both vectorizers on the
# training text and returns the combined features -- confirm in vectorizer.
X_train_combined_vect = vectorize_data_fit(
    title_vectorizer,
    desc_vectorizer,
    X_title_train,
    X_desc_train,
)

In [None]:
from model_trainer import train_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# --- Logistic Regression -------------------------------------------------
# The only classifier that is actually fitted in this cell.
lg_model = LogisticRegression()
lg_trained_model = train_model(lg_model, X_train_combined_vect, y_train)

# --- Naive Bayes ---------------------------------------------------------
# Estimator is instantiated, but the fit call is currently disabled.
nb_model = MultinomialNB()
# nb_trained_model = train_model(nb_model, X_train_combined_vect, y_train)

# --- Random Forest -------------------------------------------------------
# Estimator is instantiated, but the fit call is currently disabled.
rf_model = RandomForestClassifier()
# rf_trained_model = train_model(rf_model, X_train_combined_vect, y_train)

In [None]:
from perturbation import apply_perturbation

# Smoke test of the perturbation module on the first five test examples.
title_data = X_title_test[:5]
desc_data = X_desc_test[:5]
level = 0.8

# The save path must be an f-string: without the `f` prefix the text
# "{level:.2f}" is passed through literally instead of being replaced by the
# perturbation level (e.g. "perturbed_data_0.80.pkl").
perturbed_data = apply_perturbation(
    [title_data, desc_data],
    level,
    save_path=f"perturbed_data/perturbed_data_{level:.2f}.pkl",
)



In [17]:
from evaluator import evaluate_robustness

# Evaluate on a small five-example slice of the test set
# (swap in the full X_test / y_test for a complete run).
X_title_sample, X_desc_sample = X_title_test[:5], X_desc_test[:5]
y_sample = y_test[:5]
X_sample = [X_title_sample, X_desc_sample]

# Evaluation settings.
# Alternative level grid kept for reference: np.linspace(0, 0.6, 4)
perturbation_levels = [0.2, 0.4, 0.6, 0.8]
metrics = ["base_accuracy", "robustness_score", "effective_robustness"]
vectorizers = [title_vectorizer, desc_vectorizer]

# Run the robustness evaluation for the trained logistic regression model.
results, metrics_summary = evaluate_robustness(
    lg_trained_model, vectorizers, X_sample, y_sample, perturbation_levels, metrics
)

# Aggregate metrics, one per line, printed to four decimal places.
print("*** Metrics Summary ***")
for metric, value in metrics_summary.items():
    print(f"{metric}: {value: .4f}")

# Per-level results rendered as a table.
results_df = pd.DataFrame(results)
print("*** Results Per Perturbation Level")
display(results_df)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\t440p\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Perturbation level: 0.2
Titles:
ORIGINAL: Fears for T N pension after talks
PERTURBED: Fears for T N pension after tetraiodothyronine talks
ORIGINAL: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com)
PERTURBED: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com)
ORIGINAL: Ky. Company Wins Grant to Study Peptides (AP)
PERTURBED: Ky. Company Wins Grant to Study Peptides (AP)
ORIGINAL: Prediction Unit Helps Forecast Wildfires (AP)
PERTURBED: Prediction Unit Helps Forecast Wildfires (AP)
ORIGINAL: Calif. Aims to Limit Farm-Related Smog (AP)
PERTURBED: Calif. Aims to Limit Farm-Related Smog (AP)
Descriptions:
ORIGINAL: Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
PERTURBED: Unions representing workers Turner   Newall they are 'disappointed' after with stricken parent firm Federal Mogul.
ORIGINAL: SPACE.com - TORONTO, Canada -- A second\t

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\t440p\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


10{"stdout":"[{\"variableName\": \"ID_TO_MEANING\", \"type\": \"dictionary\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"builtins.dict\"}, {\"variableName\": \"NULL\", \"type\": \"unknown\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"_pydevd_bundle.pydevd_constants.Null\"}]\n","stderr":"","mime":[]}

Perturbation level: 0.4
Titles:
ORIGINAL: Fears for T N pension after talks
PERTURBED: Fears N pension after talks
ORIGINAL: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com)
PERTURBED: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com)
ORIGINAL: Ky. Company Wins Grant to Study Peptides (AP)
PERTURBED: Ky. Company Wins Grant to Study Peptides (AP)
ORIGINAL: Prediction Unit Helps Forecast Wildfires (AP)
PERTURBED: Prediction Unit Helps Forecast Wildfires (AP)
ORIGINAL: Calif. Aims to Limit Farm-Related Smog (AP)
PERTURBED: . Aims Limit Farm-Related S

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\t440p\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


10{"stdout":"[{\"variableName\": \"ID_TO_MEANING\", \"type\": \"dictionary\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"builtins.dict\"}, {\"variableName\": \"NULL\", \"type\": \"unknown\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"_pydevd_bundle.pydevd_constants.Null\"}]\n","stderr":"","mime":[]}
10{"stdout":"[{\"variableName\": \"ID_TO_MEANING\", \"type\": \"dictionary\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"builtins.dict\"}, {\"variableName\": \"NULL\", \"type\": \"unknown\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"_pydevd_bundle.pydevd_constants.Null\"}]\n","stderr":"","mime":[]}

Perturbation level: 0.6
Titles:
ORIGINAL: Fears for T N pension after talks
PERTURBED: nitrogen Fears for T N pension subsequently after talks
ORIGINAL: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com)
PERTURBED: The Ra

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\t440p\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


10{"stdout":"[{\"variableName\": \"ID_TO_MEANING\", \"type\": \"dictionary\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"builtins.dict\"}, {\"variableName\": \"NULL\", \"type\": \"unknown\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"_pydevd_bundle.pydevd_constants.Null\"}]\n","stderr":"","mime":[]}
10{"stdout":"[{\"variableName\": \"ID_TO_MEANING\", \"type\": \"dictionary\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"builtins.dict\"}, {\"variableName\": \"NULL\", \"type\": \"unknown\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"_pydevd_bundle.pydevd_constants.Null\"}]\n","stderr":"","mime":[]}
10{"stdout":"[{\"variableName\": \"ID_TO_MEANING\", \"type\": \"dictionary\", \"supportedEngines\": [\"pandas\"], \"isLocalVariable\": true, \"rawType\": \"builtins.dict\"}, {\"variableName\": \"NULL\", \"type\": \"unknown\", \"supportedEngines\": [\"pandas\

Unnamed: 0,perturbation level,accuracy
0,0.2,0.8
1,0.4,0.8
2,0.6,0.8
3,0.8,0.6
