Shape of X_train_preprocessed: (26048, 108)
Shape of X_test_preprocessed: (6513, 108)


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the Adult dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
df = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values=['?', 'nan', 'NaN'])

# Clean the data
# For categorical columns with missing values, we'll add a new category 'Unknown'
categorical_columns = ['workclass', 'occupation', 'native-country']
for col in categorical_columns:
    df[col] = df[col].fillna('Unknown')

# Encode the target variable
df['income'] = df['income'].map({' <=50K': 0, ' >50K': 1})

print("Unique values in 'income' column after encoding:")
print(df['income'].unique())
print("Value counts in 'income' column:")
print(df['income'].value_counts())

# Check for any remaining NaN values
print("\nRemaining NaN values in the dataset:")
print(df.isna().sum())

# Split features and target
X = df.drop('income', axis=1)
y = df['income']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print unique values and check for NaNs in y_train
print("\nUnique values in y_train:")
print(np.unique(y_train))
print("NaNs in y_train:", pd.isna(y_train).any())

# Define numeric and categorical columns
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor and transform the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert to dense if sparse
X_train_preprocessed = X_train_preprocessed.toarray() if hasattr(X_train_preprocessed, "toarray") else X_train_preprocessed
X_test_preprocessed = X_test_preprocessed.toarray() if hasattr(X_test_preprocessed, "toarray") else X_test_preprocessed

# Print shape of the preprocessed datasets
print("\nShape of X_train_preprocessed:", X_train_preprocessed.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test_preprocessed:", X_test_preprocessed.shape)
print("Shape of y_test:", y_test.shape)

# Check for NaNs or infinity in preprocessed data
print("\nNaNs in X_train_preprocessed:", np.isnan(X_train_preprocessed).any())
print("Infinite values in X_train_preprocessed:", np.isinf(X_train_preprocessed).any())

print("\nDataset is now ready for model training.")

Unique values in 'income' column after encoding:
[nan]
Value counts in 'income' column:
Series([], Name: count, dtype: int64)

Remaining NaN values in the dataset:
age                   0
workclass             0
fnlwgt                0
education             0
education-num         0
marital-status        0
occupation            0
relationship          0
race                  0
sex                   0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country        0
income            32561
dtype: int64

Unique values in y_train:
[nan]
NaNs in y_train: True

Shape of X_train_preprocessed: (26048, 108)
Shape of y_train: (26048,)
Shape of X_test_preprocessed: (6513, 108)
Shape of y_test: (6513,)

NaNs in X_train_preprocessed: False
Infinite values in X_train_preprocessed: False

Dataset is now ready for model training.


In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf

# Print data types of all columns
print("Data types in X_train:")
print(X_train.dtypes)

# Check for NaNs in numeric columns
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
for col in numeric_features:
    print(f"NaNs in {col}:", X_train[col].isna().any())

# Check for NaNs in categorical columns
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for col in categorical_features:
    print(f"NaNs in {col}:", X_train[col].isna().any())

# Check target variable
print("Unique values in y_train:", np.unique(y_train))
print("NaNs in y_train:", pd.isna(y_train).any())

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor and transform the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert to dense if sparse
X_train_preprocessed = X_train_preprocessed.toarray() if hasattr(X_train_preprocessed, "toarray") else X_train_preprocessed
X_test_preprocessed = X_test_preprocessed.toarray() if hasattr(X_test_preprocessed, "toarray") else X_test_preprocessed

# Check preprocessed data
print("NaNs in X_train_preprocessed:", np.isnan(X_train_preprocessed).any())
print("Infinite values in X_train_preprocessed:", np.isinf(X_train_preprocessed).any())

# Get the number of features after preprocessing
num_features = X_train_preprocessed.shape[1]
print("Number of features after preprocessing:", num_features)

# Define the model using Functional API
def create_model(num_features):
    inputs = tf.keras.Input(shape=(num_features,))
    x = tf.keras.layers.BatchNormalization()(inputs)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

# Create and compile the model
model = create_model(num_features)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Train the model with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train_preprocessed, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_preprocessed, y_test, verbose=0)
print(f"Test accuracy: {test_accuracy:.4f}")

# Create a robust model (for TRADES)
robust_model = create_model(num_features)
robust_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
                     loss='binary_crossentropy', 
                     metrics=['accuracy'])

# Note: You would need to implement TRADES training for the robust_model

Data types in X_train:
age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object
NaNs in age: False
NaNs in fnlwgt: False
NaNs in education-num: False
NaNs in capital-gain: False
NaNs in capital-loss: False
NaNs in hours-per-week: False
NaNs in workclass: False
NaNs in education: False
NaNs in marital-status: False
NaNs in occupation: False
NaNs in relationship: False
NaNs in race: False
NaNs in sex: False
NaNs in native-country: False
Unique values in y_train: [nan]
NaNs in y_train: True
NaNs in X_train_preprocessed: False
Infinite values in X_train_preprocessed: False
Number of features after preprocessing: 104
Epoch 1/100
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15

In [22]:
# Generate adversarial test set using PGD
X_test_adv = np.array([pgd_attack(model, x.reshape(1, -1), y) for x, y in zip(X_test_preprocessed, y_test)])

# Evaluate original model
print("Original model accuracy on PGD adversarial examples:",
      accuracy_score(y_test, (model.predict(X_test_adv) > 0.5).astype(int).flatten()))

# Evaluate robust model (assuming you've trained it with TRADES)
print("Robust model accuracy on PGD adversarial examples:",
      accuracy_score(y_test, (robust_model.predict(X_test_adv) > 0.5).astype(int).flatten()))

# Evaluate with randomized smoothing
smoothed_predictions = np.array([randomized_smoothing(robust_model, x) for x in X_test_adv])
print("Robust model accuracy with randomized smoothing on PGD adversarial examples:",
      accuracy_score(y_test, (smoothed_predictions > 0.5).astype(int).flatten()))

KeyboardInterrupt: 

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the Adult dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
df = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values=['?', 'nan', 'NaN'])

# Clean the data
# For categorical columns with missing values, we'll add a new category 'Unknown'
categorical_columns = ['workclass', 'occupation', 'native-country']
for col in categorical_columns:
    df[col] = df[col].fillna('Unknown')

# Encode the target variable
df['income'] = df['income'].map({' <=50K': 0, ' >50K': 1})

print("Unique values in 'income' column after encoding:")
print(df['income'].unique())
print("Value counts in 'income' column:")
print(df['income'].value_counts())

# Check for any remaining NaN values
print("\nRemaining NaN values in the dataset:")
print(df.isna().sum())

# Split features and target
X = df.drop('income', axis=1)
y = df['income']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print unique values and check for NaNs in y_train
print("\nUnique values in y_train:")
print(np.unique(y_train))
print("NaNs in y_train:", pd.isna(y_train).any())

# Define numeric and categorical columns
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor and transform the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert to dense if sparse
X_train_preprocessed = X_train_preprocessed.toarray() if hasattr(X_train_preprocessed, "toarray") else X_train_preprocessed
X_test_preprocessed = X_test_preprocessed.toarray() if hasattr(X_test_preprocessed, "toarray") else X_test_preprocessed

# Print shape of the preprocessed datasets
print("\nShape of X_train_preprocessed:", X_train_preprocessed.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test_preprocessed:", X_test_preprocessed.shape)
print("Shape of y_test:", y_test.shape)

# Check for NaNs or infinity in preprocessed data
print("\nNaNs in X_train_preprocessed:", np.isnan(X_train_preprocessed).any())
print("Infinite values in X_train_preprocessed:", np.isinf(X_train_preprocessed).any())

print("\nDataset is now ready for model training.")

Unique values in 'income' column after encoding:
[nan]
Value counts in 'income' column:
Series([], Name: count, dtype: int64)

Remaining NaN values in the dataset:
age                   0
workclass             0
fnlwgt                0
education             0
education-num         0
marital-status        0
occupation            0
relationship          0
race                  0
sex                   0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country        0
income            32561
dtype: int64

Unique values in y_train:
[nan]
NaNs in y_train: True

Shape of X_train_preprocessed: (26048, 108)
Shape of y_train: (26048,)
Shape of X_test_preprocessed: (6513, 108)
Shape of y_test: (6513,)

NaNs in X_train_preprocessed: False
Infinite values in X_train_preprocessed: False

Dataset is now ready for model training.
