In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Load the Adult dataset from the UCI repository.
# Column names are supplied explicitly through `names=`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
# skipinitialspace=True drops whitespace that follows each delimiter; '?' entries are read as NaN.
raw_df = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values='?')

# Preprocess the data: drop every row that has a missing value.
# The raw frame is kept untouched and `.copy()` makes `df` an independent object,
# so the column assignments below never write into (or warn about) `raw_df`.
df = raw_df.dropna().copy()

# Binarise the target. Series.map returns NaN for any label missing from the mapping,
# so fail loudly here instead of letting NaNs reach the classifier.
income_codes = df['income'].map({'>50K': 1, '<=50K': 0})
if income_codes.isna().any():
    unexpected_labels = sorted(df.loc[income_codes.isna(), 'income'].unique())
    raise ValueError(f"Unexpected income labels: {unexpected_labels}")
df['income'] = income_codes

# Encode categorical variables as integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so each
# column's code -> category mapping stays available (e.g. for inverse_transform).
# `le` is still bound after the loop (to the last column's encoder), as before.
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split the data: 80% train / 20% test with a fixed seed.
X = df.drop('income', axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [2]:
# Fit the baseline classifier: a 100-tree random forest with a fixed seed.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the held-out split: hard predictions feed accuracy, while the second
# column of predict_proba is used as the score for ROC AUC.
y_pred = model.predict(X_test)
baseline_scores = model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, baseline_scores)

print(f"Baseline Model Accuracy: {accuracy:.4f}")
print(f"Baseline Model ROC AUC: {roc_auc:.4f}")


Baseline Model Accuracy: 0.8531
Baseline Model ROC AUC: 0.9072


In [3]:
# Perturb the test features with zero-mean Gaussian noise (standard deviation 0.1).
# NumPy's global RNG is seeded first so the same noise is drawn on every run.
np.random.seed(42)
test_noise = np.random.normal(loc=0, scale=0.1, size=X_test.shape)
X_test_noisy = X_test + test_noise

# Re-score the already-trained model on the perturbed test features.
y_pred_noisy = model.predict(X_test_noisy)
noisy_scores = model.predict_proba(X_test_noisy)[:, 1]
accuracy_noisy = accuracy_score(y_test, y_pred_noisy)
roc_auc_noisy = roc_auc_score(y_test, noisy_scores)

print(f"Model Accuracy with Noise: {accuracy_noisy:.4f}")
print(f"Model ROC AUC with Noise: {roc_auc_noisy:.4f}")


Model Accuracy with Noise: 0.8457
Model ROC AUC with Noise: 0.9020


In [4]:
# Add noise to the training data (zero-mean Gaussian, standard deviation 0.1).
# NOTE: the draw comes from NumPy's global RNG and continues the stream seeded by
# the earlier np.random.seed(42) call, so the values are only reproducible when
# the notebook is run top to bottom.
X_train_noisy = X_train + np.random.normal(0, 0.1, X_train.shape)

# Retrain the model with noisy data (same hyperparameters and seed as the baseline).
model_noisy = RandomForestClassifier(n_estimators=100, random_state=42)
model_noisy.fit(X_train_noisy, y_train)

# Evaluate the model on the original test data.
# Results use their own names so the earlier y_pred_noisy / accuracy_noisy /
# roc_auc_noisy values (baseline model on noisy test data) are not overwritten.
y_pred_enhanced = model_noisy.predict(X_test)
accuracy_enhanced = accuracy_score(y_test, y_pred_enhanced)
roc_auc_enhanced = roc_auc_score(y_test, model_noisy.predict_proba(X_test)[:, 1])

# Evaluate the model on the noisy test data built in the previous cell.
y_pred_enhanced_noisy = model_noisy.predict(X_test_noisy)
accuracy_enhanced_noisy = accuracy_score(y_test, y_pred_enhanced_noisy)
roc_auc_enhanced_noisy = roc_auc_score(y_test, model_noisy.predict_proba(X_test_noisy)[:, 1])

print(f"Enhanced Model Accuracy: {accuracy_enhanced:.4f}")
print(f"Enhanced Model ROC AUC: {roc_auc_enhanced:.4f}")
print(f"Enhanced Model Accuracy with Noise: {accuracy_enhanced_noisy:.4f}")
print(f"Enhanced Model ROC AUC with Noise: {roc_auc_enhanced_noisy:.4f}")


Enhanced Model Accuracy: 0.8601
Enhanced Model ROC AUC: 0.9105
