In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (path is relative to the notebook's working directory)
df = pd.read_csv("NHANES_age_prediction.csv")

# Clean column names to avoid whitespace issues
df.columns = df.columns.str.strip()

# Encode target. The fitted encoder is kept so that label_encoder.classes_ /
# label_encoder.inverse_transform can map integer predictions back to labels.
label_encoder = LabelEncoder()
df['age_group'] = label_encoder.fit_transform(df['age_group'])

# Columns that must not be used as predictors:
#   SEQN     - identifier column (dropped by the original code as the ID).
#   RIDAGEYR - NOTE(review): presumably the respondent's age in years, from
#              which age_group is derived; keeping it would leak the target
#              into X. Confirm against the dataset's variable table.
# Only columns actually present are dropped, so a file without them still loads.
NON_FEATURE_COLS = ['SEQN', 'RIDAGEYR']
df = df.drop(columns=[col for col in NON_FEATURE_COLS if col in df.columns])

# Features and Target
X = df.drop(columns='age_group')
y = df['age_group']

# Train-test split: 80/20, stratified on y so both parts keep the class ratio;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Define baseline models (default hyperparameters), keyed by display name.
# - The SVM is wrapped in a Pipeline with StandardScaler: the RBF kernel is
#   distance-based, and putting the scaler inside the pipeline means it is
#   re-fit on the training part of every CV fold (no leakage from held-out data).
# - AdaBoost takes its weak learner via `estimator`; the former `base_estimator`
#   keyword is rejected by the installed scikit-learn (TypeError).
# - random_state is fixed on the stochastic ensembles so scores are reproducible.
models = {
    "SVM RBF": Pipeline([("scaler", StandardScaler()), ("svc", SVC(kernel='rbf'))]),
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), random_state=42),
    "Gradient Boost": GradientBoostingClassifier(random_state=42)
}

# Mean 5-fold cross-validation accuracy on the training split for each model.
baseline_scores = {}
for name, model in models.items():
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    baseline_scores[name] = score
    print(f"Baseline {name}: {score:.4f}")


TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [None]:
# Randomized Search
# The RBF kernel depends on distances between samples, so features are
# standardized first. The scaler is a pipeline step so that it is re-fit on the
# training part of each CV fold; hyperparameters of the SVC step are addressed
# with the "svc__" prefix.
svm = Pipeline([("scaler", StandardScaler()), ("svc", SVC(kernel='rbf'))])
svm_random_grid = {'svc__C': np.logspace(-2, 3, 20), 'svc__gamma': np.logspace(-4, 1, 20)}
svm_random = RandomizedSearchCV(svm, svm_random_grid, cv=5, n_iter=20, random_state=42)
svm_random.fit(X_train, y_train)

# Grid Search on narrowed range: 5 evenly spaced values spanning 0.5x-1.5x of
# the best C and gamma found by the randomized search.
best_C = svm_random.best_params_['svc__C']
best_gamma = svm_random.best_params_['svc__gamma']
C_vals = np.linspace(best_C*0.5, best_C*1.5, 5)
gamma_vals = np.linspace(best_gamma*0.5, best_gamma*1.5, 5)
svm_grid = {'svc__C': C_vals, 'svc__gamma': gamma_vals}
svm_grid_search = GridSearchCV(svm, svm_grid, cv=5)
svm_grid_search.fit(X_train, y_train)

# Best mean cross-validation accuracy from the refined grid.
svm_best_score = svm_grid_search.best_score_


NameError: name 'X_train_scaled' is not defined

In [None]:
# Random forest: randomized search over tree count and depth, then a small
# grid around the best point. random_state is fixed so the forest (bootstrap
# samples, feature sub-sampling) is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf_random_grid = {'n_estimators': np.arange(10, 200, 10), 'max_depth': np.arange(2, 20)}
rf_random = RandomizedSearchCV(rf, rf_random_grid, cv=5, n_iter=20, random_state=42)
rf_random.fit(X_train, y_train)

# Grid Search
# Neighbours of the best values. The randomized ranges start at
# n_estimators=10 and max_depth=2, so "best - 10" / "best - 2" can reach 0,
# which is not a valid value for either parameter; such candidates are dropped.
best_n_estimators = int(rf_random.best_params_['n_estimators'])
best_max_depth = int(rf_random.best_params_['max_depth'])
n_est = [n for n in (best_n_estimators - 10, best_n_estimators, best_n_estimators + 10) if n >= 1]
depths = [d for d in (best_max_depth - 2, best_max_depth, best_max_depth + 2) if d >= 1]
rf_grid = {'n_estimators': n_est, 'max_depth': depths}
rf_grid_search = GridSearchCV(rf, rf_grid, cv=5)
rf_grid_search.fit(X_train, y_train)

# Best mean cross-validation accuracy from the refined grid.
rf_best_score = rf_grid_search.best_score_


In [None]:
# AdaBoost over depth-1 trees (decision stumps). The weak learner is passed via
# `estimator`; the former `base_estimator` keyword is rejected by the installed
# scikit-learn (TypeError). random_state is fixed for reproducible scores.
adb = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), random_state=42)
adb_random_grid = {'n_estimators': np.arange(10, 200, 10), 'learning_rate': np.linspace(0.01, 1.0, 20)}
adb_random = RandomizedSearchCV(adb, adb_random_grid, cv=5, n_iter=20, random_state=42)
adb_random.fit(X_train, y_train)

# Refined grid around the randomized-search optimum.
# n_estimators: best +/- 10; the randomized range starts at 10, so "best - 10"
# can be 0, which is invalid and therefore dropped.
# learning_rate: 5 evenly spaced values spanning 0.8x-1.2x of the best rate.
best_n_estimators = int(adb_random.best_params_['n_estimators'])
best_learning_rate = adb_random.best_params_['learning_rate']
n_est = [n for n in (best_n_estimators - 10, best_n_estimators, best_n_estimators + 10) if n >= 1]
lr = np.linspace(best_learning_rate*0.8, best_learning_rate*1.2, 5)
adb_grid = {'n_estimators': n_est, 'learning_rate': lr}
adb_grid_search = GridSearchCV(adb, adb_grid, cv=5)
adb_grid_search.fit(X_train, y_train)

# Best mean cross-validation accuracy from the refined grid.
adb_best_score = adb_grid_search.best_score_


In [None]:
# Gradient boosting: randomized search over tree count and depth, then a small
# grid around the best point. random_state is fixed so repeated runs of this
# cell report the same cross-validation scores.
gb = GradientBoostingClassifier(random_state=42)
gb_random_grid = {'n_estimators': np.arange(50, 200, 10), 'max_depth': np.arange(2, 10)}
gb_random = RandomizedSearchCV(gb, gb_random_grid, cv=5, n_iter=20, random_state=42)
gb_random.fit(X_train, y_train)

# Neighbours of the best values. The randomized ranges start at
# n_estimators=50 and max_depth=2, so the smallest candidates here are 40 and 1,
# both still valid.
best_n_estimators = int(gb_random.best_params_['n_estimators'])
best_max_depth = int(gb_random.best_params_['max_depth'])
n_est = [best_n_estimators - 10, best_n_estimators, best_n_estimators + 10]
depths = [best_max_depth - 1, best_max_depth, best_max_depth + 1]
gb_grid = {'n_estimators': n_est, 'max_depth': depths}
gb_grid_search = GridSearchCV(gb, gb_grid, cv=5)
gb_grid_search.fit(X_train, y_train)

# Best mean cross-validation accuracy from the refined grid.
gb_best_score = gb_grid_search.best_score_


In [None]:
# Tuned cross-validation accuracy per model, keyed like baseline_scores.
tuned_scores = {
    "SVM RBF": svm_best_score,
    "Random Forest": rf_best_score,
    "AdaBoost": adb_best_score,
    "Gradient Boost": gb_best_score
}

# Line both score sets up in the order of the baseline dictionary.
labels = list(baseline_scores.keys())
baseline_vals = [baseline_scores[model_name] for model_name in labels]
tuned_vals = [tuned_scores[model_name] for model_name in labels]

# One slot per model on the x-axis; the two bars of a pair sit half a bar
# width to either side of the slot centre.
x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width/2, baseline_vals, width, label='Baseline')
bars2 = ax.bar(x + width/2, tuned_vals, width, label='Tuned')

ax.set_ylabel('Cross-Validation Accuracy')
ax.set_title('Model Performance (Baseline vs Tuned)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Write each bar's height (two decimals) just above its top edge.
for bar in [*bars1, *bars2]:
    yval = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2
    ax.text(label_x, yval + 0.005, f'{yval:.2f}', ha='center', va='bottom')

fig.tight_layout()
plt.show()
