In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [28]:
from sklearn.datasets import fetch_openml
import pandas as pd

# Step 1: Load the Ionosphere dataset
def load_dataset():
    # Fetch Ionosphere dataset from OpenML
    ionosphere = fetch_openml(name='ionosphere', version=1, as_frame=True)
    
    # Convert to DataFrame and Series
    X = pd.DataFrame(ionosphere.data, columns=ionosphere.feature_names)
    y = pd.Series(ionosphere.target)
    
    return X, y

In [29]:
# Step 2: Define an evaluation metric (classification accuracy)
def evaluate_subset(X, y, subset):
    X_subset = X.iloc[:, list(subset)]
    X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.3, random_state=42)
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [30]:
# Step 3: Generate an initial random subset of features
def generate_initial_subset(num_features):
    return set(np.random.choice(num_features, size=np.random.randint(1, num_features + 1), replace=False))

In [31]:
# Step 4: Generate neighboring subsets by adding or removing one feature
def generate_neighbors(subset, num_features):
    neighbors = []
    for i in range(num_features):
        if i in subset:
            neighbor = subset - {i}
            neighbors.append(neighbor)
        else:
            neighbor = subset | {i}
            neighbors.append(neighbor)
    return neighbors

In [32]:
# Step 5: Hill-climbing feature selection
def hill_climbing_feature_selection(X, y):
    num_features = X.shape[1]
    current_subset = generate_initial_subset(num_features)
    current_score = evaluate_subset(X, y, current_subset)
    
    improved = True
    while improved:
        improved = False
        neighbors = generate_neighbors(current_subset, num_features)
        for neighbor in neighbors:
            neighbor_score = evaluate_subset(X, y, neighbor)
            if neighbor_score > current_score:
                current_subset = neighbor
                current_score = neighbor_score
                improved = True
                break  # Move to the best neighbor immediately
        
    return current_subset, current_score

In [33]:
# Step 6: Run the feature selection and compare performance
def main():
    X, y = load_dataset()
    optimal_subset, optimal_score = hill_climbing_feature_selection(X, y)
    
    print("Optimal feature subset:", optimal_subset)
    print("Optimal accuracy score:", optimal_score)
    
    # Compare with all features
    all_features_score = evaluate_subset(X, y, set(range(X.shape[1])))
    print("Accuracy using all features:", all_features_score)

# Execute the main function
if __name__ == "__main__":
    main()

Optimal feature subset: {np.int64(18), np.int64(22), 23, np.int64(9), np.int64(26), np.int64(11), np.int64(12), 29}
Optimal accuracy score: 0.9433962264150944
Accuracy using all features: 0.8962264150943396
