In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Synthetic classification problem. The ground truth used below treats
# columns 0-4 (the n_informative=5 block, kept first by shuffle=False) as the
# truly important features.
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=5, n_redundant=15,
    shuffle=False, random_state=0,
)

# Standardization: fit the scaler on X and transform it in one step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Permute the column order with a seeded generator so the informative block
# is no longer at the front of the matrix handed to the model.
rng = np.random.default_rng(seed=0)
shuffled_indices = rng.permutation(X.shape[1])
X_shuffled = X_scaled[:, shuffled_indices]

clf = DecisionTreeClassifier(criterion='entropy', random_state=4)
clf.fit(X_shuffled, y)

# Rank the (shuffled) columns from most to least important.
importance = clf.feature_importances_
sorted_idx = np.flip(np.argsort(importance))

true_important = np.arange(5)

# Column j of X_shuffled is original column shuffled_indices[j], so indexing
# shuffled_indices with the top-5 positions recovers the original indices.
shuffled_to_original = shuffled_indices[sorted_idx[:5]]
num_correct = np.intersect1d(shuffled_to_original, true_important).size

print(f"Number of correctly identified features: {num_correct}")

# Bar chart of the importances in descending order.
ax = plt.gca()
ax.bar(range(20), importance[sorted_idx])
ax.set_xlabel("Feature Rank (Descending)")
ax.set_ylabel("Feature Importance Score")
ax.set_title("Decision Tree Feature Importance")
plt.show()

In [8]:
# Repeat the decision-tree experiment on many independently generated datasets
# and record, per trial, how many of the 5 informative columns (original
# indices 0-4) appear among the tree's 5 most important features.
# `scaler` and `rng` are not created here; they must already exist in the kernel.
num_trials = 1000
results = []

for i in range(num_trials):

    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=5,
        n_redundant=15,
        shuffle=False,
        random_state=i
    )
    X_scaled = scaler.fit_transform(X)
    # Hide the informative block by permuting the column order.
    shuffled_indices = rng.permutation(X.shape[1])
    X_shuffled = X_scaled[:, shuffled_indices]

    clf = DecisionTreeClassifier(criterion='entropy', random_state=4)
    clf.fit(X_shuffled, y)

    importance = clf.feature_importances_
    sorted_idx = np.argsort(importance)[::-1]
    # Translate the top-5 shuffled positions back to original column indices.
    shuffled_to_original = shuffled_indices[sorted_idx[:5]]
    num_correct = len(np.intersect1d(shuffled_to_original, np.arange(5)))
    results.append(num_correct)

# Scores are integers 0..5, so use six unit-wide bins centred on the integers.
# Edges of np.arange(0, 6) would give only five bins, and because the last
# histogram bin is closed on both ends ([4, 5]) the counts for 4 and 5 would
# be merged into a single bar.
plt.hist(results, bins=np.arange(7) - 0.5)
plt.xticks(range(6))
plt.xlabel("Number of Correctly Identified Features")
plt.ylabel("Frequency")
plt.title("Decision Tree Stability Analysis")
plt.show()

print(f"Average number of correctly identified features: {np.mean(results):.2f}")

In [9]:
from sklearn.linear_model import LogisticRegression

def _lr_top5_hits(num_trials, scale):
    """Run the logistic-regression ranking experiment `num_trials` times.

    For trial ``i`` a dataset is generated with ``random_state=i``, its columns
    are permuted with the shared ``rng``, an unpenalized logistic regression is
    fitted, and the 5 columns with the largest absolute coefficients are mapped
    back to their original indices.

    Parameters
    ----------
    num_trials : int
        Number of datasets to generate and fit.
    scale : bool
        If True, features are standardized with the shared ``scaler`` before
        fitting; if False, the raw features are used.

    Returns
    -------
    list of int
        Per trial, how many of original columns 0-4 are in the top 5.
    """
    hits = []
    for i in range(num_trials):
        # NOTE(review): only n_informative is set here, so the remaining
        # make_classification settings (sample count, number of redundant
        # features) are library defaults and differ from the cells that pass
        # n_samples=1000 / n_redundant=15 explicitly - confirm this is intended.
        X, y = make_classification(random_state=i, n_informative=5, shuffle=False)
        features = scaler.fit_transform(X) if scale else X
        shuffled_indices = rng.permutation(20)
        X_shuffled = features[:, shuffled_indices]

        lr = LogisticRegression(penalty=None, max_iter=1000)
        lr.fit(X_shuffled, y)

        # Rank columns by coefficient magnitude, largest first.
        coef = np.abs(lr.coef_[0])
        sorted_idx = np.argsort(coef)[::-1]
        selected = shuffled_indices[sorted_idx[:5]]
        hits.append(len(np.intersect1d(selected, np.arange(5))))
    return hits


# Scaled run first, then unscaled, so the shared `rng` is consumed in the same
# order as when the two experiments were written as separate loops.
results_scaled = _lr_top5_hits(1000, scale=True)
results_unscaled = _lr_top5_hits(1000, scale=False)

# Scores are integers 0..5: use six unit-wide bins centred on the integers.
# Edges of np.arange(0, 6) would merge the counts for 4 and 5 into the final
# (closed) bin.
score_bins = np.arange(7) - 0.5

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.hist(results_scaled, bins=score_bins)
plt.xticks(range(6))
plt.title("Logistic Regression (Scaled)")
plt.subplot(1, 2, 2)
plt.hist(results_unscaled, bins=score_bins)
plt.xticks(range(6))
plt.title("Logistic Regression (Unscaled)")
plt.show()

print(f"Scaled average: {np.mean(results_scaled):.2f}")
print(f"Unscaled average: {np.mean(results_unscaled):.2f}")

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# For each trial, fit a decision tree and a logistic regression on the same
# shuffled, standardized data and count how many truly informative columns
# (original indices 0-4) appear in BOTH models' top-5 lists.
num_trials = 1000
overlaps = []
rng = np.random.default_rng(seed=0)

for i in range(num_trials):

    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=5,
        n_redundant=15,
        shuffle=False,
        random_state=i
    )

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    shuffled_indices = rng.permutation(X.shape[1])
    X_shuffled = X_scaled[:, shuffled_indices]

    # Decision tree: top 5 by impurity-based importance, mapped back to
    # original column indices.
    dt = DecisionTreeClassifier(criterion='entropy', random_state=4)
    dt.fit(X_shuffled, y)
    dt_importance = dt.feature_importances_
    dt_top5 = np.argsort(dt_importance)[::-1][:5]
    dt_features = shuffled_indices[dt_top5]

    # Logistic regression: top 5 by absolute coefficient, mapped back likewise.
    lr = LogisticRegression(penalty=None, max_iter=1000)
    lr.fit(X_shuffled, y)
    lr_coef = np.abs(lr.coef_[0])
    lr_top5 = np.argsort(lr_coef)[::-1][:5]
    lr_features = shuffled_indices[lr_top5]

    # Keep only the correctly identified features of each model, then intersect.
    dt_true = np.intersect1d(dt_features, np.arange(5))
    lr_true = np.intersect1d(lr_features, np.arange(5))
    overlap = len(np.intersect1d(dt_true, lr_true))
    overlaps.append(overlap)

# Overlap is an integer 0..5: use six unit-wide bins centred on the integers.
# Edges of np.arange(0, 6) would give five bins and merge the counts for 4 and
# 5, because the last histogram bin is closed on both ends.
plt.hist(overlaps, bins=np.arange(7) - 0.5)
plt.xticks(range(6))
plt.xlabel("Number of Overlapping Features")
plt.ylabel("Frequency")
plt.title("Consistency between DT and LR (Scaled)")
plt.show()

print(f"Average overlap: {np.mean(overlaps):.2f}")

In [10]:
from sklearn.metrics import accuracy_score

def backward_selection(X, y, n_features_to_keep=5):
    """Greedy backward elimination driven by training accuracy.

    Starting from all columns of ``X``, each round refits an unpenalized
    logistic regression once per candidate column with that column left out,
    and permanently drops the column whose removal gives the highest accuracy
    on the same data used for fitting. Ties go to the earliest candidate.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, columns]``.
    y : target labels passed to ``fit`` and ``accuracy_score``.
    n_features_to_keep : int, number of columns left when elimination stops.

    Returns
    -------
    list of int
        Surviving column indices of ``X``, in ascending order.
    """
    kept = list(range(X.shape[1]))
    n_rounds = X.shape[1] - n_features_to_keep
    for _ in range(n_rounds):
        drop_candidate, top_accuracy = -1, -np.inf
        for candidate in kept:
            # Evaluate the model on every kept column except this candidate.
            trial_columns = [col for col in kept if col != candidate]
            X_trial = X[:, trial_columns]
            model = LogisticRegression(penalty=None, max_iter=1000)
            model.fit(X_trial, y)
            accuracy = accuracy_score(y, model.predict(X_trial))
            # Strict comparison: the first candidate reaching the maximum wins.
            if accuracy > top_accuracy:
                drop_candidate, top_accuracy = candidate, accuracy
        kept.remove(drop_candidate)
    return kept

# Generate one dataset with shuffle=False, then standardize it and permute its
# columns explicitly. `scaler` and `rng` are not created here; they must
# already exist in the kernel.
X, y = make_classification(
    n_informative=5,
    shuffle=False,
    random_state=0,
)
X_scaled = scaler.fit_transform(X)
shuffled_indices = rng.permutation(X.shape[1])
X_shuffled = X_scaled[:, shuffled_indices]

# Eliminate features until five remain, then translate the surviving column
# positions of X_shuffled back to their pre-permutation indices.
selected = backward_selection(X_shuffled, y, n_features_to_keep=5)
original_selected = [shuffled_indices[position] for position in selected]
num_correct = np.intersect1d(original_selected, np.arange(5)).size

print(f"Number of truly important features identified: {num_correct}")

In [13]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def backward_selection(X, y, n_features_to_keep=5):
    """Select columns of ``X`` by greedy backward elimination.

    Each round tries leaving out every remaining column in turn, fits an
    unpenalized logistic regression on the rest, scores it by accuracy on the
    fitting data, and discards the column whose omission scored highest (the
    earliest such column on ties). Rounds repeat until ``n_features_to_keep``
    columns remain.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, columns]``.
    y : target labels passed to ``fit`` and ``accuracy_score``.
    n_features_to_keep : int, number of columns to retain.

    Returns
    -------
    list of int
        Indices of the retained columns, in ascending order.
    """
    survivors = list(range(X.shape[1]))
    for _ in range(X.shape[1] - n_features_to_keep):
        to_drop = -1
        highest = -np.inf
        for left_out in survivors:
            # Columns used for this trial: all survivors except `left_out`.
            columns = [c for c in survivors if c != left_out]
            X_reduced = X[:, columns]
            classifier = LogisticRegression(penalty=None, max_iter=1000)
            classifier.fit(X_reduced, y)
            trial_score = accuracy_score(y, classifier.predict(X_reduced))
            # Strict ">" keeps the first column that attains the best score.
            if trial_score > highest:
                highest = trial_score
                to_drop = left_out
        survivors.remove(to_drop)
    return survivors

# Experiment settings: 20 features of which the first 5 (shuffle=False) are
# informative and the other 15 redundant.
n_features = 20
n_informative = 5
n_redundant = 15
num_trials = 1000
results = []

for i in range(num_trials):
    X, y = make_classification(
        n_samples=1000,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        shuffle=False,
        random_state=i
    )

    # Per-trial seeded generator so each trial's column permutation is
    # reproducible on its own.
    rng = np.random.default_rng(seed=i)
    shuffled_indices = rng.permutation(n_features)
    X_shuffled = X[:, shuffled_indices]

    selected = backward_selection(X_shuffled, y, n_features_to_keep=n_informative)

    # Map surviving column positions back to original column indices.
    original_selected = [shuffled_indices[idx] for idx in selected]

    num_correct = len(np.intersect1d(original_selected, np.arange(n_informative)))
    results.append(num_correct)

# Scores are integers 0..n_informative: use one unit-wide bin centred on each
# integer. Edges of np.arange(0, 6) would give five bins for six possible
# values and merge the counts for 4 and 5 in the final (closed) bin.
plt.hist(results, bins=np.arange(n_informative + 2) - 0.5)
plt.xticks(range(n_informative + 1))
plt.xlabel("Number of Correctly Identified Features")
plt.ylabel("Frequency")
plt.title("Backward Selection Performance (1000 Trials)")
plt.show()

print(f"Average number of correctly identified features: {np.mean(results):.2f}")

In [15]:
from itertools import combinations

def best_subset_selection(X, y, n_features_to_keep=3):
    """Exhaustively search for the best-scoring column subset of a fixed size.

    Every combination of ``n_features_to_keep`` columns of ``X`` is tried: an
    unpenalized logistic regression is fitted on those columns and scored by
    accuracy on the same data. The first combination (in ``combinations``
    order) that attains the highest accuracy is returned.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, columns]``.
    y : target labels passed to ``fit`` and ``accuracy_score``.
    n_features_to_keep : int, size of the subsets examined.

    Returns
    -------
    tuple of int or None
        Column indices of the winning subset; None if no subset was evaluated.
    """
    winner = None
    winner_accuracy = -np.inf
    all_columns = range(X.shape[1])
    for candidate in combinations(all_columns, n_features_to_keep):
        X_candidate = X[:, candidate]
        model = LogisticRegression(penalty=None, max_iter=1000)
        model.fit(X_candidate, y)
        accuracy = accuracy_score(y, model.predict(X_candidate))
        # Strict ">" means an equally good later subset never replaces the winner.
        if accuracy > winner_accuracy:
            winner, winner_accuracy = candidate, accuracy
    return winner

# Experiment settings: 7 features of which the first 3 (shuffle=False) are
# informative and the other 4 redundant. The problem is kept small because
# best-subset search fits one model per column combination.
n_features = 7
n_informative = 3
n_redundant = 4
num_trials = 1000
results_best = []

for i in range(num_trials):
    X, y = make_classification(
        n_samples=1000,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        shuffle=False,
        random_state=i
    )
    # Seed the permutation per trial. The legacy np.random.permutation draws
    # from the unseeded global RNG, which made this cell's results change on
    # every run.
    trial_rng = np.random.default_rng(seed=i)
    shuffled_indices = trial_rng.permutation(n_features)
    X_shuffled = X[:, shuffled_indices]

    best_subset = best_subset_selection(X_shuffled, y, n_features_to_keep=n_informative)
    # Map the chosen column positions back to original column indices.
    original_selected = [shuffled_indices[idx] for idx in best_subset]
    num_correct = len(np.intersect1d(original_selected, np.arange(n_informative)))
    results_best.append(num_correct)

# Scores are integers 0..n_informative: use one unit-wide bin centred on each
# integer. Edges of np.arange(0, 4) would give three bins for four possible
# values and merge the counts for 2 and 3 in the final (closed) bin.
plt.hist(results_best, bins=np.arange(n_informative + 2) - 0.5)
plt.xticks(range(n_informative + 1))
plt.xlabel("Number of Correctly Identified Features")
plt.ylabel("Frequency")
plt.title("Best Subset Selection Performance (1000 Trials)")
plt.show()

print(f"Average number of correctly identified features: {np.mean(results_best):.2f}")

In [11]:
from sklearn.inspection import permutation_importance

# Build this cell's own dataset rather than reusing whatever X_shuffled, y and
# shuffled_indices the previously executed cell left in the kernel. Those
# leftovers depend on execution order and may not even have 5 informative
# columns, which would invalidate the np.arange(5) ground truth used below.
X, y = make_classification(n_informative=5, shuffle=False, random_state=0)
X_scaled = StandardScaler().fit_transform(X)
shuffled_indices = np.random.default_rng(seed=0).permutation(X.shape[1])
X_shuffled = X_scaled[:, shuffled_indices]

# Use decision tree model
clf = DecisionTreeClassifier(random_state=4)
clf.fit(X_shuffled, y)

# Permutation importance is evaluated on the same data the tree was fitted on;
# columns are ranked by mean importance over the repeats, largest first.
result = permutation_importance(clf, X_shuffled, y, n_repeats=10, random_state=0)
sorted_idx = result.importances_mean.argsort()[::-1]

# Get original indices of top-5 features
top5_perm = shuffled_indices[sorted_idx[:5]]
num_correct = len(np.intersect1d(top5_perm, np.arange(5)))

print(f"Number of correctly identified features (Permutation): {num_correct}")