# Feature Reduction With ATOM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from atom import ATOMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector

# Select The Output Activity File From PathSingle

In [None]:
# Activity matrix exported by PathSingle; the first CSV column is used as the row index.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
activity_csv = 'c:\\Users\\user\\Documents\\Downloads\\output_activity_st3.csv'
data = pd.read_csv(activity_csv, index_col=0)

In [None]:
# Every column except the last is a feature; the last column is the target.
feature_block = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
x, y = feature_block.values, target_column.values

In [None]:
# Encode the class labels as integers; `ly` is kept so the original label
# names can be recovered later with `ly.inverse_transform`.
ly = LabelEncoder()
y = ly.fit_transform(y)

# Stratified hold-out split used by the scikit-learn experiments below. A fixed
# seed keeps the split, and every score derived from it, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42
)

# ATOM is handed the full arrays (not the split above); seeding it as well
# makes its hyperparameter-tuning runs repeatable.
atom = ATOMClassifier(x, y, verbose=2, random_state=42)

# Select The Classifier

In [None]:
# Tune a random forest inside ATOM: 10 hyperparameter trials, scored with the
# "roc_auc_ovr" metric.
atom.run(
    models=["RF"],
    metric="roc_auc_ovr",
    n_trials=10,
)


Models: RF
Metric: roc_auc_ovr


Running hyperparameter tuning for RandomForest...
| trial | n_estimators | criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |
| ----- | ------------ | --------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |
| 0     |          110 |   entropy |         5 |                20 |               11 |          0.5 |      True |         0.5 |      0.03 |      0.9831 |           0.9831 |    05m:37s | 05m:37s | COMPLETE |
| 1     |           20 |      gini |      None |                14 |               16 |          0.8 |     False |         --- |     0.035 |      0.9274 |           0.9831 |    06m:34s | 12m:11s | COMPLETE |
| 2     |          270 |   entropy |         2 |                 2 |                

In [None]:
# Baseline: an extra-trees ensemble fitted on the training split. The seed makes
# the fitted model (and the feature importances that the selection steps further
# down rely on) repeatable between runs.
clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
clf = clf.fit(x_train, y_train)
# Per-feature importances are exposed as `clf.feature_importances_`.
x.shape

(28697, 581)

In [None]:
# Hard class predictions on the held-out split -> accuracy.
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)

# Class probabilities on the held-out split -> one-vs-rest ROC AUC.
y_proba = clf.predict_proba(x_test)
auc = roc_auc_score(y_test, np.asarray(y_proba), multi_class='ovr')

print("ACC score:", acc, " AUC score:", auc)

ACC score: 0.9810452961672473  AUC score: 0.999380906303266


In [None]:
# Sequential feature selection inside ATOM (solver "rf", 6 features requested),
# followed by another random-forest tuning run, this time scored on accuracy.
# NOTE(review): this reuses the `atom` instance that already trained "RF" with a
# different metric — confirm ATOM accepts that without creating a new branch.
atom.feature_selection(
    strategy="sfs",
    solver="rf",
    n_features=6,
    verbose=2,
)
atom.run(
    models=["RF"],
    metric="accuracy",
    n_trials=10,
)

# Using SKLearn SelectFromModel

In [None]:
# Wrap the already-fitted `clf` (prefit=True) and cap the selection at seven features.
model = SelectFromModel(estimator=clf, prefit=True, max_features=7)
x_new = model.transform(x)
print(x_new.shape)
# Translate the selection back into column names; the last column of `data`
# is excluded because it is the target, not a feature.
feature_columns = data.columns[:-1]
model.get_feature_names_out(feature_columns)[:10]

(28697, 7)


array(['Aurora B signaling(NCI/Nature)', 'Aurora C signaling(NCI/Nature)',
       'ECM-receptor interaction(Kegg)',
       'Pathogenic Escherichia coli infection(Kegg)',
       'Signaling by Aurora kinases(NCI/Nature)',
       'cdc25 and chk1 regulatory pathway in response to dna damage(BioCarta)',
       'sonic hedgehog receptor ptc1 regulates cell cycle(BioCarta)'],
      dtype=object)

# Using SKLearn SequentialFeatureSelector

In [None]:
# Forward sequential selection of five features, wrapping `clf` as the estimator.
sfs = SequentialFeatureSelector(
    estimator=clf,
    n_features_to_select=5,
    direction="forward",
)
sfs.fit(x, y)
# Support mask over the columns of `x`, as returned by the fitted selector.
selected_mask = sfs.get_support()
print(f'Features selected {selected_mask}')

In [None]:
# Retrain on the SelectFromModel subset only.
#
# Everything derived here is bound to *_sel names instead of re-using
# x / y / x_train / clf. The selector `model` was built from the full-width
# `clf`, and the cells further down index columns of the full feature matrix,
# so overwriting those globals made the notebook depend on execution order.
new_columns = model.get_feature_names_out(data.columns[:-1])
new_data = data[new_columns].copy()
new_data['state'] = data['state']

x_sel = new_data.iloc[:, :-1].values
y_sel = ly.fit_transform(new_data.iloc[:, -1].values)

# Stratify like the first split so every class is represented on both sides,
# and seed the split and the ensemble so the scores are reproducible.
x_train_sel, x_test_sel, y_train_sel, y_test_sel = train_test_split(
    x_sel, y_sel, stratify=y_sel, random_state=42
)
clf_sel = ExtraTreesClassifier(n_estimators=100, random_state=42)
clf_sel = clf_sel.fit(x_train_sel, y_train_sel)

# Class probabilities feed the one-vs-rest AUC. Accuracy uses predict(), which
# returns class labels directly instead of assuming that the argmax column
# index of the probability matrix equals the encoded label.
y_proba_sel = clf_sel.predict_proba(x_test_sel)
y_pred_sel = clf_sel.predict(x_test_sel)
auc_sel = roc_auc_score(y_test_sel, y_proba_sel, multi_class='ovr')
acc_sel = accuracy_score(y_test_sel, y_pred_sel)

print("ACC score:", acc_sel, " AUC score:", auc_sel)

ACC score: 0.9225087108013937  AUC score: 0.9820332647157359


In [None]:
import joblib
import sys
# NOTE(review): presumably a compatibility shim for mlxtend versions that still
# import joblib via `sklearn.externals` — it must run before the mlxtend import.
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


# Fixed: the class is KNeighborsClassifier (as imported at the top of the
# notebook); the former spelling `KNeighborClassifier` raised NameError before
# any selection could run.
knn = KNeighborsClassifier(n_neighbors=4)
# Backward (forward=False) floating selection with a target of 10 features.
# cv=0 turns cross-validation off, so 'accuracy' is not a held-out estimate.
sfs1 = SFS(knn,
           k_features=10,
           forward=False,
           floating=True,
           verbose=2,
           scoring='accuracy',
           cv=0,
           n_jobs=-1)

sfs1 = sfs1.fit(x_train, y_train)
sfs1.k_score_

In [None]:
# Convert the feature names reported by the fitted selector into integer
# column positions.
best_features = list(map(int, sfs1.k_feature_names_))
best_features

[0, 1, 2, 3, 4, 5, 65]

In [None]:
# Look up the column labels of `data` at the selected positions.
column_labels = data.columns
best_features_names = column_labels[best_features]
best_features_names

Index(['1 4-Dichlorobenzene degradation(Kegg)',
       '1- and 2-Methylnaphthalene degradation(Kegg)',
       '3-Chloroacrylic acid degradation(Kegg)',
       'Acute myeloid leukemia(Kegg)',
       'Cellular roles of Anthrax toxin(NCI/Nature)'],
      dtype='object')

In [None]:
# Same lookup as the previous cell: column labels of `data` at the selected positions.
# Recorded scores (NOTE(review): presumably from retraining on this subset — confirm):
# ACC score: 0.899233449477352  AUC score: 0.9786395870235323
column_labels = data.columns
best_features_names = column_labels[best_features]
best_features_names

Index(['1 4-Dichlorobenzene degradation(Kegg)',
       '1- and 2-Methylnaphthalene degradation(Kegg)',
       '3-Chloroacrylic acid degradation(Kegg)',
       'Acute myeloid leukemia(Kegg)', 'Adherens junction(Kegg)',
       'Adipocytokine signaling pathway(Kegg)',
       'Cellular roles of Anthrax toxin(NCI/Nature)'],
      dtype='object')