# Feature Reduction With ATOM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from atom import ATOMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector

### Select The Output Activity File From PathSingle

In [None]:
# Load the PathSingle activity matrix. The path is relative to the notebook's
# working directory instead of a user-specific absolute Windows path, so the
# notebook can run on any machine that has the file under ./data.
# The first CSV column is used as the row index.
data = pd.read_csv('./data/output_activity_st3.csv', index_col=0)

In [None]:
# Feature matrix = every column except the last; target vector = the last column.
# Both are extracted as NumPy arrays.
x, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [None]:
# Replace the class labels in y with integer codes.
ly = LabelEncoder()
y = ly.fit_transform(y)
# Stratified hold-out split. The fixed seed makes the split (and every score
# computed from it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=1)
# ATOM is given the full x/y (not the manual split above); it is seeded as well
# so that its tuning and training results are repeatable.
atom = ATOMClassifier(x, y, verbose=2, random_state=1)

### Select A Classifier

In [None]:
# Tune and fit a Random Forest inside ATOM: 10 hyperparameter-tuning trials,
# scored with the "roc_auc_ovr" metric.
atom.run(
    models=["RF"],
    metric="roc_auc_ovr",
    n_trials=10,
)


Models: RF
Metric: roc_auc_ovr


Running hyperparameter tuning for RandomForest...
| trial | n_estimators | criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |
| ----- | ------------ | --------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |
| 0     |          110 |   entropy |         5 |                20 |               11 |          0.5 |      True |         0.5 |      0.03 |      0.9831 |           0.9831 |    05m:37s | 05m:37s | COMPLETE |
| 1     |           20 |      gini |      None |                14 |               16 |          0.8 |     False |         --- |     0.035 |      0.9274 |           0.9831 |    06m:34s | 12m:11s | COMPLETE |
| 2     |          270 |   entropy |         2 |                 2 |                

In [None]:
# Fit an Extra-Trees ensemble of 100 trees on the training split.
clf = ExtraTreesClassifier(n_estimators=100).fit(x_train, y_train)
# Display the dimensions of the full feature matrix.
x.shape

(28697, 581)

In [None]:
# Hard label predictions and per-class probabilities for the held-out split.
y_pred = clf.predict(x_test)
y_proba = clf.predict_proba(x_test)

# Accuracy comes from the predicted labels; the one-vs-rest ROC AUC comes from
# the class probabilities.
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, np.array(y_proba), multi_class='ovr')

print("ACC score:", acc, " AUC score:", auc)

ACC score: 0.9810452961672473  AUC score: 0.999380906303266


In [None]:
# Run ATOM's feature selection with the "sfs" strategy and the "rf" solver,
# asking for 6 features, then train a Random Forest again, this time scored
# on accuracy.
atom.feature_selection(
    strategy="sfs",
    solver="rf",
    n_features=6,
    verbose=2,
)
atom.run(
    models=["RF"],
    metric="accuracy",
    n_trials=10,
)

### Using SKLearn SelectFromModel

In [None]:
# Wrap the already-fitted classifier (prefit=True) in a selector that keeps
# at most 7 features.
model = SelectFromModel(clf, max_features=7, prefit=True)
x_new = model.transform(x)
print(x_new.shape)
# Names of the retained columns (up to the first 10). The last column of
# `data` is left out of the candidate names.
model.get_feature_names_out(data.columns[:-1])[:10]

(28697, 7)


array(['Aurora B signaling(NCI/Nature)', 'Aurora C signaling(NCI/Nature)',
       'ECM-receptor interaction(Kegg)',
       'Pathogenic Escherichia coli infection(Kegg)',
       'Signaling by Aurora kinases(NCI/Nature)',
       'cdc25 and chk1 regulatory pathway in response to dna damage(BioCarta)',
       'sonic hedgehog receptor ptc1 regulates cell cycle(BioCarta)'],
      dtype=object)

### Using SKLearn SequentialFeatureSelector

In [None]:
# scikit-learn sequential selection in the forward direction, stopping at
# 5 features and using `clf` as the estimator.
sfs = SequentialFeatureSelector(
    clf,
    n_features_to_select=5,
    direction="forward",
)
sfs.fit(x, y)
print(f'Features selected {sfs.get_support()}')

In [None]:
# Retrain on the columns kept by the SelectFromModel selector (`model`).
new_columns = model.get_feature_names_out(data.columns[:-1])
new_data = data[new_columns].copy()
new_data['state'] = data['state']
# NOTE(review): x, y and the train/test split are rebound to the reduced
# feature set here, so every later cell that reads them sees only the
# selected columns.
x = new_data.iloc[:,:-1].values
y = new_data.iloc[:,-1].values
y = ly.fit_transform(y)
# Stratify (as in the first split of this notebook) so that every class is
# present in both splits, and fix the seed so the scores are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=1)
clf = ExtraTreesClassifier(n_estimators=100)
clf = clf.fit(x_train, y_train)
# Class probabilities feed the AUC; hard labels feed the accuracy.
y_proba = clf.predict_proba(x_test)
# Use the classifier's own predictions rather than argmax over the probability
# columns: argmax yields a column position, which equals the label only when
# the fitted classes happen to be 0..k-1 in order.
y_pred = clf.predict(x_test)
# Calculate the AUC and accuracy scores.
auc = roc_auc_score(y_test, np.array(y_proba), multi_class='ovr')
acc = accuracy_score(y_test, y_pred)

print("ACC score:", acc, " AUC score:", auc)

ACC score: 0.9225087108013937  AUC score: 0.9820332647157359


In [None]:
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


# k-nearest-neighbours estimator used to score candidate feature subsets.
# (The class is KNeighborsClassifier; the previous spelling raised NameError.)
knn = KNeighborsClassifier(n_neighbors=4)
# Floating backward selection down to 10 features, scored on accuracy.
# cv=0 turns cross-validation off in mlxtend; n_jobs=-1 uses all CPU cores.
# NOTE(review): k_features=10 needs x_train to have at least 10 columns;
# confirm which x_train is in scope when this cell runs.
sfs1 = SFS(knn,
           k_features=10,
           forward=False,
           floating=True,
           verbose=2,
           scoring='accuracy',
           cv=0,
           n_jobs=-1)

sfs1 = sfs1.fit(x_train, y_train)
# Score of the selected feature subset.
sfs1.k_score_

In [None]:
# Convert the selector's feature names to integer column positions.
best_features = list(map(int, sfs1.k_feature_names_))
best_features

[0, 1, 2, 3, 4, 5, 65]

In [None]:
# Look up the column labels of `data` at the selected positions.
best_features_names = data.columns.take(best_features)
best_features_names

Index(['1 4-Dichlorobenzene degradation(Kegg)',
       '1- and 2-Methylnaphthalene degradation(Kegg)',
       '3-Chloroacrylic acid degradation(Kegg)',
       'Acute myeloid leukemia(Kegg)',
       'Cellular roles of Anthrax toxin(NCI/Nature)'],
      dtype='object')

In [None]:
# Look up the column labels of `data` at the selected positions.
best_features_names = data.columns.take(best_features)
best_features_names
# Recorded result: ACC score: 0.899233449477352  AUC score: 0.9786395870235323

Index(['1 4-Dichlorobenzene degradation(Kegg)',
       '1- and 2-Methylnaphthalene degradation(Kegg)',
       '3-Chloroacrylic acid degradation(Kegg)',
       'Acute myeloid leukemia(Kegg)', 'Adherens junction(Kegg)',
       'Adipocytokine signaling pathway(Kegg)',
       'Cellular roles of Anthrax toxin(NCI/Nature)'],
      dtype='object')

# Feature Reduction Using MLXtend

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

### Load Output Activity File From PathSingle

In [8]:
# The activity file holds the activity level for each barcode and pathway.
# The first CSV column becomes the row index.
data = pd.read_csv('./data/output_activity_st3.csv', index_col=0)
# Features: every column except the last. Target: the last column.
x, y = data.iloc[:, :-1].values, data.iloc[:, -1].values
# Replace the class labels with integer codes.
ly = LabelEncoder()
y = ly.fit_transform(y)

In [10]:
# Preview the first two rows of the activity table.
data.head(n=2)

Unnamed: 0,1 4-Dichlorobenzene degradation(Kegg),1- and 2-Methylnaphthalene degradation(Kegg),3-Chloroacrylic acid degradation(Kegg),Acute myeloid leukemia(Kegg),Adherens junction(Kegg),Adipocytokine signaling pathway(Kegg),Alanine and aspartate metabolism(Kegg),Alkaloid biosynthesis I(Kegg),Alkaloid biosynthesis II(Kegg),Allograft rejection(Kegg),...,vegf hypoxia and angiogenesis(BioCarta),visceral fat deposits and the metabolic syndrome(BioCarta),visual signal transduction(BioCarta),west nile virus(BioCarta),wnt lrp6 signalling(BioCarta),wnt signaling pathway(BioCarta),y branching of actin filaments(BioCarta),yaci and bcma stimulation of b cell immune responses(BioCarta),-arrestins in gpcr desensitization(BioCarta),state
053l1_AAACCTGAGATGTCGG-1,0,0.255767,0.098922,0.14997,0.981817,0.530502,0.195352,0.070881,0,0.048225,...,0.064926,0.37408,0.004166,0.169582,0,0.001877,0.25696,0,0.026901,terminal exhausted
053l1_AAACCTGAGCAACGGT-1,0,0.237718,0.046336,0.145388,0.975889,0.544321,0.187429,0.051931,0,0.03468,...,0.061697,0.35308,0.003913,0.162971,0,0.001905,0.13747,0,0.090531,effector


### Select Different Classifiers

In [None]:
# Hold out 25% of the rows, stratified by class, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=1
)

# Estimator used to score candidate feature subsets: an L1-penalised linear SVM.
# Other candidates kept for reference:
#   KNeighborsClassifier(n_neighbors=6)
#   ExtraTreesClassifier(n_estimators=100)
#   RandomForestClassifier(criterion="entropy")
#   GaussianNB()
lsvc = LinearSVC(penalty='l1', dual=False, C=2)

# Floating forward selection up to 3 features, scored on accuracy.
# cv=0 turns cross-validation off in mlxtend; n_jobs=-1 uses all CPU cores.
sfs1 = SFS(
    lsvc,
    k_features=3,
    forward=True,
    floating=True,
    verbose=2,
    scoring='accuracy',
    cv=0,
    n_jobs=-1,
)

sfs1 = sfs1.fit(X_train, y_train)
# Recorded from an earlier run:
# [2023-03-19 19:32:42] Features: 3/3 -- score: 0.9203140972028622 == sfs1.k_score_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 581 out of 581 | elapsed:   44.3s finished

[2024-12-06 21:14:58] Features: 1/3 -- score: 0.800854939132051[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 580 out of 580 | elapsed:  1.7min finished

[2024-12-06 21:16:43] Features: 2/3 -- score: 0.8909952606635071[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.0min
[Para

In [12]:
# Convert the selector's feature names to integer column positions.
best_features = list(map(int, sfs1.k_feature_names_))
best_features
# Recorded result: [87, 189, 503]

[87, 189, 503]

In [13]:
# Look up the column labels of `data` at the selected positions.
best_features_names = data.columns.take(best_features)
best_features_names
# Recorded result:
# Index(['ECM-receptor interaction(Kegg)', 'Natural killer cell mediated cytotoxicity(Kegg)', 'rb tumor suppressor/checkpoint signaling in response to dna damage(BioCarta)'], dtype='object')

Index(['ECM-receptor interaction(Kegg)',
       'Natural killer cell mediated cytotoxicity(Kegg)',
       'rb tumor suppressor/checkpoint signaling in response to dna damage(BioCarta)'],
      dtype='object')

In [None]:
# Column labels at three fixed positions (34, 189, 430).
# NOTE(review): presumably a feature subset from another selection run; confirm.
data.columns.take([34, 189, 430])
# Recorded result:
# Index(['Aurora B signaling(NCI/Nature)', 'Natural killer cell mediated cytotoxicity(Kegg)', 'il2_stat5_test3(rotem)'], dtype='object')

In [None]:
# Reduce both splits to the columns chosen by the selector.
X_train_sfs, X_test_sfs = sfs1.transform(X_train), sfs1.transform(X_test)

# Refit the linear SVM on the reduced training set, then predict the test labels.
lsvc.fit(X_train_sfs, y_train)
y_pred = lsvc.predict(X_test_sfs)

# Accuracy = number of matching predictions / number of predictions.
acc = float((y_pred == y_test).sum()) / len(y_pred)
print('Test set accuracy: %.2f %%' % (acc * 100))
# Recorded result: Test set accuracy: 93.85 %