In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE,RFECV,SelectKBest
from sklearn.preprocessing import minmax_scale
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc
import seaborn as sns
import matplotlib.pyplot as plt

from cycifsuite.get_data import read_synapse_file
from rfpimp import importances
%load_ext autoreload
%autoreload 2

# Feature selection based on sampling and ensembling

In [2]:
def data_preprocessing(path, fname, ligand_name=None, sample_size=None, excluding_cells=None):
    '''Load one feature table, clean it, and optionally draw a random sample of rows.

    Steps: change the working directory to `path`, read `fname` with
    `pd.read_hdf`, drop columns whose standard deviation is exactly 0,
    median-impute missing values, then (if requested) sample rows.

    Parameters
    ----------
    path : str
        Directory containing `fname`. The process working directory is
        changed to it as a side effect.
    fname : str
        Name of the HDF file to read.
    ligand_name : str, optional
        Label repeated once per returned row to build `y_vector`.
    sample_size : int or float, optional
        int: number of rows to draw. float: fraction of the eligible rows
        to draw. None: keep every row (`excluding_cells` is then not applied).
    excluding_cells : iterable, optional
        Index labels that must never be sampled.

    Returns
    -------
    x : pandas.DataFrame
        Cleaned (and possibly sampled) feature table.
    y_vector : list or None
        `[ligand_name] * len(x)`, or None when `ligand_name` is None.

    Notes
    -----
    Rows are drawn without replacement, so a row label is picked at most once
    and a request larger than the eligible pool is capped at the pool size.
    '''
    os.chdir(path)
    x = pd.read_hdf(fname)

    # Constant columns carry no information for the downstream selectors.
    zero_std_cols = x.columns[x.std() == 0]
    x = x.drop(zero_std_cols, axis=1)

    imputer = SimpleImputer(strategy='median')
    x.loc[:, :] = imputer.fit_transform(x)

    # Sampling for easier feature selection
    if sample_size is not None:
        # Restrict the draw to rows that are not explicitly excluded.
        excluded = set(excluding_cells) if excluding_cells is not None else set()
        sample_pool = x.index[~x.index.isin(excluded)]
        pool_size = len(sample_pool)
        if isinstance(sample_size, float):
            sample_size = int(sample_size * pool_size)
        # Without replacement we cannot draw more rows than the pool holds.
        sample_size = min(sample_size, pool_size)
        sample_idx = np.random.choice(
            sample_pool, size=sample_size, replace=False)
        x = x.loc[sample_idx]

    # make y_vector
    y_vector = None
    if ligand_name is not None:
        y_vector = [ligand_name] * x.shape[0]
    return x, y_vector

In [3]:
# Folder holding one HDF feature table per ligand.
path = 'd:/data/MCF10A 090718 data/'
os.chdir(path)
# Sorted so the ligand order (and hence the row order of x and y) does not
# depend on the order in which the file system lists the directory.
feature_files = sorted(
    name for name in os.listdir() if ('plate_6_txt_' in name) and 'all' not in name)
if not feature_files:
    raise FileNotFoundError('No plate_6_txt_ feature files found in {}'.format(path))
pooled_metadata = pd.read_csv(read_synapse_file('syn17902177'),index_col=0)
print(feature_files)

# Seed NumPy's global generator so the per-ligand sampling and the
# train/test split below are repeatable on a fresh Run All.
random_seed = 0
np.random.seed(random_seed)

# Collect the per-ligand tables first and concatenate once at the end;
# DataFrame.append no longer exists in pandas >= 2.0.
frames = []
y = []
for fn in feature_files:
    # The ligand is the last underscore-separated token of the file stem,
    # e.g. 'plate_6_txt_features_BMP2.hdf' -> 'BMP2'.
    ligand = os.path.splitext(fn)[0].split('_')[-1]
    print('Processing {}'.format(ligand))
    _x, _y = data_preprocessing(path,fn,ligand_name=ligand, sample_size=1500)
    frames.append(_x)
    y+=_y

# Keep only the columns present in every table, in the first table's order.
shared_cols = frames[0].columns
for frame in frames[1:]:
    shared_cols = shared_cols[shared_cols.isin(frame.columns)]
x = pd.concat([frame[shared_cols] for frame in frames])

# NOTE(review): the scaling uses the min/max of all rows, including the rows
# that end up in the test split below.
x.loc[:,:] = minmax_scale(x)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=random_seed)
num_features = 50

Welcome, Yunguan Wang!

['plate_6_txt_features_BMP2.hdf', 'plate_6_txt_features_EGF.hdf', 'plate_6_txt_features_HGF.hdf', 'plate_6_txt_features_IFNG.hdf', 'plate_6_txt_features_OSM.hdf', 'plate_6_txt_features_PBS.hdf', 'plate_6_txt_features_TGFB.hdf']
Processing BMP2
Processing EGF
Processing HGF
Processing IFNG
Processing OSM
Processing PBS
Processing TGFB


## Get feature importance based on ANOVA in the training set

In [4]:
# Anova: keep the num_features best-scoring columns, scored on the training split.
sk = SelectKBest(k=num_features).fit(x_train, y_train)
# get_support() is a boolean mask over the columns; turn it into column names.
sk_fs = x.columns[sk.get_support()].tolist()


  f = msb / msw



## Get feature importance and top features on classical RF

In [5]:
# Baseline: a 100-tree random forest trained on every feature.
print(x_train.shape)
full_model = RandomForestClassifier(n_estimators=100)
full_model.fit(x_train, y_train)
y_pred = full_model.predict(x_test)
full_model_accuracy = acc(y_test, y_pred)
print('Full feature accuracy {}'.format(full_model_accuracy))

(8400, 5046)
Full feature accuracy 0.9514285714285714


In [6]:
# Importances reported by the full model, labelled with the feature names.
fi_full = pd.Series(data=full_model.feature_importances_, index=x.columns)
# Top num_features names after sorting the importances in decreasing order.
fi_full_ranked = fi_full.sort_values(ascending=False)
rf_fs = fi_full_ranked.index[:num_features].tolist()

## Get feature importance and top features based on RF permutation importance on a held-out validation split

In [7]:
# Hold out 20% of the training split to score the importances on.
(rfcv_x_train, rfcv_x_test,
 rfcv_y_train, rfcv_y_test) = train_test_split(x_train, y_train, test_size=0.20)

rfcv = RandomForestClassifier(n_estimators=100, n_jobs=4)
rfcv.fit(rfcv_x_train, rfcv_y_train)
rfcv_y_pred = rfcv.predict(rfcv_x_test)
print('Full feature accuracy {}'.format(acc(rfcv_y_test, rfcv_y_pred)))

# permutation
rfcv_y_test_series = pd.Series(rfcv_y_test)
fs_imp = importances(rfcv, rfcv_x_test, rfcv_y_test_series)
# NOTE(review): assumes fs_imp is ordered by decreasing importance, so its
# first num_features index entries are the top features - confirm against rfpimp.
rfpimp_fs = fs_imp.index[:num_features].tolist()

Full feature accuracy 0.9619047619047619


## Get top features using RFE with random forest for non-linear relationships

In [8]:
# Only the 500 features ranked highest by the full model enter the elimination.
rfe_input_features = fi_full.sort_values(ascending=False).index[:500]
sampled_x, sampled_y = x_train[rfe_input_features], y_train

# Drop 25 features per round until num_features remain.
estimator = RandomForestClassifier(n_estimators=100)
selector = RFE(
    estimator,
    n_features_to_select=num_features,
    step=25,
    verbose=1,
).fit(sampled_x, sampled_y)
rfe_fs_rf = sampled_x.columns[selector.support_]

Fitting estimator with 500 features.
Fitting estimator with 475 features.
Fitting estimator with 450 features.
Fitting estimator with 425 features.
Fitting estimator with 400 features.
Fitting estimator with 375 features.
Fitting estimator with 350 features.
Fitting estimator with 325 features.
Fitting estimator with 300 features.
Fitting estimator with 275 features.
Fitting estimator with 250 features.
Fitting estimator with 225 features.
Fitting estimator with 200 features.
Fitting estimator with 175 features.
Fitting estimator with 150 features.
Fitting estimator with 125 features.
Fitting estimator with 100 features.
Fitting estimator with 75 features.


## Get top features using RFE with logistic regression for linear relationships

In [9]:
# Same 500-feature starting pool as the RF-based elimination.
rfe_input_features = fi_full.sort_values(ascending=False).index[:500]
sampled_x, sampled_y = x_train[rfe_input_features], y_train

# Logistic regression as the ranking estimator; 25 features are dropped per
# round until num_features remain.
estimator = LogisticRegression(solver='newton-cg', multi_class='auto')
selector = RFE(
    estimator,
    n_features_to_select=num_features,
    step=25,
    verbose=1,
).fit(sampled_x, sampled_y)
rfe_fs_lr = sampled_x.columns[selector.support_]

Fitting estimator with 500 features.
Fitting estimator with 475 features.
Fitting estimator with 450 features.
Fitting estimator with 425 features.
Fitting estimator with 400 features.
Fitting estimator with 375 features.
Fitting estimator with 350 features.
Fitting estimator with 325 features.
Fitting estimator with 300 features.
Fitting estimator with 275 features.
Fitting estimator with 250 features.
Fitting estimator with 225 features.
Fitting estimator with 200 features.
Fitting estimator with 175 features.
Fitting estimator with 150 features.
Fitting estimator with 125 features.
Fitting estimator with 100 features.
Fitting estimator with 75 features.


## Summarize over all features

In [10]:
# One 0/1 column per selection method, marking the features that method picked.
method_picks = {
    'Anova': sk_fs,
    'RF': rf_fs,
    'RF_cv': rfpimp_fs,
    'RFE_rf': rfe_fs_rf,
    'RFE_lr': rfe_fs_lr,
}
all_features = pd.DataFrame(
    {col: x_train.columns.isin(fs_list).astype(float)
     for col, fs_list in method_picks.items()},
    index=x_train.columns,
)

# 'feature_rank' is the number of methods that selected each feature.
vote_totals = all_features.sum(axis=1).sort_values(ascending=False)
all_features = vote_totals.to_frame(name='feature_rank')

# Attach both importance scores so they can break ties between equal vote counts.
ranked_index = all_features.index
all_features['RF_cv_fi'] = fs_imp.loc[ranked_index].values
all_features['RF_fi'] = fi_full.loc[ranked_index].values
all_features = all_features.sort_values(
    by=['feature_rank', 'RF_cv_fi', 'RF_fi'], ascending=False)

best_features = all_features.index[:num_features].tolist()
# Features chosen by at least three / at least four of the five methods.
three_ab = all_features.index[all_features['feature_rank'] >= 3].tolist()
four_ab = all_features.index[all_features['feature_rank'] >= 4].tolist()

## Evaluate features on test set

In [11]:
# Each candidate feature set is scored by a fresh 100-tree forest trained on
# the training split and evaluated on the test split.
candidate_sets = [
    ('Anova', sk_fs),
    ('RF', rf_fs),
    ('RF_cv', rfpimp_fs),
    ('RFE_rf', rfe_fs_rf),
    ('RFE_lr', rfe_fs_lr),
    ('Best', best_features),
    ('Shared >=3', three_ab),
    ('Shared >=4', four_ab),
]
for col, sub_features in candidate_sets:
    fs_model = RandomForestClassifier(100)
    fs_model.fit(x_train[sub_features], y_train)
    y_pred = fs_model.predict(x_test[sub_features])
    subset_accuracy = acc(y_test, y_pred)
    print('{}: Selected {} feature accuracy {:.2f}'.format(col, len(sub_features), subset_accuracy))

# Reference point: the model that was trained on every feature.
y_pred = full_model.predict(x_test)
print('Full feature model accuracy {:.2f}'.format(acc(y_test, y_pred)))

Anova: Selected 50 feature accuracy 0.91
RF: Selected 50 feature accuracy 0.95
RF_cv: Selected 50 feature accuracy 0.95
RFE_rf: Selected 50 feature accuracy 0.96
RFE_lr: Selected 50 feature accuracy 0.96
Best: Selected 50 feature accuracy 0.95
Shared >=3: Selected 39 feature accuracy 0.95
Shared >=4: Selected 18 feature accuracy 0.93
Full feature model accuracy 0.95


In [12]:
from scipy.stats import chisquare as chi2

# Observed counts among the selected features:
# [names containing '_int_', all other names].
n_int_best = sum('_int_' in k for k in best_features)
f_ob = [n_int_best, len(best_features)-n_int_best]

# Background counts of the same two classes over every candidate feature in x.
n_int_all = sum('_int_' in k for k in x.columns)
background = [n_int_all, x.shape[1]-n_int_all]

# The chi-square statistic compares counts, so the expected frequencies must
# add up to the same total as the observed ones. Scale the background
# proportions to the number of selected features instead of passing the raw
# background counts, which sum to the full feature count.
n_selected = sum(f_ob)
n_background = sum(background)
f_exp = [count * n_selected / n_background for count in background]

pval = chi2(f_ob, f_exp)[1]
if pval >= 0.05:
    print('Chi-2 test pval: {:.3f}, no enrichment of either class in the best features.'.format(pval))
else:
    print('Chi-2 test pval: {:.3f}, abnormal class distribution in  best features.'.format(pval))
    # Direction of the deviation: compare the share of '_int_' features.
    if f_ob[0]/sum(f_ob)>f_exp[0]/sum(f_exp):
        print('Enrichment of int features observed')
    else:
        print('Enrichment of txt features observed')

Chi-2 test pval: 0.000, abnormal class distribution in  best features.
Enrichment of int features observed


In [13]:
# Append this run's best features to the log file as one comma-separated line.
best_features_line = ', '.join(best_features) + '\n'
with open('d:/data/Best_features.txt', 'a') as out_file:
    out_file.write(best_features_line)

## Permutation-based estimate of feature-set significance

In [14]:
# DISABLED CELL: every line below is commented out, so nothing runs here.
# Sketch of an empirical significance test: train a forest on best_features,
# then on 100 random feature sets of size num_features, and report the
# fraction of random sets whose test accuracy is at least as high.
# NOTE(review): inside the loop a new `test_model` is created, but the fit and
# predict calls use `fs_model`; settle which model is intended before
# re-enabling this cell.
# i=0
# x_train_sub = x_train[best_features]
# x_test_sub = x_test[best_features]
# fs_model = RandomForestClassifier(100)
# fs_model.fit(x_train_sub, y_train)
# y_pred = fs_model.predict(x_test_sub)
# best_fs_acc = acc(y_test, y_pred)
# acc_best_inferior = 0
# print('Selected best feature accuracy {:.2f}'.format(acc(y_test, y_pred)))
# while i < 100:
#     random_features = np.random.choice(x.columns, replace=False, size=num_features)
#     test_model = RandomForestClassifier(100)
#     fs_model.fit(x_train[random_features], y_train)
#     y_pred = fs_model.predict(x_test[random_features])
#     random_fs_acc = acc(y_test, y_pred)
#     if random_fs_acc>=best_fs_acc:
#         print('Randomly selected feature accuracy: {:.2f}'.format(random_fs_acc))
#         acc_best_inferior+=1
#     i+=1
# print('Pval of best feature : {}'.format(acc_best_inferior/100))