In [1]:
import numpy as np
import pandas as pd

In [6]:
def get_data_mat(in_df):
    x = in_df.iloc[:,:-5]
    y = in_df.iloc[:,-1]
    pat_id = in_df['PatientID'].values
    feat_names = np.array(x.columns.to_list())
    return x.values, y.values, feat_names, pat_id
    
def get_class_weight(y_train):
    
    from sklearn.utils.class_weight import compute_class_weight
    cls = np.unique(y_train)
    cls_weight = compute_class_weight('balanced', cls, y_train)
    class_weight_dict = dict(zip(cls, cls_weight))
    return class_weight_dict

# (108), (27)
def gen_train_test(x, y, test_ratio=0.2):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    stratify=y, 
                                                    test_size=test_ratio, random_state=123)
    return X_train, X_test, y_train, y_test

# Boruta selection
def boruta_sel(X, y):
    from sklearn.ensemble import RandomForestClassifier
    from boruta import BorutaPy
    
    rf = RandomForestClassifier(n_jobs=-1, class_weight=get_class_weight(y), max_depth=5)
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=1)
    feat_selector.fit(X, y)
    supp_ = feat_selector.support_
    
    # return columns index
    return np.arange(X.shape[1])[supp_]

# chi-square percentile
# X is MinMaxScaler() and non-negative
def chi_perc_sel(X, y, percentile=20):
    
    from sklearn.feature_selection import SelectPercentile, chi2
    selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
    supp_ = selector.get_support()
    
    # return columns index
    return np.arange(X.shape[1])[supp_]

# mutual-information
# X is StandardScaler()
def mi_perc_sel(X, y, percentile=20):
    
    from sklearn.feature_selection import SelectPercentile, mutual_info_classif
    selector = SelectPercentile(mutual_info_classif, percentile=percentile).fit(X, y)
    supp_ = selector.get_support()
    
    # return columns index
    return np.arange(X.shape[1])[supp_]


# forward selection
# two slow
def forw_sel(X, y, top_k_ratio=0.2):
    
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from sklearn.svm import SVC
    
    sfs = SFS(SVC(kernel="linear", class_weight=get_class_weight(y)),
              k_features=int(X.shape[1]*top_k_ratio), 
              forward=True, 
              floating=False,
              scoring='f1_weighted',#balanced_accuracy, f1_weighted
              cv=5,
              n_jobs=-1)
    sfs = sfs.fit(X, y)
    return sfs.k_feature_idx_

# RFE
# method candidate
def rfe_sel(X, y,top_k_ratio=0.2):
    
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC
    
    selector = RFE(SVC(kernel="linear", class_weight=get_class_weight(y)),
                   n_features_to_select=int(X.shape[1]*top_k_ratio), 
                   step=1).fit(X, y)
    supp_ = selector.support_
    
    return np.arange(X.shape[1])[supp_]

def feat_sel_comb(in_df, method='mi', percentile=30):
    x, y, feat_names, pat_id = get_data_mat(in_df)
    bor_s = boruta_sel(x, y)
    if method == 'chi':
        chi_sel = chi_perc_sel(x, y, percentile=percentile)
    else:
        chi_sel = mi_perc_sel(x, y, percentile=percentile)
    comb_feats_index = np.array(sorted(list(set.intersection(set(bor_s),set(chi_sel)))))
    sel_df = pd.DataFrame(data=x[:,comb_feats_index], columns=feat_names[comb_feats_index])
    sel_df['label'] = y
    sel_df['pat_id'] = pat_id
    return sel_df

In [4]:
# get full normalized data
norm_lipid_df=pd.read_csv('f_data/norm_full_lipid_df_135.csv', sep='\t')
norm_metabolic_df=pd.read_csv('f_data/norm_full_metabolic_df_135.csv', sep='\t')
norm_protein_df=pd.read_csv('f_data/norm_full_protein_df_135.csv', sep='\t')
norm_mrna_df=pd.read_csv('f_data/norm_full_mrna_df_135.csv', sep='\t')

sel_lipid_df = feat_sel_comb(norm_lipid_df)
sel_metabolic_df = feat_sel_comb(norm_metabolic_df)
sel_protein_df = feat_sel_comb(norm_protein_df)
sel_mrna_df = feat_sel_comb(norm_mrna_df)

sel_lipid_df.to_csv('f_data/full_sel_lipid_df_mi_135.csv', index=False, sep='\t')
sel_metabolic_df.to_csv('f_data/full_sel_metabolic_df_mi_135.csv', index=False, sep='\t')
sel_protein_df.to_csv('f_data/full_sel_protein_df_mi_135.csv', index=False, sep='\t')
sel_mrna_df.to_csv('f_data/full_sel_mrna_df_mi_135.csv', index=False, sep='\t')

 2 2 1 3 0 0 0 0 0 0 0 0 1 2 3 1 3 0 2 2 0 2 2 2 1 2 2 2 1 3 1 0 0 0 0 0 0
 0 0 3 0 0 0 1 3 0 0 0 0 0 0 3 0 2 1 1 2 0 2 2 2 3 2 1 1 2 1 1 0 0 0 0 0 0
 0 0 0 0 3 2 3 3 1 1 1 1 1 1 1 1 1 0 0 3 2 2 2 1] as keyword args. From version 0.25 passing these as positional arguments will result in an error
 2 2 1 3 0 0 0 0 0 0 0 0 1 2 3 1 3 0 2 2 0 2 2 2 1 2 2 2 1 3 1 0 0 0 0 0 0
 0 0 3 0 0 0 1 3 0 0 0 0 0 0 3 0 2 1 1 2 0 2 2 2 3 2 1 1 2 1 1 0 0 0 0 0 0
 0 0 0 0 3 2 3 3 1 1 1 1 1 1 1 1 1 0 0 3 2 2 2 1] as keyword args. From version 0.25 passing these as positional arguments will result in an error
 2 2 1 3 0 0 0 0 0 0 0 0 1 2 3 1 0 0 2 2 0 2 2 2 1 2 2 2 1 3 1 0 0 3 0 0 0
 0 0 3 0 0 0 1 3 0 0 0 0 0 0 3 0 2 1 1 2 0 2 2 2 3 2 1 1 2 1 1 0 0 0 0 0 0
 0 0 0 0 3 2 3 3 1 1 1 1 1 1 1 1 1 0 0 3 2 2 2 1] as keyword args. From version 0.25 passing these as positional arguments will result in an error
 2 2 1 3 0 0 0 0 0 0 0 0 1 2 3 1 3 0 2 2 0 2 2 2 1 2 2 2 1 3 1 0 0 0 0 0 0
 0 0 3 0 0 0 1 3 0 0 0 0 0 0 3 0 2

In [5]:
sel_lipid_df.shape[1]-2,sel_metabolic_df.shape[1]-2, sel_protein_df.shape[1]-2, sel_mrna_df.shape[1]-2

(50, 73, 108, 143)

In [7]:
# get train normalized data
norm_train_lipid_df=pd.read_csv('f_data/norm_train_lipid_df_108.csv', sep='\t')
norm_train_metabolic_df=pd.read_csv('f_data/norm_train_metabolic_df_108.csv', sep='\t')
norm_train_protein_df=pd.read_csv('f_data/norm_train_protein_df_108.csv', sep='\t')
norm_train_mrna_df=pd.read_csv('f_data/norm_train_mrna_df_108.csv', sep='\t')

norm_sel_lipid_df = feat_sel_comb(norm_train_lipid_df)
norm_sel_metabolic_df = feat_sel_comb(norm_train_metabolic_df)
norm_sel_protein_df = feat_sel_comb(norm_train_protein_df)
norm_sel_mrna_df = feat_sel_comb(norm_train_mrna_df)

norm_sel_lipid_df.to_csv('f_data/train_norm_sel_lipid_df_mi_108.csv', index=False, sep='\t')
norm_sel_metabolic_df.to_csv('f_data/train_norm_sel_metabolic_df_mi_108.csv', index=False, sep='\t')
norm_sel_protein_df.to_csv('f_data/train_norm_sel_protein_df_mi_108.csv', index=False, sep='\t')
norm_sel_mrna_df.to_csv('f_data/train_norm_sel_mrna_df_mi_108.csv', index=False, sep='\t')

 0 0 1 2 3 1 3 0 2 0 2 2 2 1 2 2 1 3 1 0 0 0 0 0 0 3 0 0 3 0 0 0 3 0 2 1 1
 2 0 2 2 2 3 2 1 2 1 1 0 0 0 0 0 0 0 0 3 3 1 1 1 1 1 1 1 0 0 3 2 2 1] as keyword args. From version 0.25 passing these as positional arguments will result in an error
 0 0 1 2 3 1 3 0 2 0 2 2 2 1 2 2 1 3 1 0 0 0 0 0 0 3 0 0 3 0 0 0 3 0 2 1 1
 2 0 2 2 2 3 2 1 2 1 1 0 0 0 0 0 0 0 0 3 3 1 1 1 1 1 1 1 0 0 3 2 2 1] as keyword args. From version 0.25 passing these as positional arguments will result in an error
 0 0 1 2 3 1 0 0 2 0 2 2 2 1 2 2 1 3 1 0 3 0 0 0 0 3 0 0 3 0 0 0 3 0 2 1 1
 2 0 2 2 2 3 2 1 2 1 1 0 0 0 0 0 0 0 0 3 3 1 1 1 1 1 1 1 0 0 3 2 2 1] as keyword args. From version 0.25 passing these as positional arguments will result in an error
 0 0 1 2 3 1 3 0 2 0 2 2 2 1 2 2 1 3 1 0 0 0 0 0 0 3 0 0 3 0 0 0 3 0 2 1 1
 2 0 2 2 2 3 2 1 2 1 1 0 0 0 0 0 0 0 0 3 3 1 1 1 1 1 1 1 0 0 3 2 2 1] as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [8]:
norm_sel_lipid_df.shape[1]-2,norm_sel_metabolic_df.shape[1]-2, norm_sel_protein_df.shape[1]-2, norm_sel_mrna_df.shape[1]-2


(42, 64, 98, 93)