In [1]:
import argparse, sys, os, errno
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')


In [2]:
command_handlers = {}
def command_handler(f):
    command_handlers[f.__name__] = f
    return f

In [3]:
def select_samples_by_class(matrix, sample_classes, positive_class=None, negative_class=None):
    '''
    Args:
        matrix: 
            pandas DataFrame: [n_samples, n_features]
        sample_classes: 
            pandas Series. Index are sample ids. Values are sample classes.
    Returns:
        X: pandas DataFrame
        y: ndarray
    '''
    if (positive_class is not None) and (negative_class is not None):
        positive_class = positive_class.split(',')
        negative_class = negative_class.split(',')
    else:
        unique_classes = np.unique(sample_classes.values)
        if len(unique_classes) != 2:
            raise ValueError('expect 2 classes but {} classes found'.format(len(unique_classes)))
        positive_class, negative_class = unique_classes
    positive_class = np.atleast_1d(positive_class)
    negative_class = np.atleast_1d(negative_class)

    logger.info('positive class: {}, negative class: {}'.format(positive_class, negative_class))
    X_pos = matrix.loc[sample_classes[sample_classes.isin(positive_class)].index.values]
    X_neg = matrix.loc[sample_classes[sample_classes.isin(negative_class)].index.values]
    logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
        X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0])/X_neg.shape[0]))
    X = pd.concat([X_pos, X_neg], axis=0)
    y = np.zeros(X.shape[0], dtype=np.int32)
    y[X_pos.shape[0]:] = 1
    del X_pos
    del X_neg

    return X, y

In [4]:
@command_handler
def preprocess_features(args):
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler

    logger.info('read feature matrix: ' + args.matrix)
    X = pd.read_table(args.matrix, index_col=0, sep='\t')
    if args.transpose:
        logger.info('transpose feature matrix')
        X = X.T
    logger.info('{} samples, {} features'.format(X.shape[0], X.shape[1]))
    if args.remove_zero_features is not None:
        logger.info('remove features with zero fraction larger than {}'.format(args.remove_zero_features))
        X = X.loc[:, ~(np.isclose(X, 0).sum(axis=0) > (X.shape[0]*args.remove_zero_features))]
    if args.rpkm_top is not None:
        logger.info('select top {} features ranked by RPKM'.format(args.rpkm_top))
        feature_info = X.columns.to_series().str.split('|', expand=True)
        feature_info.columns = ['gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id', 'start', 'end']
        feature_info['start'] = feature_info['start'].astype('int')
        feature_info['end'] = feature_info['end'].astype('int')
        feature_info['length'] = feature_info['end'] - feature_info['start']
        rpkm = 1e3*X.div(feature_info['length'], axis=1)
        mean_rpkm = np.exp(np.log(rpkm + 0.01).mean(axis=0)) - 0.01
        features_select = mean_rpkm.sort_values(ascending=False)[:args.rpkm_top].index.values
        X = X.loc[:, features_select]
    elif args.expr_top is not None:
        logger.info('select top {} features ranked by raw expression value'.format(args.expr_top))
        mean_expr = np.exp(np.log(X + 0.01).mean(axis=0)) - 0.01
        features_select = mean_expr.sort_values(ascending=False)[:args.expr_top].index.values
        X = X.loc[:, features_select]

    feature_names = X.columns.values
    logger.info('{} samples, {} features'.format(X.shape[0], X.shape[1]))
    logger.info('sample: {} ...'.format(str(X.index.values[:3])))
    logger.info('features: {} ...'.format(str(X.columns.values[:3])))

    n_samples, n_features = X.shape
    sample_ids = X.index.values

    if args.use_log:
        logger.info('apply log2 to feature matrix')
        X = np.log2(X + 0.001)

    if args.scaler == 'zscore':
        logger.info('scale features using z-score normalization')
        X = StandardScaler().fit_transform(X)
    elif args.scaler == 'robust':
        logger.info('scale features using robust normalization')
        X = RobustScaler().fit_transform(X)
    elif args.scaler == 'min_max':
        logger.info('scale features using min-max normalization')
        X = MinMaxScaler().fit_transform(X)
    elif args.scaler == 'max_abs':
        logger.info('scale features using max-abs normalization')
        X = MaxAbsScaler().fit_transform(X)
    
    X = pd.DataFrame(X, index=sample_ids, columns=feature_names)
    X.index.name = 'sample'
    X.to_csv(args.output_file, sep='\t', header=True, index=True, na_rep='NA')


In [10]:
!pip install estimators

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting estimators
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/05/6c/ee058363a0ca44c18eb3b71010cdd479be1beeedbe23ec2ecf11b97d266d/estimators-0.1.0.dev0-py2.py3-none-any.whl
Collecting pympler==0.4.3 (from estimators)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/7c/4d/7da5db3fa5939f661b92d46b3918ae57449a8522507e6562c586a7491d0e/Pympler-0.4.3.tar.gz (166kB)
[K    100% |████████████████████████████████| 174kB 14.1MB/s a 0:00:01
[?25hCollecting SQLAlchemy==1.0.15 (from estimators)
[?25l  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/18/7d/f230ac50198cfe3cdc957c3572a18dc92600047ce707b5b923c56ab92c1b/SQLAlchemy-1.0.15.tar.gz (4.8MB)
[K    100% |████████████████████████████████| 4.8MB 7.3MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pympler, SQLAlchemy
  Running setup.py bdist_wheel for pympler ... [?25ldone
[?25h  Stored in directory: /home/xieyufeng/.cache/pip/wheel

In [20]:
cd ~/ex

/home/xieyufeng/ex


In [22]:
from ipywidgets import interact,interactive, FloatSlider,IntSlider, RadioButtons,Dropdown,Tab,Text
def interactive_config_settings(dataset,sequencing_type,classifier,value_change,example_cancer,reads_preprocess,stage_info):
    if sequencing_type == 'short':
        exp_mx_name = 'domains_combined'
    elif sequencing_type =='long':
        exp_mx_name = 'featurecounts'
    elif sequencing_type =='domain_only':
        exp_mx_name = 'domains_long'
    elif sequencing_type =='transcript':
        exp_mx_name = 'transcript'
    return dataset,sequencing_type,classifier,value_change,exp_mx_name,example_cancer,reads_preprocess,stage_info

widget =interactive(interactive_config_settings,
           dataset= ['scirep','exorbase','exosome_small','pico_3v3'],
           sequencing_type=['short','long','domain_only','transcript'],
           classifier = ['logistic_regression','linear_svm','random_forest','decision_tree','logistic_regression_l1'],
           value_change = ['any','up','down'],
        example_cancer=['Normal-CRC','Normal-PAAD','Normal-PRAD','Normal-HCC'],
                   reads_preprocess=[True,False],
                   stage_info = ['No Stage','With Stage'])  # if start from preprocessing
display(widget)
dataset,sequencing_type,classifier_use,value_change,exp_mx_name,example_cancer,reads_preprocess,stage_info = widget.result
dataset,sequencing_type,classifier_use,value_change,exp_mx_name,example_cancer,reads_preprocess,stage_info

('scirep',
 'short',
 'logistic_regression',
 'any',
 'domains_combined',
 'Normal-CRC',
 True,
 'No Stage')

In [23]:
original_mx_file = 'output/'+dataset+'/count_matrix/'+exp_mx_name+'.txt'
original_mx = pd.read_table(original_mx_file,index_col=0)

In [29]:
m = original_mx.T
feature_names = m.columns.values
feature_names

array(['hsa-let-7a-2-3p|miRNA|hsa-let-7a-2-3p|hsa-let-7a-2-3p|hsa-let-7a-2-3p|0|22',
       'hsa-let-7a-3p|miRNA|hsa-let-7a-3p|hsa-let-7a-3p|hsa-let-7a-3p|0|21',
       'hsa-let-7a-5p|miRNA|hsa-let-7a-5p|hsa-let-7a-5p|hsa-let-7a-5p|0|22',
       ...,
       'chrY_24209980_24210020_-|genomic|chrY_24209980_24210020_-|peak_5476|chrY|24209980|24210020',
       'chrY_25460120_25460160_+|genomic|chrY_25460120_25460160_+|peak_5477|chrY|25460120|25460160',
       'chrY_26274560_26274620_-|genomic|chrY_26274560_26274620_-|peak_5478|chrY|26274560|26274620'],
      dtype=object)

In [30]:
if dataset=='scirep':
    if stage_info =='No Stage':
        class_info = 'data/'+dataset+'/sample_classes.no_stage.txt'
    else:
        class_info = 'data/'+dataset+'/sample_classes.txt'

sample_classes = pd.read_table(class_info,sep='\t',index_col=0)

In [31]:
sample_classes = sample_classes.iloc[:, 0]
sample_classes = sample_classes.loc[m.index.values]
sample_classes

sample_id
Sample_1S3      Colorectal Cancer
Sample_1S6      Colorectal Cancer
Sample_1S9      Colorectal Cancer
Sample_1S12     Colorectal Cancer
Sample_1S15     Colorectal Cancer
Sample_1S18     Colorectal Cancer
Sample_1S21     Colorectal Cancer
Sample_1S24     Colorectal Cancer
Sample_4S2      Colorectal Cancer
Sample_4S5      Colorectal Cancer
Sample_4S8      Colorectal Cancer
Sample_4S11     Colorectal Cancer
Sample_4S14     Colorectal Cancer
Sample_4S17     Colorectal Cancer
Sample_4S20     Colorectal Cancer
Sample_4S23     Colorectal Cancer
Sample_N1         Healthy Control
Sample_N7         Healthy Control
Sample_N13        Healthy Control
Sample_N19        Healthy Control
Sample_N25        Healthy Control
Sample_N31        Healthy Control
Sample_N37        Healthy Control
Sample_N43        Healthy Control
Sample_2S1      Colorectal Cancer
Sample_2S4      Colorectal Cancer
Sample_2S7      Colorectal Cancer
Sample_2S10     Colorectal Cancer
Sample_2S13     Colorectal Cancer
Samp

In [33]:
positive_class = 'Colorectal Cancer,Prostate Cancer'
negative_class = 'Healthy Control'
positive_class = positive_class.split(',')
positive_class

['Colorectal Cancer', 'Prostate Cancer']

In [34]:
positive_class = 'Colorectal Cancer'
negative_class = 'Healthy Control'

In [36]:
positive_class = np.atleast_1d(positive_class)
negative_class = np.atleast_1d(negative_class)

In [37]:
X_pos = m.loc[sample_classes[sample_classes.isin(positive_class)].index.values]
X_neg = m.loc[sample_classes[sample_classes.isin(negative_class)].index.values]

In [48]:
X = pd.concat([X_pos, X_neg], axis=0)
y = np.zeros(X.shape[0], dtype=np.int32)
y[X_pos.shape[0]:] = 1
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [46]:
n_samples, n_features = X.shape
sample_ids = X.index.values

In [50]:
X = X.values
X

array([[   9,  283, 2460, ...,   16,   25,   16],
       [  14,  503, 6617, ...,    7,    9,   32],
       [   3,  214, 2140, ...,   11,   17,   14],
       ...,
       [   7,  655, 7057, ...,    1,    1,   13],
       [  18,  531, 5808, ...,    5,    7,   12],
       [  11,  837, 7514, ...,    2,    4,    8]])

In [52]:
estimator = None
grid_search = None
estimator = LogisticRegression()
grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
grid_search

{'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 100.0, 1000.0, 10000.0, 100000.0]}

In [53]:
splitter = KFold(n_splits=5)
metrics = []

In [56]:
predictions = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
predicted_labels = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
train_index_matrix = np.zeros((splitter.get_n_splits(X), X.shape[0]),dtype=np.bool)
feature_selection_matrix = None

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
      

In [58]:
X.shape[1]

19241

In [57]:
feature_selection_matrix = np.zeros((splitter.get_n_splits(X), X.shape[1]), dtype=bool)
feature_selection_matrix

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [62]:
rfe_step = 0.5
rfe_step = int(max(1, rfe_step*n_features))
rfe_scores = None

In [63]:
i_split = 0

In [78]:
from sklearn.metrics import roc_auc_score
scorer = roc_auc_score
data_splits = list(splitter.split(X, y))

In [79]:
data_splits.append((np.arange(n_samples), None))

In [80]:
data_splits

[(array([ 30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
          43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
          56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
          69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
          82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
          95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
         108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
         121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
         134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
         147, 148, 149]),
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])),
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27, 

In [82]:
for train_index, test_index in tqdm(data_splits, total=splitter.get_n_splits(X) + 1, unit='fold'):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]



  0%|          | 0/6 [00:00<?, ?fold/s][A
100%|██████████| 6/6 [00:00<00:00, 58.52fold/s][A
[A

In [84]:
cv = GridSearchCV(estimator, grid_search, cv=5)
cv.fit(X[train_index], y[train_index])

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 100.0, 1000.0, 10000.0, 100000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [86]:
estimator = cv.best_estimator_

In [88]:
sample_weight = np.ones(X_train.shape[0])

150

In [90]:
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [91]:
np.bincount(y)

array([100,  50])

In [89]:
from sklearn.utils.class_weight import compute_sample_weight
compute_sample_weight('balanced', y_train)

array([0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75,
       0.75, 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 ,
       1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 ,
       1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 ,
       1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 ,
       1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 , 1.5 ])

In [132]:
rfe = RFE(estimator, n_features_to_select=10, step=0.5)
rfe

RFE(estimator=LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=10, step=0.5, verbose=0)

In [94]:
i_split

0

In [133]:
step_score = lambda estimator, features: scorer(y_test, 
                            score_function(estimator)(X[test_index][:, features])[:, 1])
step_score

<function __main__.<lambda>(estimator, features)>

In [107]:
def score_function(estimator):
    '''Get method of an estimator that predict a continous score for each sample
    '''
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba
    elif hasattr(estimator, 'decision_function'):
        return estimator.decision_function
    else:
        raise ValueError('the estimator should either have decision_function() method or predict_proba() method')

In [111]:
estimator

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [124]:
hasattr(estimator, 'predict_proba')

True

In [125]:
hasattr(estimator, 'decision_function')

True

In [129]:
estimator.predict_proba(X[test_index][:, 1])[:,1]

array([0.00034662])

In [123]:
X[test_index][:, 1]

array([[  14,  503, 6617, ...,    7,    9,   32]])

In [109]:
score_function(estimator)(X[test_index][:, 1])

array([[9.99653377e-01, 3.46622868e-04]])

In [134]:
rfe._fit(X_train, y_train, step_score=step_score)

IndexError: index 150 is out of bounds for axis 1 with size 150

In [135]:
np.nonzero(rfe.ranking_ == 1)

AttributeError: 'RFE' object has no attribute 'ranking_'

In [12]:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score, accuracy_score, get_scorer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE, RFECV
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, LeaveOneOut, \
    RepeatedKFold, RepeatedStratifiedKFold, LeaveOneOut, StratifiedShuffleSplit
import pickle
#from estimators import RobustEstimator
from tqdm import tqdm
import h5py



    # select samples
if (args.positive_class is not None) and (args.negative_class is not None):
    positive_class = args.positive_class.split(',')
    negative_class = args.negative_class.split(',')
else:
    unique_classes = np.unique(sample_classes.values)
    if len(unique_classes) != 2:
        raise ValueError('expect 2 classes but {} classes found'.format(len(unique_classes)))
    positive_class, negative_class = unique_classes
positive_class = np.atleast_1d(positive_class)
negative_class = np.atleast_1d(negative_class)



if args.method == 'logistic_regression':
    estimator = LogisticRegression()
    grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
elif args.method == 'random_forest':
    estimator = RandomForestClassifier()
    grid_search = {'n_estimators': [25, 50, 75], 'max_depth': list(range(2, 8)) }
elif args.method == 'linear_svm':
    estimator = LinearSVC()
    grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
else:
    raise ValueError('unknown feature selection method: {}'.format(args.method))

def get_splitter(splitter, n_splits=5, n_repeats=5, test_size=0.2):
    if splitter == 'kfold':
        return KFold(n_splits=n_splits)
    elif splitter == 'stratified_kfold':
        return StratifiedKFold(n_splits=n_splits)
    elif splitter == 'repeated_stratified_kfold':
        return RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)
    elif splitter == 'shuffle_split':
        return ShuffleSplit(n_splits=n_splits, test_size=test_size)
    elif splitter == 'stratified_shuffle_split':
        return StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    elif splitter == 'leave_one_out':
        return LeaveOneOut()
    else:
        raise ValueError('unknown splitter: {}'.format(splitter))

def score_function(estimator):
    '''Get method of an estimator that predict a continous score for each sample
    '''
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba
    elif hasattr(estimator, 'decision_function'):
        return estimator.decision_function
    else:
        raise ValueError('the estimator should either have decision_function() method or predict_proba() method')

def feature_importances(estimator):
    '''Get feature importance attribute of an estimator
    '''
    if hasattr(estimator, 'coef_'):
        return np.ravel(estimator.coef_)
    elif hasattr(estimator, 'feature_importances_'):
        return np.ravel(estimator.feature_importances_)
    else:
        raise ValueError('the estimator should have either coef_ or feature_importances_ attribute')

def get_scorer(scoring):
    if scoring == 'roc_auc':
        return roc_auc_score
    else:
        raise ValueError('unknonwn scoring: {}'.format(scoring))


NameError: name 'logger' is not defined

In [13]:
from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

In [16]:
X

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [14]:
kf = KFold(n_splits=2)
kf

KFold(n_splits=2, random_state=None, shuffle=False)

In [17]:
vars(kf)

{'n_splits': 2, 'shuffle': False, 'random_state': None}

In [15]:
kf.split(X)

<generator object _BaseKFold.split at 0x7faea325eca8>