In [1]:
"""
Enhancer prediction pitfalls demo
Sean Whalen, Gladstone Institutes, Pollard Lab
sean.whalen at gladstone.ucsf.edu
"""

import pandas as pd

from sklearn.dummy import *
from sklearn.feature_selection import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import *
from statsmodels.stats.proportion import proportions_ztest

In [2]:
# Load the feature matrix and keep only the regions located on chr1 or chr2.
# Filtering and re-indexing are chained so that `features` is always a freshly
# built frame: nothing is mutated in place on a filtered slice, and re-running
# the cell gives the same result.
features = (
    pd.read_feather('cage-k562-features-hg38.feather')
    # anchored so only coordinates that *start* with "chr1:" / "chr2:" match
    .loc[lambda df: df['coord'].str.contains(r'^chr[12]:')]
    # region coordinates become the row index
    .set_index('coord')
)
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10959 entries, chr1:905311-906011 to chr2:242086009-242086316
Columns: 435 entries, ChIP-seq, ADNP, K562, ENCFF817ARM (ENCODE) to ChIP-seq, ZZZ3, K562, ENCFF945HJR (ENCODE)
dtypes: uint8(435)
memory usage: 4.6+ MB


In [3]:
# Read the label table (first csv column is the index), put its rows in the
# same order as the feature matrix, and collapse the result to a Series.
label_table = pd.read_csv('cage-k562-labels-hg38.csv', index_col=0)
labels = label_table.reindex(features.index).squeeze()

# class counts of the aligned labels
print(labels.value_counts())

0    10542
1      417
Name: k562, dtype: int64


In [4]:
# Chromosome of every region (the text before the first ':' of its
# coordinate), used as the group key for group k-fold cv.
# expand=False makes str.extract return a 1-D Series instead of a one-column
# DataFrame, which is the (n_samples,) shape scikit-learn expects for `groups`.
chroms = (
    features
    .index
    .to_series()
    .str.extract('^([^:]+)', expand = False)
)

In [5]:
# Two-fold stratified cv: every fold keeps the train/test class distribution,
# and samples are shuffled (fixed seed) before being assigned to folds.
shuffled_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)

In [6]:
# Cross-chromosome cv: samples that share a group (the chromosome, supplied
# via `groups` when the splitter is used) never appear in both the train and
# the test fold of the same split.
cc_cv = GroupKFold(n_splits=2)

In [7]:
# Baseline pipeline: filter out low-variance features, then predict the
# majority class from the class prior (no learning from the features).
baseline_steps = [
    VarianceThreshold(),
    DummyClassifier(strategy='prior'),
]
baseline_estimator = make_pipeline(*baseline_steps)

In [8]:
# Linear pipeline: filter out low-variance features, standardize the rest,
# and fit a linear (ridge) classifier on the scaled features.
linear_steps = (
    VarianceThreshold(),
    StandardScaler(),
    RidgeClassifier(),
)
linear_estimator = make_pipeline(*linear_steps)

In [9]:
# Same as the linear pipeline, but additionally keep only the top
# `selection_percentile` percent of features ranked by their univariate
# association with the label.
# NOTE: SelectPercentile scores features with the ANOVA F-statistic
# (f_classif, its default score function), not chi2; the score function is
# spelled out below so the method in use is visible.
# The performance difference of feature selection inside/outside cv is more
# dramatic with a penalized multivariate model: SelectPercentile() is used
# here for simplicity.
selection_percentile = 5
selection_lm_estimator = make_pipeline(
    VarianceThreshold(),
    SelectPercentile(score_func = f_classif, percentile = selection_percentile),
    StandardScaler(),
    RidgeClassifier()
)

In [10]:
# Selection-only pipeline (variance filter + percentile selection, no
# classifier), used to pick features outside of cv.
selection_steps = [
    VarianceThreshold(),
    SelectPercentile(percentile=selection_percentile),
]
selection_estimator = make_pipeline(*selection_steps)

In [11]:
# Accumulator for the cv runs: each run appends one row of metadata
# followed by its mean score.
all_scores = list()

In [12]:
# Shuffled cv, baseline model.
# `groups` is intentionally not passed: the stratified k-fold splitter ignores
# group labels, so supplying the chromosomes would wrongly suggest that they
# influence how samples are split here.
scores = cross_val_score(
    baseline_estimator,
    features,
    labels,
    cv = shuffled_cv,
    scoring = 'average_precision'
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
all_scores.append(['none', 'none', 'shuffled', 'baseline', scores.mean()])

In [13]:
# Cross-chromosome cv, baseline model: chromosomes are passed as the groups
# for the group k-fold splitter.
scores = cross_val_score(
    baseline_estimator, features, labels,
    groups=chroms, cv=cc_cv, scoring='average_precision',
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
run_record = ['none', 'none', 'cross-chromosome', 'baseline', scores.mean()]
all_scores.append(run_record)

In [14]:
# Shuffled cv, linear model.
# `groups` is intentionally not passed: the stratified k-fold splitter ignores
# group labels, so supplying the chromosomes would wrongly suggest that they
# influence how samples are split here.
scores = cross_val_score(
    linear_estimator,
    features,
    labels,
    cv = shuffled_cv,
    scoring = 'average_precision'
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
all_scores.append(['none', 'none', 'shuffled', 'linear', scores.mean()])

In [15]:
# Cross-chromosome cv, linear model: chromosomes are passed as the groups
# for the group k-fold splitter.
scores = cross_val_score(
    linear_estimator, features, labels,
    groups=chroms, cv=cc_cv, scoring='average_precision',
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
run_record = ['none', 'none', 'cross-chromosome', 'linear', scores.mean()]
all_scores.append(run_record)

In [16]:
# Feature selection INSIDE cv -> cross-chromosome cv linear model.
# The selector is a step of the evaluated pipeline, so it is part of what
# cross_val_score fits rather than a preprocessing step applied beforehand.
scores = cross_val_score(
    selection_lm_estimator, features, labels,
    groups=chroms, cv=cc_cv, scoring='average_precision',
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
run_record = ['none', 'feature selection', 'cross-chromosome', 'linear', scores.mean()]
all_scores.append(run_record)

In [17]:
# Feature selection OUTSIDE cv -> cross-chromosome cv linear model.
# The selector is fit once on every sample and its label, so the samples
# later held out as test folds have already influenced which features remain.
selected_features = selection_estimator.fit_transform(features, labels)

scores = cross_val_score(
    linear_estimator, selected_features, labels,
    groups=chroms, cv=cc_cv, scoring='average_precision',
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
run_record = ['feature selection', 'none', 'cross-chromosome', 'linear', scores.mean()]
all_scores.append(run_record)

In [18]:
# Balance classes outside cv -> cross-chromosome linear model.
# Every class is down-sampled (fixed seed) to the size of the rarest class
# before any train/test split is made.
minority_class_count = labels.value_counts().min()
balanced_labels = (
    labels
    .groupby(labels, group_keys = False)
    .apply(
        lambda x: x.sample(
            n = minority_class_count,
            random_state = 0
        )
    )
)
# feature rows for exactly the sampled regions, in the sampled order
balanced_features = features.reindex(balanced_labels.index)

# Chromosome of each retained region (text before the first ':').
# expand=False makes str.extract return a 1-D Series instead of a one-column
# DataFrame, which is the (n_samples,) shape scikit-learn expects for `groups`.
balanced_chroms = (
    balanced_features
    .index
    .to_series()
    .str.extract('^([^:]+)', expand = False)
)

scores = cross_val_score(
    linear_estimator,
    balanced_features,
    balanced_labels,
    groups = balanced_chroms,
    cv = cc_cv,
    scoring = 'average_precision'
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
all_scores.append(['balance classes', 'none', 'cross-chromosome', 'linear', scores.mean()])

In [19]:
# Balance classes AND select features outside cv -> cross-chromosome linear
# model. The selector is fit on the whole balanced sample before any split.
selected_balanced_features = selection_estimator.fit_transform(
    balanced_features, balanced_labels
)

scores = cross_val_score(
    linear_estimator, selected_balanced_features, balanced_labels,
    groups=balanced_chroms, cv=cc_cv, scoring='average_precision',
)
# record: outside-cv steps, inside-cv steps, cv type, model type, mean auPR
run_record = [
    'balance classes, feature selection',
    'none',
    'cross-chromosome',
    'linear',
    scores.mean(),
]
all_scores.append(run_record)

In [20]:
# Tabulate every recorded cv run: one row per run, with columns describing
# the setup followed by the mean score.
score_columns = [
    'outside cv steps',
    'inside cv steps',
    'cv type',
    'model type',
    'mean auPR',
]
stats = pd.DataFrame(all_scores, columns=score_columns)
print(stats)

                     outside cv steps    inside cv steps           cv type  \
0                                none               none          shuffled   
1                                none               none  cross-chromosome   
2                                none               none          shuffled   
3                                none               none  cross-chromosome   
4                                none  feature selection  cross-chromosome   
5                   feature selection               none  cross-chromosome   
6                     balance classes               none  cross-chromosome   
7  balance classes, feature selection               none  cross-chromosome   

  model type  mean auPR  
0   baseline   0.038051  
1   baseline   0.037744  
2     linear   0.348435  
3     linear   0.287322  
4     linear   0.431820  
5     linear   0.454918  
6     linear   0.684962  
7     linear   0.817434  


In [21]:
# Adversarial model by chromosome: try to predict from the features whether
# a region's coordinate starts with "chr1:" (label 1) or not (label 0).
region_coords = features.index.to_series()
chromosome_labels = region_coords.str.startswith('chr1:').astype(int)

scores = cross_val_score(
    linear_estimator, features, chromosome_labels,
    cv=5, scoring='average_precision',
)
print(f'adversarial performance: {scores.mean():.2f}')

adversarial performance: 0.58


In [22]:
# Statistical test for a difference in peak counts between chromosomes, run
# on the features that the linear model weights most heavily.
linear_estimator.fit(features, labels)

# The classifier's coefficients correspond to the columns that survive the
# variance filter (the scaler in between does not drop columns), so label
# them with those column names.
coefs = pd.Series(
    linear_estimator.named_steps['ridgeclassifier'].coef_[0],
    index = features.columns[linear_estimator.named_steps['variancethreshold'].get_support()]
)
# feature names ordered by decreasing absolute coefficient
coefs = (
    coefs
    .abs()
    .sort_values(ascending = False)
    .index
    .tolist()
)

# Group by the chromosome indicator directly instead of storing it as a new
# column of `features`: the feature matrix stays untouched, so re-running this
# cell (or any earlier cv cell) never treats the indicator as a feature.
chrom_key = chromosome_labels.rename('chrom')

# the five highest-ranked features (fewer if the model kept fewer than five)
for feature_name in coefs[:5]:
    # per chromosome group: sum of the feature column and number of regions
    counts = (
        features[feature_name]
        .groupby(chrom_key)
        .agg(['sum', 'size'])
    )

    # z-test comparing the two per-group proportions (sum / size)
    # NOTE(review): assumes each feature is a 0/1 indicator, so the column sum
    # is a count of regions with a peak -- confirm against the feature file.
    zstat, pvalue = proportions_ztest(counts['sum'], counts['size'])
    print(f'{zstat:.1f} {pvalue:.1e} {feature_name}')

-6.2 6.0e-10 ChIP-seq, ARNT, K562, ENCFF447FIO (ENCODE)
-5.3 1.4e-07 ChIP-seq, POLR2A, K562, ENCFF285MBX (ENCODE)
-5.0 7.1e-07 ChIP-seq, POLR2A, K562, ENCFF099NYA (ENCODE)
-4.4 1.0e-05 ChIP-seq, TAF1, K562, ENCFF453TIB (ENCODE)
-5.1 4.2e-07 ChIP-seq, POLR2A, K562, ENCFF730DLS (ENCODE)
