# PolyaClassifier negative control test

**Purpose**: To investigate the effects of distinct negative control sets on PolyaClassifier performance.


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
# NOTE(review): os, glob and pd are used in later cells but are never imported in this notebook;
# presumably notebook_setup.py supplies them (%run -i executes it in the notebook namespace) — confirm.
%run -i notebook_setup.py

## IMPORTS AND SETUP

In [4]:
import paper_utilities
from paper_utilities import models

from functools import reduce


In [5]:
# Root project directory; per-species model folders are looked up beneath it.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this mount exists.
PROJECT   = "/projects/b1080/eks/polyadenylation/yeast"
# Output directory for this analysis.
OUTDIR    = os.path.join(PROJECT, 'manuscript', 'analysis', 'polyaclassifier_model_negative_control_comparison')
# 'resources' folder that sits next to OUTDIR (same parent directory).
RESOURCES = os.path.join(os.path.dirname(OUTDIR), 'resources')
# exist_ok = True keeps this cell safe to re-run once the directory exists.
os.makedirs(OUTDIR, exist_ok = True)


## HELPER FUNCTIONS

In [6]:
def compare_configuration(row, config):
    """Check whether a row matches every setting in a configuration.

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys of `config` (in this notebook, one row
        of a summary DataFrame passed by `DataFrame.apply(..., axis = 1)`).
    config : dict
        Expected value for each field name.

    Returns
    -------
    bool
        True when `row[key] == value` for every item of `config`; also True
        when `config` is empty.

    Raises
    ------
    KeyError
        If a key of `config` that gets evaluated is missing from `row`.
    """
    # A generator lets all() stop at the first mismatch and avoids building a
    # temporary list; iterating items() avoids a second lookup per key.
    return all(row[key] == expected for key, expected in config.items())


In [7]:
def fetch_split_data_predictions(species, model_stamp, split = 'val'):
    """Load the predictions table saved for one model checkpoint and data split.

    Parameters
    ----------
    species : str
        Species folder name under PROJECT.
    model_stamp : str
        Prefix of the checkpoint directory name (matched as `<model_stamp>*`).
    split : str, optional
        Data split prefix of the file name; defaults to 'val'.

    Returns
    -------
    pandas.DataFrame
        The tab-separated predictions file, read with `pd.read_csv`.

    Raises
    ------
    ValueError
        If the glob pattern matches no file, or more than one file.
    """

    # Glob pattern: the checkpoint directory and the threshold part of the
    # file name are wildcards, so exactly one hit is expected.
    pattern = os.path.join(
        PROJECT, species, 'model', 'classification_only', 'checkpoints',
        f'{model_stamp}*', 'evaluations', 'epochs',
        f'{split}_data.with_predictions.*threshold*.txt',
    )
    matches = glob.glob(pattern)
    n_matches = len(matches)

    if n_matches == 0:
        raise ValueError(f"No prediction file found for {species} {model_stamp}: {str(matches)}")
    elif n_matches > 1:
        raise ValueError(f"Multiple prediction files found for {species} {model_stamp}: {str(matches)}")

    prediction_file = matches[0]
    return pd.read_csv(prediction_file, sep = "\t")


## ANALYSIS

### *S. cerevisiae*

In [8]:
# Settings that identify the S. cerevisiae models of interest; each key is
# compared against the same-named field of a summary-table row.
scer_config = {
    "genomic_As": "redistA",
    "architecture": "classification",
    "sequence_size": 500,
    "conv_units": 512,
    "lstm_units": 16,
    "conv_shape": 12,
    "bin_dropout": 0.5,
    "learning_rate": 0.001,
}


In [9]:
# Read the S. cerevisiae replicate summary table.
scer_summary_path = os.path.join(RESOURCES, 'polyaclassifier_replicate_summary.saccharomyces_cerevisiae.txt')
scer_data = pd.read_csv(scer_summary_path, sep = "\t")

# Keep rows that match the selected configuration and belong to a 'special_*'
# model type, leaving out 'special_pstest'.
scer_matches_config = scer_data.apply(lambda row : compare_configuration(row, scer_config), axis = 1)
scer_is_special     = scer_data['modeltype'].str.startswith('special_')
scer_not_pstest     = scer_data['modeltype'] != 'special_pstest'

scer_data = scer_data.loc[scer_matches_config & scer_is_special & scer_not_pstest]
print(scer_data.shape)


(40, 21)


#### Compile performance metrics for replicate models, models trained on unique negative control sets, and bagged models

In [10]:
# Split the filtered summary by model type and tag each subset with a category label.
# .assign returns a new frame, so the source table is left untouched.
rep_data = scer_data.loc[scer_data['modeltype'] == 'special_replicates'].assign(category = 'replicate')
ncs_data = scer_data.loc[scer_data['modeltype'] == 'special_negatives'].assign(category = 'negatives')
bag_data = scer_data.loc[scer_data['modeltype'] == 'special_bagging'].assign(category = 'bagging')

# Stack the three subsets and report how many models fall into each category.
scer_model_res = pd.concat(
    [rep_data, ncs_data, bag_data],
    ignore_index = True,
    sort = False,
)
print(scer_model_res.shape)
print(scer_model_res['category'].value_counts())


(40, 22)
bagging      20
negatives    10
replicate    10
Name: category, dtype: int64


#### Replicate models

In [11]:
# One record per replicate model: AUROC / AUPRC computed on its test-split predictions.
rep_records = []

for rstamp in rep_data['stamp']:

    rdata = fetch_split_data_predictions('saccharomyces_cerevisiae', rstamp, 'test')

    # NOTE(review): assumes pred_class holds continuous scores rather than thresholded labels — confirm.
    auroc, auprc = models.calculate_auc(rdata['obs_class'], rdata['pred_class'])

    rep_records.append({
        'category' : 'Replicates',
        'stamp'    : rstamp,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
rep_results = pd.DataFrame(rep_records, columns = ['category', 'stamp', 'auroc', 'auprc'])


#### Negative models

In [12]:
# One record per negative-control model: AUROC / AUPRC computed on its test-split predictions.
ncs_records = []

for nstamp in ncs_data['stamp']:

    ndata = fetch_split_data_predictions('saccharomyces_cerevisiae', nstamp, 'test')

    # NOTE(review): assumes pred_class holds continuous scores rather than thresholded labels — confirm.
    auroc, auprc = models.calculate_auc(ndata['obs_class'], ndata['pred_class'])

    ncs_records.append({
        'category' : 'Negatives',
        'stamp'    : nstamp,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
ncs_results = pd.DataFrame(ncs_records, columns = ['category', 'stamp', 'auroc', 'auprc'])


#### Bagged models

In [13]:
# Score the bagged predictions for each seeded negative-control test set.
# The per-seed table gets its own name (bag_preds) so that `bag_data` — the
# summary rows selected in the "compile" cell above — is no longer overwritten
# with a different kind of table when this cell runs.
bag_seeds = ['12345', '24690', '37035', '49380', '61725', '74070', '86415', '98760', '111105', '123450']
bag_records = []

for bag_seed in bag_seeds:

    bag_preds = pd.read_csv(os.path.join(RESOURCES, 'polyaclassifier_negative_test', f'scer_negative_test_data_{bag_seed}.polyaclassifier_bagging3models.txt'), sep = '\t')

    # Observed label: positive when the site has at least one supporting read.
    bag_preds['obs_class'] = (bag_preds['supporting_reads'] > 0).astype(int)

    auroc, auprc = models.calculate_auc(bag_preds['obs_class'], bag_preds['classification'])

    bag_records.append({
        'category' : 'Bagging',
        'stamp'    : bag_seed,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
bag_results = pd.DataFrame(bag_records, columns = ['category', 'stamp', 'auroc', 'auprc'])
    

#### Combining the comparison results

In [14]:
# Stack the three result tables, then summarise each metric per category.
scer_comparison = pd.concat([rep_results, ncs_results, bag_results], ignore_index = True, sort = False)
summary_stats = ['count', 'mean', 'std']

# .loc fixes the display order of the categories.
(scer_comparison
 .groupby('category')
 .agg({'auroc' : summary_stats, 'auprc' : summary_stats})
 .loc[['Replicates', 'Negatives', 'Bagging']]
)


Unnamed: 0_level_0,auroc,auroc,auroc,auprc,auprc,auprc
Unnamed: 0_level_1,count,mean,std,count,mean,std
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Replicates,10,0.987842,0.000806,10,0.988411,0.00092
Negatives,10,0.985757,0.001146,10,0.985999,0.00124
Bagging,10,0.988607,0.000549,10,0.988384,0.000828


### *S. pombe*

In [15]:
# Settings that identify the S. pombe models of interest; each key is
# compared against the same-named field of a summary-table row.
spom_config = {
    "genomic_As": "redistA",
    "architecture": "classification1",
    "sequence_size": 500,
    "conv_units": 512,
    "lstm_units": 16,
    "conv_shape": 6,
    "bin_dropout": 0.5,
    "learning_rate": 0.001,
}


In [16]:
# Read the S. pombe replicate summary table.
spom_summary_path = os.path.join(RESOURCES, 'polyaclassifier_replicate_summary.schizosaccharomyces_pombe.txt')
spom_data = pd.read_csv(spom_summary_path, sep = "\t")

# Keep rows that match the selected configuration and belong to a 'special_*'
# model type, leaving out 'special_pstest'.
spom_matches_config = spom_data.apply(lambda row : compare_configuration(row, spom_config), axis = 1)
spom_is_special     = spom_data['modeltype'].str.startswith('special_')
spom_not_pstest     = spom_data['modeltype'] != 'special_pstest'

spom_data = spom_data.loc[spom_matches_config & spom_is_special & spom_not_pstest]
print(spom_data.shape)


(40, 21)


#### Compile performance metrics for replicate models, models trained on unique negative control sets, and bagged models

In [17]:
# Split the filtered summary by model type and tag each subset with a category label.
# .assign returns a new frame, so the source table is left untouched.
rep_data = spom_data.loc[spom_data['modeltype'] == 'special_replicates'].assign(category = 'replicate')
ncs_data = spom_data.loc[spom_data['modeltype'] == 'special_negatives'].assign(category = 'negatives')
bag_data = spom_data.loc[spom_data['modeltype'] == 'special_bagging'].assign(category = 'bagging')

# Stack the three subsets and report how many models fall into each category.
spom_model_res = pd.concat(
    [rep_data, ncs_data, bag_data],
    ignore_index = True,
    sort = False,
)
print(spom_model_res.shape)
print(spom_model_res['category'].value_counts())


(40, 22)
bagging      20
negatives    10
replicate    10
Name: category, dtype: int64


#### Replicate models

In [18]:
# One record per replicate model: AUROC / AUPRC computed on its test-split predictions.
rep_records = []

for rstamp in rep_data['stamp']:

    rdata = fetch_split_data_predictions('schizosaccharomyces_pombe', rstamp, 'test')

    # NOTE(review): assumes pred_class holds continuous scores rather than thresholded labels — confirm.
    auroc, auprc = models.calculate_auc(rdata['obs_class'], rdata['pred_class'])

    rep_records.append({
        'category' : 'Replicates',
        'stamp'    : rstamp,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
rep_results = pd.DataFrame(rep_records, columns = ['category', 'stamp', 'auroc', 'auprc'])


#### Negative models

In [19]:
# One record per negative-control model: AUROC / AUPRC computed on its test-split predictions.
ncs_records = []

for nstamp in ncs_data['stamp']:

    ndata = fetch_split_data_predictions('schizosaccharomyces_pombe', nstamp, 'test')

    # NOTE(review): assumes pred_class holds continuous scores rather than thresholded labels — confirm.
    auroc, auprc = models.calculate_auc(ndata['obs_class'], ndata['pred_class'])

    ncs_records.append({
        'category' : 'Negatives',
        'stamp'    : nstamp,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
ncs_results = pd.DataFrame(ncs_records, columns = ['category', 'stamp', 'auroc', 'auprc'])


#### Bagged models

In [20]:
# Score the bagged predictions for each seeded negative-control test set.
# The per-seed table gets its own name (bag_preds) so that `bag_data` — the
# summary rows selected in the "compile" cell above — is no longer overwritten
# with a different kind of table when this cell runs.
bag_seeds = ['12345', '24690', '37035', '49380', '61725', '74070', '86415', '98760', '111105', '123450']
bag_records = []

for bag_seed in bag_seeds:

    bag_preds = pd.read_csv(os.path.join(RESOURCES, 'polyaclassifier_negative_test', f'spom_negative_test_data_{bag_seed}.polyaclassifier_bagging3models.txt'), sep = '\t')

    # Observed label: positive when the site has at least one supporting read.
    bag_preds['obs_class'] = (bag_preds['supporting_reads'] > 0).astype(int)

    auroc, auprc = models.calculate_auc(bag_preds['obs_class'], bag_preds['classification'])

    bag_records.append({
        'category' : 'Bagging',
        'stamp'    : bag_seed,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
bag_results = pd.DataFrame(bag_records, columns = ['category', 'stamp', 'auroc', 'auprc'])
    

#### Combining the comparison results

In [21]:
# Stack the three result tables, then summarise each metric per category.
spom_comparison = pd.concat([rep_results, ncs_results, bag_results], ignore_index = True, sort = False)
summary_stats = ['count', 'mean', 'std']

# .loc fixes the display order of the categories.
(spom_comparison
 .groupby('category')
 .agg({'auroc' : summary_stats, 'auprc' : summary_stats})
 .loc[['Replicates', 'Negatives', 'Bagging']]
)


Unnamed: 0_level_0,auroc,auroc,auroc,auprc,auprc,auprc
Unnamed: 0_level_1,count,mean,std,count,mean,std
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Replicates,10,0.983483,0.001463,10,0.984448,0.001452
Negatives,10,0.982321,0.000914,10,0.98196,0.001033
Bagging,10,0.985885,0.001338,10,0.985751,0.001801


### *A. thaliana*

In [22]:
# Settings that identify the A. thaliana models of interest; each key is
# compared against the same-named field of a summary-table row.
atha_config = {
    "genomic_As": "redistA",
    "architecture": "classification2",
    "sequence_size": 500,
    "conv_units": 512,
    "lstm_units": 16,
    "conv_shape": 10,
    "bin_dropout": 0.5,
    "learning_rate": 0.001,
}


In [23]:
# Read the A. thaliana replicate summary table.
atha_summary_path = os.path.join(RESOURCES, 'polyaclassifier_replicate_summary.arabidopsis_thaliana.txt')
atha_data = pd.read_csv(atha_summary_path, sep = "\t")

# Keep rows that match the selected configuration and belong to a 'special_*'
# model type, leaving out 'special_pstest'.
atha_matches_config = atha_data.apply(lambda row : compare_configuration(row, atha_config), axis = 1)
atha_is_special     = atha_data['modeltype'].str.startswith('special_')
atha_not_pstest     = atha_data['modeltype'] != 'special_pstest'

atha_data = atha_data.loc[atha_matches_config & atha_is_special & atha_not_pstest]
print(atha_data.shape)


(40, 21)


#### Compile performance metrics for replicate models, models trained on unique negative control sets, and bagged models

In [24]:
# Split the filtered summary by model type and tag each subset with a category label.
# .assign returns a new frame, so the source table is left untouched.
rep_data = atha_data.loc[atha_data['modeltype'] == 'special_replicates'].assign(category = 'replicate')
ncs_data = atha_data.loc[atha_data['modeltype'] == 'special_negatives'].assign(category = 'negatives')
bag_data = atha_data.loc[atha_data['modeltype'] == 'special_bagging'].assign(category = 'bagging')

# Stack the three subsets and report how many models fall into each category.
atha_model_res = pd.concat(
    [rep_data, ncs_data, bag_data],
    ignore_index = True,
    sort = False,
)
print(atha_model_res.shape)
print(atha_model_res['category'].value_counts())


(40, 22)
bagging      20
negatives    10
replicate    10
Name: category, dtype: int64


#### Replicate models

In [25]:
# One record per replicate model: AUROC / AUPRC computed on its test-split predictions.
rep_records = []

for rstamp in rep_data['stamp']:

    rdata = fetch_split_data_predictions('arabidopsis_thaliana', rstamp, 'test')

    # NOTE(review): assumes pred_class holds continuous scores rather than thresholded labels — confirm.
    auroc, auprc = models.calculate_auc(rdata['obs_class'], rdata['pred_class'])

    rep_records.append({
        'category' : 'Replicates',
        'stamp'    : rstamp,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
rep_results = pd.DataFrame(rep_records, columns = ['category', 'stamp', 'auroc', 'auprc'])


#### Negative models

In [26]:
# One record per negative-control model: AUROC / AUPRC computed on its test-split predictions.
ncs_records = []

for nstamp in ncs_data['stamp']:

    ndata = fetch_split_data_predictions('arabidopsis_thaliana', nstamp, 'test')

    # NOTE(review): assumes pred_class holds continuous scores rather than thresholded labels — confirm.
    auroc, auprc = models.calculate_auc(ndata['obs_class'], ndata['pred_class'])

    ncs_records.append({
        'category' : 'Negatives',
        'stamp'    : nstamp,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
ncs_results = pd.DataFrame(ncs_records, columns = ['category', 'stamp', 'auroc', 'auprc'])


#### Bagged models

In [27]:
# Score the bagged predictions for each seeded negative-control test set.
# The per-seed table gets its own name (bag_preds) so that `bag_data` — the
# summary rows selected in the "compile" cell above — is no longer overwritten
# with a different kind of table when this cell runs.
bag_seeds = ['12345', '24690', '37035', '49380', '61725', '74070', '86415', '98760', '111105', '123450']
bag_records = []

for bag_seed in bag_seeds:

    bag_preds = pd.read_csv(os.path.join(RESOURCES, 'polyaclassifier_negative_test', f'atha_negative_test_data_{bag_seed}.polyaclassifier_bagging3models.txt'), sep = '\t')

    # Observed label: positive when the site has at least one supporting read.
    bag_preds['obs_class'] = (bag_preds['supporting_reads'] > 0).astype(int)

    auroc, auprc = models.calculate_auc(bag_preds['obs_class'], bag_preds['classification'])

    bag_records.append({
        'category' : 'Bagging',
        'stamp'    : bag_seed,
        'auroc'    : auroc,
        'auprc'    : auprc,
    })

# Build the frame once from the collected records, fixing the column order.
bag_results = pd.DataFrame(bag_records, columns = ['category', 'stamp', 'auroc', 'auprc'])
    

#### Combining the comparison results

In [28]:
# Stack the three result tables, then summarise each metric per category.
atha_comparison = pd.concat([rep_results, ncs_results, bag_results], ignore_index = True, sort = False)
summary_stats = ['count', 'mean', 'std']

# .loc fixes the display order of the categories.
(atha_comparison
 .groupby('category')
 .agg({'auroc' : summary_stats, 'auprc' : summary_stats})
 .loc[['Replicates', 'Negatives', 'Bagging']]
)


Unnamed: 0_level_0,auroc,auroc,auroc,auprc,auprc,auprc
Unnamed: 0_level_1,count,mean,std,count,mean,std
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Replicates,10,0.993196,0.000197,10,0.992955,0.00027
Negatives,10,0.993152,0.00067,10,0.992694,0.000805
Bagging,10,0.994434,0.000375,10,0.994082,0.000627
