In [13]:
%reload_ext autoreload
%autoreload 2
import os
import pandas as pd
import pickle
import random
import numpy as np
from collections import defaultdict, Counter
import csv
from pathlib import Path
import sys
lib_dir = '../mylibs'
if lib_dir not in sys.path:
    sys.path.append(lib_dir)
    
import vhdb as vhdb
import features_predict as fp

### Load the input file with datasets to train and test and VHDB 

In [14]:
# Input file: every CSV row is turned into one tuple describing a dataset to run
subsetfile = '../inputs/Bact_holdout.csv'
# Stem of the input file name, used to name the results file announced below
out_file = Path(subsetfile).stem

label_info = pd.read_csv(subsetfile)
subsets = label_info.apply(tuple, axis=1).tolist()
print(f'{len(subsets)} datasets : {subsets[0]}')

# Pickled VHDB object exposing .hosts and .viruses.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
vhdbfile = '../inputs/VHDB_25_1_2019.p'
with Path(vhdbfile).open('rb') as f:
    V_H = pickle.load(f)
hosts, viruses = V_H.hosts, V_H.viruses
print (f'{len(viruses)} viruses and {len(hosts)} hosts')

# Announce where the results are expected to be written
print (f'Results will be saved in: ../results/{out_file}_results.csv')



24 datasets : ('Bacteroidetes', 'phylum', 'Bacteria', 'kingdom', 'DNA', 'Siphoviridae')
9199 viruses and 3006 hosts
Results will be saved in: ../results/Bact_holdout_results.csv


### Set up the different features to be tested and corresponding lists:
1. Filepaths and extensions
2. k-mer lengths
3. Lookup for symbols used in sequences

In [23]:
baltimore = 'all' # default

# Everything below is in the same order as the features list:
# entry i of each list belongs to features[i].
features = ['DNA','AA','PC','Domains']

# Filepaths and file extensions for the fasta, faa and domain files
filepaths = ['../data/fasta','../data/faa','../data/faa','../data/pfs']
file_exts = ['fasta','fasta','fasta','pfs']

# A list of kmers for each feature set to be tested
kmer_lists = [
              [2,6,9], # dna 
              [3,4], # aa
              [5,6] ,    #pc
              [0]] #domains


# Symbol dictionaries: map each sequence symbol to an integer code.
# 'mod' equals the number of distinct codes in the dictionary
# (presumably the base used for k-mer indexing in features_predict - TODO confirm).
na_dict = {'mod':4,'a':0,'c':1,'g':2,'t':3}
aa_dict = {'mod':20 ,'A':0,'C':1,'D':2,'E':3,'F':4,'G':5,'H':6,'I':7,'K':8,'L':9,'M':10,'N':11,
              'P':12,'Q':13,'R':14,'S':15,'T':16,'V':17,'W':18,'Y':19}
# PC groups the 20 amino-acid letters into 7 codes
pc_dict = {'mod':7, 'C':0,
            'A':1,'G':1,'V':1,
           'I':2,'L':2,'F':2,'P':2,
           'M':3,'S':3,'T':3,'Y':3,
           'H':4,'N':4,'Q':4,'W':4,
           'R':5,'K':5,
           'D':6,'E':6}
# Domains use no symbol lookup, hence the empty dictionary
symbol_dicts = [ na_dict,aa_dict,pc_dict,{}]

# The parallel lists are indexed together, so they must all have one entry per feature
assert len(filepaths) == len(file_exts) == len(kmer_lists) == len(symbol_dicts) == len(features), \
    'feature configuration lists must all have the same length'

classifiers = ['SVM_lin']


### Alter get_class_lists for holdout classifier. 
Training data - every virus except those in the holdout virus group
Test data - the viruses in the holdout virus group

In [4]:
def get_class_lists_ho(subset,viruses,hosts):
    """Build balanced training/test datasets for the holdout classifier.

    Viruses whose 'family' equals the holdout group go to the test set, all
    others to the training set. Within each set a virus is labelled `label`
    if any of its hosts inside the pool carries that label, otherwise 'Other';
    viruses with no host in the pool are skipped.

    Parameters
    ----------
    subset : 6-tuple (label, label_tax, pool, pool_tax, balt, v_holdout).
        NOTE(review): `balt` is unpacked but not used here - the class filter
        is the fixed string 'NA'.
    viruses : dict of virus id -> record with 'class', 'family', 'hosts', 'refseqs'.
    hosts : dict of host id -> record indexed by `label_tax` and `pool_tax`.

    Returns
    -------
    (datasets, ntrain, ntest) where datasets is
    {'training': {virus: {'label', 'refseqs'}}, 'test': {...}} and the two
    counts are the sizes of those dictionaries.
    """
    label, label_tax, pool, pool_tax, balt, v_holdout = subset

    # Substring matched against each virus class ('NA' occurs in both 'DNA' and 'RNA')
    class_filter = 'NA'
    # Upper bound on the number of viruses drawn per class in each split
    size_caps = {'training': 400, 'test': 50}
    # Per split: (viruses carrying the label, remaining viruses of the pool)
    candidates = {'training': ([], []), 'test': ([], [])}

    for virus_id, record in viruses.items():
        if class_filter not in record['class']:
            continue
        split_name = 'test' if record['family'] == v_holdout else 'training'
        pool_labels = [hosts[h][label_tax] for h in record['hosts']
                       if hosts[h][pool_tax] == pool]
        if not pool_labels:
            continue
        with_label, without_label = candidates[split_name]
        if label in pool_labels:
            with_label.append((virus_id, label))
        else:
            without_label.append((virus_id, 'Other'))

    datasets = {}
    for split_name, (with_label, without_label) in candidates.items():
        # Same number from both classes: the smaller class size, capped per split
        per_class = min(len(with_label), len(without_label), size_caps[split_name])
        chosen = random.sample(with_label, per_class) + random.sample(without_label, per_class)
        # Mix the two classes together
        random.shuffle(chosen)
        datasets[split_name] = {
            virus_id: {'label': tag, 'refseqs': viruses[virus_id]['refseqs']}
            for virus_id, tag in chosen
        }

    return datasets, len(datasets['training']), len(datasets['test'])

def get_class_lists(subset,viruses,hosts):
    """Build balanced training/test datasets from a random split of all viruses.

    A virus is labelled `label` if any of its hosts inside the pool carries
    that label, otherwise 'Other'; viruses with no host in the pool are
    skipped. Equal numbers (at most 400) are sampled from each class, and every
    sampled virus is then sent to the training set with probability 0.75,
    otherwise to the test set. The global `random` generator is re-seeded with
    10 before sampling, so repeated calls on the same input give the same split.

    Parameters
    ----------
    subset : 6-tuple (label, label_tax, pool, pool_tax, baltimore, v_ho).
        `baltimore` is matched as a substring of each virus class; 'all' is
        translated to 'NA'. `v_ho` is not used by this function.
    viruses : dict of virus id -> record with 'class', 'hosts', 'refseqs'.
    hosts : dict of host id -> record indexed by `label_tax` and `pool_tax`.

    Returns
    -------
    (datadicts, ntrain, ntest) where datadicts is
    {'training': {virus: {'label', 'refseqs'}}, 'test': {...}}.
    """
    label, label_tax, pool, pool_tax, baltimore, v_ho = subset

    # 'NA' is a substring of both 'DNA' and 'RNA', so it selects every such class
    class_filter = 'NA' if baltimore == 'all' else baltimore

    with_label, without_label = [], []
    for virus_id, record in viruses.items():
        if class_filter not in record['class']:
            continue
        pool_labels = [hosts[h][label_tax] for h in record['hosts']
                       if hosts[h][pool_tax] == pool]
        if not pool_labels:
            continue
        if label in pool_labels:
            with_label.append((virus_id, label))
        else:
            without_label.append((virus_id, 'Other'))

    # Same number from each class: size of the smaller class, restricted to 400
    per_class = min(len(with_label), len(without_label), 400)
    random.seed(10)
    sampled = [random.sample(group, per_class) for group in (with_label, without_label)]

    # Send each sampled virus to training (75% chance) or test
    train_fraction = 0.75
    train_items, test_items = [], []
    for group in sampled:
        for item in group:
            if random.random() < train_fraction:
                train_items.append(item)
            else:
                test_items.append(item)

    # Shuffle each split, then convert it to a dataset dictionary
    datadicts = {}
    for split_name, items in (('training', train_items), ('test', test_items)):
        random.shuffle(items)
        datadicts[split_name] = {
            virus_id: {'label': tag, 'refseqs': viruses[virus_id]['refseqs']}
            for virus_id, tag in items
        }

    return (datadicts, len(datadicts['training']), len(datadicts['test']))
     


def results2CSV(results,subset, csvfile):
    """Append one result row to a CSV file, writing the header when the file is new or empty.

    Parameters
    ----------
    results : dict
        Metrics for one run. It is updated in place with the dataset
        description taken from `subset` before being written. Keys missing
        from the column list are written as empty cells; keys that are not in
        the column list make csv.DictWriter raise ValueError.
    subset : 7-tuple
        (label, label_tax, pool, pool_tax, balt, v_holdout, t_g).
    csvfile : str or path-like
        Destination file. It is created if it does not exist.
    """
    (label,label_tax,pool,pool_tax,balt,v_holdout,t_g) = subset
    #results = { k : round(v, 3) for k,v in results.items()}
    results.update({
        'positive label': label,
        'label tax group': label_tax,
        'pool label': pool,
        'pool tax group': pool_tax,
        'Baltimore': balt,
        'virus holdout group': v_holdout,
        'training group': t_g,
    })
    fieldnames = ['positive label','label tax group','pool label','pool tax group',
                          'Baltimore','virus holdout group','training group', 'N in class' , 
                          'Features','k','AUC' ]

    # A header is needed for a brand-new file and also for an existing but empty one
    needs_header = not os.path.isfile(csvfile) or os.path.getsize(csvfile) == 0

    # newline='' lets the csv module control line endings (avoids blank rows on Windows);
    # a separate handle name keeps the `csvfile` path argument intact.
    with open(csvfile, 'a', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(results)

In [31]:
def test_dataset(subset,datasets,n,results_file): 
    """Train and test every configured feature set and k on one dataset, logging each result.

    Walks the module-level parallel lists `features`, `kmer_lists`,
    `filepaths`, `symbol_dicts` and `file_exts`. For each feature the sequences
    are read with fp.get_sequences; for each k the feature matrices are built
    with fp.get_feature_matrices, evaluated with fp.test_prediction, printed
    and written to `results_file` through results2CSV.

    Sequences are re-read only when the (filepath, extension) pair differs from
    the one used for the previous feature, so PC reuses the sequences loaded
    for AA. The original guard `features != 'PC'` compared the whole list to a
    string and was therefore always true.

    Parameters
    ----------
    subset : 7-tuple
        (label, label_tax, pool, pool_tax, balt, v_holdout, t_g); forwarded
        unchanged to results2CSV.
    datasets : object passed to fp.get_sequences and fp.get_feature_matrices
        (presumably the dictionary built by get_class_lists / get_class_lists_ho).
    n : value stored in the 'N in class' column of every row.
    results_file : path of the CSV file the rows are written to.
    """
    (label,label_tax,pool,pool_tax,balt,v_holdout,t_g) = subset
    print ('********   ', t_g , label , v_holdout)

    sequences = None
    loaded_source = None  # (filepath, ext) that `sequences` was read from
    for index, feature in enumerate(features):
        kmers = kmer_lists[index]
        filepath = filepaths[index]
        symbol_dict = symbol_dicts[index]
        ext = file_exts[index]

        # Only hit the disk when this feature reads from a different source
        source = (filepath, ext)
        if source != loaded_source:
            sequences = fp.get_sequences(filepath,datasets,ext)
            loaded_source = source

        for k in kmers:
            # get feature matrix , test prediction and write results to CSV file
            print ('-------  ',feature,k)
            x_train,x_test,y_train,y_test = fp.get_feature_matrices(sequences,datasets,label,k,symbol_dict)
            print (f'------------ X train: {np.shape(x_train)},X test: {np.shape(x_test)}, y train: {np.shape(y_train)},y_test: {np.shape(y_test)}')
            results = fp.test_prediction(x_train,x_test,y_train,y_test)
            results.update ({'N in class': n, 'Features':feature, 'k':k})
            print(results)
            results2CSV (results,subset, results_file)
            

## Main loop
#### For each dataset, run inner loop to train and test  all feature sets  as both normal and holdout classifiers. 

In [33]:
# Run both classifiers for every dataset listed in the input file.
# NOTE(review): rows are written to 'temp.txt', not to the
# '../results/<input stem>_results.csv' path announced when the inputs were loaded - confirm which is intended.
for subset in subsets:
    print('*****',subset)

    # Holdout classifier: test set is the holdout virus family, training set is everything else
    datasets_ho, n1, n2 = get_class_lists_ho(subset, viruses, hosts)
    test_dataset(tuple(subset) + ('holdout',), datasets_ho, n1, 'temp.txt')

    # Normal classifier: random training/test split over all viruses
    datasets_all, ntrain, n_ho = get_class_lists(subset, viruses, hosts)
    test_dataset(tuple(subset) + ('all',), datasets_all, ntrain, 'temp.txt')
       

***** ('Bacteroidetes', 'phylum', 'Bacteria', 'kingdom', 'DNA', 'Siphoviridae')
 ho training v 50 , test v 44
********    holdout Bacteroidetes Siphoviridae
viruses number of missing files 0
-------   DNA 2


KeyboardInterrupt: 