## Train and test a classifier for each of the feature sets  for each dataset 

In [6]:
%reload_ext autoreload
%autoreload 2
import os
import pandas as pd
import pickle
import random
import numpy as np
import csv
import timeit
from collections import defaultdict, Counter
from pathlib import Path
from Bio import SeqIO
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score
import sys
lib_dir = '../mylibs'
if lib_dir not in sys.path:
    sys.path.append(lib_dir)
%matplotlib inline    
    
import vhdb as vhdb
from featureset import FeatureSet
from dataset import DataSet

In [7]:
def get_confusion( y_test,y_pred):
    FN=0
    FP=0
    TP=0
    TN=0
    #missed = 0
    for y,yp in zip(y_test,y_pred):
        #print (y_true,yp,yp2,yprob)
        if y == True and yp == False:
            FN += 1
        elif y == False and yp == True:
            FP +=1
        elif y == True and yp == True:
            TP +=1
        else:
            TN +=1
    spec = round(TN/(TN+FP),2) if (TN + TP) else 'NA'
    sens = round(TP/(TP+FN),2) if (TP+FN) else 'NA'
    fdr  = round(FP/(TP+FP),2) if (TP+FP) else 'NA'
    res  = { 'Acc':round((TP+TN)/(len(y_test)),2),'Spec':spec, 'Sens':sens,\
           'prec':fdr,'TP':TP, 'TN':TN, 'FP':FP, 'FN':FN}
    return res


def test_prediction(fs,y_trn,y_tst):
    clf = make_pipeline(StandardScaler(),SVC(kernel='linear',probability=True) )
    clf.fit(fs.X_trn, y_trn)
    y_pred = clf.predict(fs.X_tst)
    y_pred_probs= clf.predict_proba(fs.X_tst)[:,1]
    AUC =round( roc_auc_score(y_tst, y_pred_probs),3)  
    confusion = get_confusion(y_tst,y_pred)
    confusion.update({'AUC':AUC})
    return confusion
       
        

### The input dataset files in ../inputs 
1. Bacteria_DNA.csv 
2. Euk_subsets.csv
3. Euk_all.csv
4. Euk_RNA.csv

### Load the input files to train and test
1. input/ datasets  
2. VHDB 

In [8]:
# Input file  
subsetfile = '../inputs/Bacteria_DNA.csv'
label_info = pd.read_csv(subsetfile)
subsets = label_info.apply(tuple, axis=1).tolist()
print(f'{len(subsets)} datasets : {subsets[0]}')
for s in subsets:
    print(s)
vhdbfile = '../inputs/VHDB_25_1_2019.p'
with open(vhdbfile, 'rb') as f:
    V_H = pickle.load( f)
hosts = V_H.hosts
viruses = V_H.viruses
print (f'{len(viruses)} viruses and {len(hosts)} hosts')


# Output file for the results
results_file = f'../results/{Path(subsetfile).stem}_results.csv '
print (f'Results will be saved in: {results_file}')



67 datasets : ('Alteromonadales', 'order', 'Gammaproteobacteria', 'class', 'DNA')
('Alteromonadales', 'order', 'Gammaproteobacteria', 'class', 'DNA')
('Salmonella enterica', 'species', 'Enterobacteriaceae', 'family', 'DNA')
('Salmonella', 'genus', 'Enterobacteriaceae', 'family', 'DNA')
('Escherichia coli', 'species', 'Enterobacteriaceae', 'family', 'DNA')
('Escherichia', 'genus', 'Enterobacteriaceae', 'family', 'DNA')
('Klebsiella pneumoniae', 'species', 'Enterobacteriaceae', 'family', 'DNA')
('Klebsiella', 'genus', 'Enterobacteriaceae', 'family', 'DNA')
('Enterobacteriaceae', 'family', 'Enterobacterales', 'order', 'DNA')
('Erwiniaceae', 'family', 'Enterobacterales', 'order', 'DNA')
('Enterobacterales', 'order', 'Gammaproteobacteria', 'class', 'DNA')
('Acinetobacter', 'genus', 'Pseudomonadales', 'order', 'DNA')
('Moraxellaceae', 'family', 'Pseudomonadales', 'order', 'DNA')
('Pseudomonas', 'genus', 'Pseudomonadales', 'order', 'DNA')
('Pseudomonadales', 'order', 'Gammaproteobacteria', 'c

In [9]:

features = ['DNA','AA','PC','Domains']
kmer_lists = [[1,2,3,4,5,6,7,8,9], # dna 
              [1,2,3,4],           # aa
              [1,2,3,4,5,6] ,      #pc
              [0]]  
feature_sets = [f'{f}_{k}' for i,f in enumerate(features) for k in kmer_lists[i] ]


In [11]:
all_results =[]
for subset in subsets:
    print  (subset)
    data = DataSet(subset,V_H,feature_sets=feature_sets)
    (label,label_tax,pool,pool_tax,baltimore) = subset
    print  (label,label_tax,pool,pool_tax,baltimore)
    print((data.ds.groupby(['y','trn/tst']).count(),'\n'))
    mask = data.ds['trn/tst']=='train'
    y_train = np.asarray(data.ds[mask]['y'],dtype=int)
    y_test = np.asarray(data.ds[~mask
                               ]['y'],dtype=int)
    for fs in data.fs:
        results = test_prediction(fs,y_train,y_test)
        results.update ({'N': len(data.ds), 'features':fs.feature, 'k':fs.k})
        print(results)
        data.results2CSV (results,subset, results_file)
        all_results.append(results)
    results_df = pd.DataFrame(all_results)
   

('Alteromonadales', 'order', 'Gammaproteobacteria', 'class', 'DNA')
train test split 40 14
     virus  y      refseqs trn/tst
0   754052  1  [NC_020849]    test
1  1654919  1  [NC_029094]   train
2  1445859  1  [NC_023594]   train
3  1458863  1  [NC_025466]    test
4  1874540  1  [NC_031908]   train
Adding feature sets  ['DNA_1', 'DNA_2', 'DNA_3', 'DNA_4', 'DNA_5', 'DNA_6', 'DNA_7', 'DNA_8', 'DNA_9', 'AA_1', 'AA_2', 'AA_3', 'AA_4', 'PC_1', 'PC_2', 'PC_3', 'PC_4', 'PC_5', 'PC_6', 'Domains_0', <featureset.FeatureSet object at 0x7f5cc99b3dd8>, <featureset.FeatureSet object at 0x7f5cc99b3c50>, <featureset.FeatureSet object at 0x7f5cc99b3e10>, <featureset.FeatureSet object at 0x7f5cd2fd0b70>, <featureset.FeatureSet object at 0x7f5cc99b50f0>, <featureset.FeatureSet object at 0x7f5cc99d7cf8>, <featureset.FeatureSet object at 0x7f5cc998ef98>, <featureset.FeatureSet object at 0x7f5cc998eef0>, <featureset.FeatureSet object at 0x7f5cc998eeb8>, <featureset.FeatureSet object at 0x7f5cc998ef60>, <fe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['trn/tst'] = train.apply(lambda row: 'train', axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['trn/tst'] = test.apply(lambda row: 'test', axis = 1)


adding fs DNA_2
FS get sequences for length ds 54
FS get feature names DNA_2 54
FS get X len fs 16
54 54 16 (54, 16)
adding fs DNA_3
FS get sequences for length ds 54
FS get feature names DNA_3 54
FS get X len fs 64
54 54 64 (54, 64)
adding fs DNA_4
FS get sequences for length ds 54
FS get feature names DNA_4 54
FS get X len fs 256
54 54 256 (54, 256)
adding fs DNA_5
FS get sequences for length ds 54
FS get feature names DNA_5 54
FS get X len fs 1024
54 54 1024 (54, 1024)
adding fs DNA_6
FS get sequences for length ds 54
FS get feature names DNA_6 54
FS get X len fs 4096
54 54 4096 (54, 4096)
adding fs DNA_7
FS get sequences for length ds 54
FS get feature names DNA_7 54
FS get X len fs 16384
54 54 16384 (54, 16384)
adding fs DNA_8
FS get sequences for length ds 54
FS get feature names DNA_8 54
FS get X len fs 65536
54 54 65536 (54, 65536)
adding fs DNA_9
FS get sequences for length ds 54
FS get feature names DNA_9 54
FS get X len fs 262144
54 54 262144 (54, 262144)
adding fs AA_1
FS g

AttributeError: 'FeatureSet' object has no attribute 'split'