In [21]:
%reload_ext autoreload
%autoreload 2
import os
import pandas as pd
import pickle
import random
import numpy as np
import timeit
from collections import defaultdict, Counter
from pathlib import Path
from Bio import SeqIO
import sys
lib_dir = '../mylibs'
if lib_dir not in sys.path:
    sys.path.append(lib_dir)
    
import vhdb as vhdb
import features_predict as fp

### The input dataset files in ../inputs 
1. Bacteria_DNA.csv 
2. Euk_subsets.csv
3. Euk_all.csv
4. Euk_RNA.csv

### Load the input files to train and test
1. input/ datasets  
2. VHDB 

In [22]:
# Input file  
subsetfile = '../inputs/Bacteria_DNA.csv'
label_info = pd.read_csv(subsetfile)
subsets = label_info.apply(tuple, axis=1).tolist()
print(f'{len(subsets)} datasets : {subsets[0]}')

vhdbfile = '../inputs/VHDB_25_1_2019.p'
with open(vhdbfile, 'rb') as f:
    V_H = pickle.load( f)
hosts = V_H.hosts
viruses = V_H.viruses
print (f'{len(viruses)} viruses and {len(hosts)} hosts')


# Output file for the results
results_file = f'../results/{Path(subsetfile).stem}_results.csv '
print (f'Results will be saved in: {results_file}')



67 datasets : ('Alteromonadales', 'order', 'Gammaproteobacteria', 'class', 'DNA')
9199 viruses and 3006 hosts
Results will be saved in: ../results/Bacteria_DNA_results.csv 


### Set up the different features to be tested and corresponding lists:
1. Filepaths and extensions
2. k-mer lengths
3. Lookup for symbols used in sequences

In [24]:
baltimore = 'all'

# Everthing in the same order as the features list

feature_list = ['DNA','AA','PC','Domains']
# filepath for the fasta, faa and domain files
filepaths = [   '/home4/youn01f/Desktop/workspace/newData/fasta',
                '/home4/youn01f/Desktop/workspace/newData/faa',
                 '/home4/youn01f/Desktop/workspace/newData/faa',
                '/home4/youn01f/Desktop/workspace/newData/pfs']
file_exts = ['fasta','fasta','fasta','pfs']#,'fasta',fasta]

# A list of kmers for each feature set to be tested
kmer_lists = [[1,2,3,4,5,6,7,8,9], # dna 
              [1,2,3,4],           # aa
              [1,2,3,4,5,6] ,      #pc
             [0]]                  #domains

# symbol dictionaries 
na_dict = {'mod':4,'a':0,'c':1,'g':2,'t':3}
aa_dict = {'mod':20 ,'A':0,'C':1,'D':2,'E':3,'F':4,'G':5,'H':6,'I':7,'K':8,'L':9,'M':10,'N':11,
              'P':12,'Q':13,'R':14,'S':15,'T':16,'V':17,'W':18,'Y':19}
pc_dict = {'mod':7, 'C':0,
            'A':1,'G':1,'V':1,
           'I':2,'L':2,'F':2,'P':2,
           'M':3,'S':3,'T':3,'Y':3,
           'H':4,'N':4,'Q':4,'W':4,
           'R':5,'K':5,
           'D':6,'E':6}
symbol_dicts = [ na_dict,aa_dict,pc_dict, {}]

classifiers = ['SVM_lin']

In [25]:
features = ['DNA','AA','PC','Domains']
for subset in subsets[-1:]:
    class_data,n = fp.get_class_lists(subset,viruses,hosts)
    datasets = fp.split_data(viruses,class_data,n)
    (label,label_tax,pool,pool_tax,baltimore) = subset
    print  (label,label_tax,pool,pool_tax,baltimore)
    print('len ds',len(datasets['training']))
    for feature in features:
        
        index = feature_list.index(feature)
        kmers = kmer_lists[index]
        filepath = filepaths[index]
        symbol_dict = symbol_dicts[index]
        ext = file_exts[index]
        
        if features != 'PC': #PC same sequence as for AA
            filepath = filepaths[index]
            print ( f'getting {feature} sequences from {filepath}')
            sequences = fp.get_sequences(filepath,datasets,ext)
            print (len(sequences['test']))
        for k in kmers: 
            x_train,x_test,y_train,y_test = fp.get_feature_matrices(sequences,datasets,label,k,symbol_dict)
            print (f'X train: {np.shape(x_train)},X test: {np.shape(x_test)}, y train: {np.shape(y_train)},y_test: {np.shape(y_test)}')
            results = fp.test_prediction(x_train,x_test,y_train,y_test)
            results.update ({'N in class': n, 'Features':feature, 'k':k})
            print(results)
            fp.results2CSV (results,subset, results_file)

 size of training 74 and test set   20
Bacteroidetes phylum Bacteria superkingdom DNA
len ds 74
getting DNA sequences from /home4/youn01f/Desktop/workspace/newData/fasta
viruses number of missing files 0
20
X_training:Extacting  features  of length  1  from   74 sequences
X_test: Extacting  features  of length  1  from   20 sequences
X train: (74, 4),X test: (20, 4), y train: (74,),y_test: (20,)
{'AUC': 0.958, 'N in class': 47, 'Features': 'DNA', 'k': 1}
getting AA sequences from /home4/youn01f/Desktop/workspace/newData/faa
viruses number of missing files 0
20
X_training:Extacting  features  of length  1  from   74 sequences
X_test: Extacting  features  of length  1  from   20 sequences
X train: (74, 20),X test: (20, 20), y train: (74,),y_test: (20,)
{'AUC': 1.0, 'N in class': 47, 'Features': 'AA', 'k': 1}
getting PC sequences from /home4/youn01f/Desktop/workspace/newData/faa
viruses number of missing files 0
20
X_training:Extacting  features  of length  1  from   74 sequences
X_test: 