/
test_params.py
286 lines (230 loc) · 12.1 KB
/
test_params.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
import argparse
import platform
from Bio.SubsMat import MatrixInfo
from Bio import SeqIO
from sequence import NeuriteSequence
class DomainArgumentValidator():
'''
Validates that user-provided arguments relative to domain analysis are
sound and of valid ranges.
'''
def __init__(self, args):
self.args = args
self.check_args()
def test_laplace(self):
if 0 <= self.args['l'] <= 1.0:
return True
else:
raise IOError('Laplace correction must be a fraction [error]')
def test_window_size(self):
if self.args['win'] > 0:
return True
else:
raise IOError('Sliding window must be positive [error]')
def test_max_gaps(self):
if self.args['max_g'] > 0:
return True
else:
raise IOError('Max #/gaps must be positive [error]')
def check_args(self):
return all([self.test_laplace(), self.test_max_gaps(),
self.test_window_size()])
# Validates user-provided command-line arguments relative to sequence alignment
class AlignmentArgumentValidator():
def __init__(self, args):
self.args = args
self.check_python()
self.check_args()
# Check python 3.0+ is available
def check_python(self):
if not platform.python_version_tuple()[0:2] >= ('3', '0'): # >= 3.0
raise RuntimeError('Python 3.0+ recommended')
# Checks user-provided arguments are valid
def check_args(self):
return all([self.test_num_workers(), self.test_mutual_matrices(),
self.test_valid_matrix(), self.test_threshold()])
# Test either a custom matrix or in-built matrix is selected
def test_mutual_matrices(self):
if self.args['custom'] and self.args['matrix']:
raise IOError('Either a custom or in-built matrix must be selected')
else:
return True
# Test a valid substitution matrix is selected
def test_valid_matrix(self):
all_matrices = set(MatrixInfo.available_matrices) # al sub. matrices
if self.args['matrix'] in all_matrices or self.args['custom']:
return True
else:
err = 'An in-built (URL below) or custom matrix is required.\n'+\
'http://biopython.org/DIST/docs/api/Bio.SubsMat.MatrixInfo-module.html'
raise IOError(err)
# Test a valid number of workers are provided
def test_num_workers(self):
if self.args['n'] >= 1:
return True
else:
raise IOError('>= 1 worker processes must be provided')
# Test that a valid consensus threshold is provided
def test_threshold(self):
if self.args['thresh'] >= 0:
return True
else:
raise IOError('Consensus threshold (thresh) must be positive')
# Helper-class to parse input arguments
class AlignmentCommandParser():
def __init__(self):
desc = 'Pairwise and multiple sequence alignment (MSA) of neurite sequences.'
u='%(prog)s [options]' # command-line usage
self.parser = argparse.ArgumentParser(description=desc, add_help=False, usage=u)
self._init_params()
# Create parameters to be used throughout the application
def _init_params(self):
param_aln = self.parser.add_argument_group('Required alignment parameters')
param_local = self.parser.add_argument_group('Pairwise (local) parameters')
param_msa = self.parser.add_argument_group('MSA parameters')
param_opts = self.parser.add_argument_group('Optional parameters')
# Specify required arguments
param_aln.add_argument('-f', metavar='FILE',
help='Input FASTA file [na]')
param_aln.add_argument('-matrix', metavar='STR', default=None,
help='Matrix; see Biopython MatrixInfo [na]')
# Local-alignment specific parameters
param_local.add_argument('-gap', metavar='INT', default=-1, type=int,
help='Gap extension penalty [-8]')
param_local.add_argument('-gapopen', metavar='INT', default=0, type=int,
help='Gap open penalty [0]')
# MSA specific parameters
param_msa.add_argument('-thresh', metavar='FLOAT', default=0.7, type=float,
help='Consensus threshold [0.7]')
param_msa.add_argument('-type', metavar='STR', default='percent',
choices=['percent', 'sqrt'],
help='Threshold type {percent, sqrt} [percent]')
param_msa.add_argument('-build', metavar='FILE', default='alns.xml',
type=str, help='Output file of consensus & alignments [./alns.xml]')
param_opts.add_argument('--subsample', metavar='FLOAT', default=1, type=float,
help='Subsample of data, taking the first n sequences. Value treated as proportion of total if (0,1] and explicit number for [2,N].')
param_opts.add_argument('--subsample_start', metavar='FLOAT', default=0, type=float,
help='If taking a subsample, subsample_start determines from which sequence to start the subset (as proportion or explicit number, indexed from 0). If size and start of subsample lead to an exhaustion of the available sequences an error will be be displayed.')
param_opts.add_argument('--random_subset', action='store_const', const=True, default=False,
help='Subset is a random sample of data (shuffle sequences before taking subset).')
param_opts.add_argument('--random_order', action='store_const', const=True, default=False,
help='Order of sequences (in subsample of data if using --subset) is shuffled.')
# General, optional arguments
param_opts.add_argument('-f2', metavar='FILE', default=None,
help='Second input FASTA file [na]')
param_opts.add_argument('-custom', metavar='FILE', default=None,
help='Custom substitution matrix [na]')
param_opts.add_argument('-nodeTypes', metavar='FILE', default=None,
help='Node Type Specifications [na]')
param_opts.add_argument('-n', metavar='INT', default=2, type=int,
help='Number of worker processes [2]')
param_opts.add_argument('-o', metavar='FILE', default='./scores.tab',
help='File to write/append output [./scores.tab]')
param_opts.add_argument('-a', metavar='FILE', default=None,
help='File to write/append alignments [na]')
param_opts.add_argument('-h','--help', action='help',
help='Show this help screen and exit')
# Get the arguments for each parameter
def parse_args(self):
return vars(self.parser.parse_args()) # parse arguments
class DomainCommandParser():
def __init__(self):
desc = 'Domain analysis of neurite multiple sequence alignments.'
u='%(prog)s [options]' # command-line usage
self.parser = argparse.ArgumentParser(description=desc, add_help=False, usage=u)
self._init_params()
def _init_params(self):
param_reqd = self.parser.add_argument_group('Required parameters')
param_domain = self.parser.add_argument_group('Domain parameters')
param_opts = self.parser.add_argument_group('Optional parameters')
# Domain-specific parameters
param_reqd.add_argument('-query', metavar='BUILD',
help='Set query consensus build [na]')
param_reqd.add_argument('-baseline', metavar='BUILD',
help='Set baseline consensus build [na]')
param_domain.add_argument('-max-g', metavar='INT', default=3, type=int,
help='Maximum #/gaps in domain [3]')
param_domain.add_argument('-win', metavar='INT', default=7, type=int,
help='Sliding window length; characters [7]')
param_domain.add_argument('-l', metavar='FLOAT', default=0.4, type=float,
help='LaPlace correction cutoff [0.4]')
param_domain.add_argument('-o', metavar='FILE', default='./doms.txt', type=str,
help='Output file for saving identified domains [./doms.txt]')
param_domain.add_argument('-p', metavar='FLOAT', default=0.05, type=float,
help='Hypergeometric p-value cutoff [0.05]')
param_domain.add_argument('--ipf', action='store_const', const=True,
default=False, help = 'IPF normalize [false]')
param_domain.add_argument('--enumerate', action='store_const', const=True,
default=False, help = 'Use many window & gap cutoffs [false]')
param_domain.add_argument('--strip', action='store_const', const=True,
default=False, help = 'Strip gaps within sliding window [false]')
param_opts.add_argument('-h','--help', action='help',
help='Show this help screen and exit')
def parse_args(self):
return vars(self.parser.parse_args()) # parse arguments
# An InputStateWrapper wraps data used as input, eg. FASTA file,
# custom matrix, user-provided arguments, etc.
class InputWrapperState():
def __init__(self, args):
self.args = args # reference user-provided arguments
self.subsmat = None # references data for substitution matrix
self.fname = args['f'] # input filename
self.fname2 = args['f2'] # input filename
self.alignment_file = args['a']
# Get arguments
def get_args(self):
return self.args
# Get arguments relative to penalties
def get_penalties(self):
cost_ids = ('gap','gapopen') # all possible costs, might in the future include a separate gap open and gap extension cost
return {k: self.args[k] for k in cost_ids} # get costs per penality
# Get the substitution matrix which will be used
def get_submatrix(self):
return self.subsmat
# Trivial function to parse a fasta file
def parse_fasta(self, fname):
parsed_fasta = list(SeqIO.parse(fname, 'fasta')) # easy indexing
queries = [] # references list of parsed sequences
for i in parsed_fasta: # cast as a neuronal sequence; easy modeling.
s = NeuriteSequence(seq=str(i.seq), name=i.name)
queries.append(s)
print(str(len(queries)) + ' queries parsed [OK]')
return queries # return set of fasta entries
# Set the desired matrix the user wishes to add
def assign_matrix(self):
if not self.args['matrix']: # if no in-built matrix, parse custom matrix
self.subsmat = self.__parse_custom_matrix() # custom matrix
else:
self.subsmat = self.__parse_inbuilt_matrix()
# Get the user-provided in-built matrix
def __parse_inbuilt_matrix(self):
return getattr(MatrixInfo, self.args['matrix']) # get substitution matrix
# Function to parse custom scoring matrix.
def __parse_custom_matrix(self):
"""
A schema is organized such that you have 3 columns: A, B, C.
Columns A and B represents the query and target base, respectively.
Column C is the real-number assigned to the mismatch given A and B.
This file must be tab-delimited.
Example:
A T -5
A C 2
A G -2
A A 8
T A 2
...
etc.
"""
submat = {} # the substitution matrix
for line in open(self.args['custom']): # parse custom matrix file
line = line.strip()
if len(line) == 0: # if an empty line, terminate analysis
break
else: # pertains to parsing of the custom substition matrix
line = line.split('\t')
if len(line) < 3:
raise IndexError('Custom matrix must have 3x columns')
else: # set the Query (A) and Target (B) scores
a, b, score = line[0], line[1], float(line[2])
submat[(a, b)] = score
return submat