/
LineageEngine.java
442 lines (404 loc) · 17.2 KB
/
LineageEngine.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
/*
* Program LICHeE for multi-sample cancer phylogeny reconstruction
* by Victoria Popic (viq@stanford.edu) 2014
*
* MIT License
*
* Copyright (c) 2014 Victoria Popic.
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package lineage;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.logging.ConsoleHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import util.LineageDisplayConfig;
import util.Visualizer;
import lineage.AAFClusterer.Cluster;
import lineage.AAFClusterer.ClusteringAlgorithms;
import lineage.Parameters.Format;
import lineage.PHYTree;
/**
* Main cell lineage reconstruction pipeline
* @autor viq
*/
public class LineageEngine {
protected static final Logger logger = Logger.getLogger("lineage.engine");
/**
* The main pipeline for reconstructing the cell lineage trees
*/
public static void buildLineage(Args args) {
// 1. load SNV data
SNVDataStore db = new SNVDataStore(args.inputFileName, args.clustersFileName, args.normalSampleId);
// 2. get the SNVs partitioned by group tag and create the appropriate SNV group objects
HashMap<String, ArrayList<SNVEntry>> snvsByTag = db.getSomaticGroups();
ArrayList<SNVGroup> groups = new ArrayList<SNVGroup>();
for(String groupTag : snvsByTag.keySet()) {
groups.add(new SNVGroup(groupTag, snvsByTag.get(groupTag), db.isRobustGroup(groupTag)));
}
if(groups.size() == 0) {
logger.warning("All SNV groups have been filtered out.");
return;
}
// 3. cluster SNVs in each group
AAFClusterer clusterer = new AAFClusterer();
for(SNVGroup group : groups) {
if(args.clustersFileName == null) {
Cluster[] clusters = clusterer.clusterSubPopulations(group, ClusteringAlgorithms.EM, 1);
logger.fine("Clustering results for group: " + group.getTag());
for(Cluster c : clusters) {
logger.fine(c.toString());
}
group.setSubPopulations(clusters);
} else {
ArrayList<Cluster> groupClusters = db.getClusters().get(group.getTag());
group.subPopulations = new Cluster[groupClusters.size()];
group.subPopulations = groupClusters.toArray(group.subPopulations);
}
}
// 4. construct the constraint network
PHYNetwork constrNetwork = new PHYNetwork(groups, db.getNumSamples());
logger.fine(constrNetwork.toString());
// 5. find all the lineage trees that pass the VAF constraints
ArrayList<PHYTree> spanningTrees = constrNetwork.getLineageTrees();
logger.info("Found " + spanningTrees.size() + " valid tree(s)");
if(spanningTrees.size() == 0) {
logger.info("Adjusting the network...");
// if no valid trees were found, fix the network
// remove group nodes that are not robust, complete edges
int delta = 0;
do {
int numNodes = constrNetwork.numNodes;
constrNetwork = constrNetwork.fixNetwork();
spanningTrees = constrNetwork.getLineageTrees();
delta = numNodes - constrNetwork.numNodes;
} while((delta != 0) && (spanningTrees.size() <= 0));
if(spanningTrees.size() <= 0) {
Parameters.ALL_EDGES = true;
constrNetwork = new PHYNetwork(groups, db.getNumSamples());
spanningTrees = constrNetwork.getLineageTrees();
}
logger.info("Found " + spanningTrees.size() + " valid trees after network adjustments");
}
// 6. evaluate/rank the trees
if(spanningTrees.size() > 0) {
constrNetwork.evaluateLineageTrees();
logger.fine("Top tree\nError score: " + spanningTrees.get(0).getErrorScore());
logger.fine(spanningTrees.get(0).toString());
}
// 7. result visualization
if(args.showNetwork) {
constrNetwork.displayNetwork();
}
if(spanningTrees.size() > 0) {
for(int i = 0; i < spanningTrees.size(); i++) {
if(i >= args.numShow && i > 0) break;
LineageDisplayConfig display = new LineageDisplayConfig(constrNetwork, spanningTrees.get(i), db.getSampleNames(), args.color);
if(i < args.numShow) {
Visualizer.showLineageTree(display);
}
if(i == 0 && args.outputDOTFileName != null) { // top tree only for simplicity
String parentDir = new File(args.outputDOTFileName).getParent();
writeTreeToDOTFile(display.toDOT(parentDir), db.getSampleNames(), args);
}
}
// 8. persistent storage
if(args.numSave > 0) {
writeTreesToTxtFile(constrNetwork, spanningTrees, db.getSampleNames(), args);
}
}
}
///// I/O /////
private static void writeTreesToTxtFile(PHYNetwork net, ArrayList<PHYTree> trees, ArrayList<String> sampleNames, Args args) {
String treeFileName = args.outputFileName;
try {
FileWriter fw = new FileWriter(treeFileName);
fw.write("Nodes:\n" + net.getNodesWithMembersAsString() + "\n");
for(int i = 0; i < args.numSave; i++) {
if(trees.size() > i) {
fw.write("****Tree " + i + "****\n");
String edges = trees.get(i).toString();
fw.write(edges);
fw.write("Error score: " + trees.get(i).getErrorScore()+"\n\n");
fw.write("Sample decomposition: \n");
String lineage = "";
for(int j = 0; j < sampleNames.size(); j++) {
lineage += trees.get(i).getLineage(j, sampleNames.get(j));
lineage += "\n";
}
fw.write(lineage);
fw.write("\n");
}
}
fw.write("SNV info:\n" + net.getNodeMembersOnlyAsString() + "\n");
fw.close();
} catch (IOException e) {
e.printStackTrace();
System.err.println("Failed to write to the file: " + treeFileName);
System.exit(-1);
}
}
private static void writeTreeToDOTFile(String dotTree, ArrayList<String> sampleNames, Args args) {
String treeFileName = args.outputDOTFileName;
try {
FileWriter fw = new FileWriter(treeFileName);
fw.write(dotTree);
fw.close();
} catch (IOException e) {
e.printStackTrace();
System.err.println("Failed to write to the file: " + treeFileName);
System.exit(-1);
}
}
// ---- LAUNCH ----
private static final String TREES_TXT_FILE_EXTENSION = ".trees.txt";
public static void main(String[] args) {
Options options = new Options();
// Commands
options.addOption("build", false, "Construct the sample lineage trees");
// Input/Output/Display
options.addOption("i", true, "Input file path [required]");
options.addOption("o", true, "Output file path (default: input file with suffix .trees.txt)");
options.addOption("cp", false, "Input data represents cell prevalaence (CP) values");
options.addOption("sampleProfile", false, "Input file contains the SSNV sample presence-absence profile (this will disable the default SSNV calling step)");
options.addOption("n", "normal", true, "Normal sample column id in the list of samples, 0-based (e.g 0 is the first column) [required without -sampleProfile]");
options.addOption("clustersFile", true, "SSNV clusters file path");
options.addOption("s", "save", true, "Maximum number of output trees to save (default: 1)");
options.addOption("showNetwork", "net", false, "Display the constraint network");
options.addOption("showTree", "tree", true, "Number of top-ranking trees to display (default: 0)");
options.addOption("color", false, "Enable lineage tree visualization in color mode");
options.addOption("dot", false, "Enable DOT file export of the top-scoring tree for Graphviz visualization (saved by default to: input file with suffix .dot)");
options.addOption("dotFile", true, "DOT file path");
// SSNV filtering / calling
options.addOption("maxVAFAbsent", "absent", true, "Maximum VAF to robustly consider an SSNV as absent from a sample [required without -sampleProfile]");
options.addOption("minVAFPresent", "present", true, "Minimum VAF to robustly consider an SSNV as present in a sample [required without -sampleProfile]");
options.addOption("maxVAFValid", true, "Maximum allowed VAF in a sample (default: 0.6)");
options.addOption("minProfileSupport", true, "Minimum number of robust SSNVs required for a group presence-absence profile to be labeled robust (default: 2)");
// Network Construction / Tree Search
options.addOption("minClusterSize", true, "Minimum size a cluster must have to be a considered a node in the network (default: 2)");
options.addOption("minPrivateClusterSize", true, "Minimum size a private mutation cluster must have to be a considered a node in the network (default: 1)");
options.addOption("minRobustNodeSupport", true, "Minimum number of robust SSNVs required for a node to be labeled robust during tree search: non-robust nodes can be removed from the network when no valid lineage trees are found (default: 2)");
options.addOption("maxClusterDist", true, "Maximum mean VAF difference up to which two clusters can be collapsed (default: 0.2)");
options.addOption("c", "completeNetwork", false, "Add all possible edges to the constraint network (default: private nodes are connected only to closest level parents; only nodes with no other parents are descendants of root)");
options.addOption("e", true, "VAF error margin (default: 0.1)");
options.addOption("nTreeQPCheck", true, "Number of top-ranking trees on which the QP consistency check is run, we have not seen this check fail in practice (default: 0, for best performance)");
options.addOption("v", "verbose", false, "Verbose mode");
options.addOption("h", "help", false, "Print usage");
// display order
ArrayList<Option> optionsList = new ArrayList<Option>();
optionsList.add(options.getOption("build"));
optionsList.add(options.getOption("i"));
optionsList.add(options.getOption("o"));
optionsList.add(options.getOption("cp"));
optionsList.add(options.getOption("sampleProfile"));
optionsList.add(options.getOption("n"));
optionsList.add(options.getOption("clustersFile"));
optionsList.add(options.getOption("s"));
optionsList.add(options.getOption("net"));
optionsList.add(options.getOption("tree"));
optionsList.add(options.getOption("color"));
optionsList.add(options.getOption("dot"));
optionsList.add(options.getOption("dotFile"));
optionsList.add(options.getOption("maxVAFAbsent"));
optionsList.add(options.getOption("minVAFPresent"));
optionsList.add(options.getOption("maxVAFValid"));
optionsList.add(options.getOption("minProfileSupport"));
optionsList.add(options.getOption("minClusterSize"));
optionsList.add(options.getOption("minPrivateClusterSize"));
optionsList.add(options.getOption("minRobustNodeSupport"));
optionsList.add(options.getOption("maxClusterDist"));
optionsList.add(options.getOption("c"));
optionsList.add(options.getOption("e"));
optionsList.add(options.getOption("nTreeQPCheck"));
optionsList.add(options.getOption("v"));
optionsList.add(options.getOption("h"));
CommandLineParser parser = new BasicParser();
CommandLine cmdLine = null;
HelpFormatter hf = new HelpFormatter();
hf.setOptionComparator(new OptionComarator<Option>(optionsList));
try {
cmdLine = parser.parse(options, args);
} catch (ParseException e) {
System.out.println(e.getMessage());
hf.printHelp("lichee", options);
System.exit(-1);
}
// Set-up input args
Args params = new Args();
if(cmdLine.hasOption("i")) {
params.inputFileName = cmdLine.getOptionValue("i");
} else {
System.out.println("Required parameter: input file path [-i]");
hf.printHelp("lichee", options);
System.exit(-1);
}
if(cmdLine.hasOption("o")) {
params.outputFileName = cmdLine.getOptionValue("o");
} else {
params.outputFileName = params.inputFileName + TREES_TXT_FILE_EXTENSION;
}
if(cmdLine.hasOption("dot")) {
params.outputDOTFileName = params.inputFileName + ".dot";
}
if(cmdLine.hasOption("dotFile")) {
params.outputDOTFileName = cmdLine.getOptionValue("dotFile");
}
if(cmdLine.hasOption("clustersFile")) {
params.clustersFileName = cmdLine.getOptionValue("clustersFile");
}
if(cmdLine.hasOption("sampleProfile")) {
Parameters.INPUT_FORMAT = Format.SNV_WITH_PROFILE;
}
if(cmdLine.hasOption("n")) {
params.normalSampleId = Integer.parseInt(cmdLine.getOptionValue("n"));
} else if(!cmdLine.hasOption("sampleProfile")) {
System.out.println("Required parameter: normal sample id [-n]");
hf.printHelp("lichee", options);
System.exit(-1);
}
if(cmdLine.hasOption("showTree")) {
params.numShow = Integer.parseInt(cmdLine.getOptionValue("showTree"));
}
if(cmdLine.hasOption("showNetwork")) {
params.showNetwork = true;
}
if(cmdLine.hasOption("s")) {
params.numSave = Integer.parseInt(cmdLine.getOptionValue("s"));
}
if(cmdLine.hasOption("color")) {
params.color = true;
}
if(cmdLine.hasOption("maxVAFAbsent")) {
Parameters.MAX_VAF_ABSENT = Double.parseDouble(cmdLine.getOptionValue("maxVAFAbsent"));
} else if(!cmdLine.hasOption("sampleProfile")) {
System.out.println("Required parameter: -maxVAFAbsent");
hf.printHelp("lichee", options);
System.exit(-1);
}
if(cmdLine.hasOption("minVAFPresent")) {
Parameters.MIN_VAF_PRESENT = Double.parseDouble(cmdLine.getOptionValue("minVAFPresent"));
} else if(!cmdLine.hasOption("sampleProfile")) {
System.out.println("Required parameter: -minVAFPresent");
hf.printHelp("lichee", options);
System.exit(-1);
}
if(cmdLine.hasOption("maxVAFValid")) {
Parameters.MAX_ALLOWED_VAF = Double.parseDouble(cmdLine.getOptionValue("maxVAFValid"));
}
if(cmdLine.hasOption("minProfileSupport")) {
Parameters.MIN_GROUP_PROFILE_SUPPORT = Integer.parseInt(cmdLine.getOptionValue("minProfileSupport"));
}
if(cmdLine.hasOption("minClusterSize")) {
Parameters.MIN_CLUSTER_SIZE = Integer.parseInt(cmdLine.getOptionValue("minClusterSize"));
}
if(cmdLine.hasOption("minPrivateClusterSize")) {
Parameters.MIN_PRIVATE_CLUSTER_SIZE = Integer.parseInt(cmdLine.getOptionValue("minPrivateClusterSize"));
}
if(cmdLine.hasOption("minRobustNodeSupport")) {
Parameters.MIN_ROBUST_CLUSTER_SUPPORT = Integer.parseInt(cmdLine.getOptionValue("minRobustNodeSupport"));
}
if(cmdLine.hasOption("maxClusterDist")) {
Parameters.MAX_COLLAPSE_CLUSTER_DIFF = Double.parseDouble(cmdLine.getOptionValue("maxClusterDist"));
}
if(cmdLine.hasOption("c")) {
Parameters.ALL_EDGES = true;
}
if(cmdLine.hasOption("cp")) {
Parameters.CP = true;
Parameters.VAF_MAX = 1.0;
Parameters.MAX_ALLOWED_VAF = 1.0;
}
if(cmdLine.hasOption("e")) {
Parameters.VAF_ERROR_MARGIN = Double.parseDouble(cmdLine.getOptionValue("e"));
}
if(cmdLine.hasOption("nTreeQPCheck")) {
Parameters.NUM_TREES_FOR_CONSISTENCY_CHECK = Integer.parseInt(cmdLine.getOptionValue("nTreeQPCheck"));
}
if(cmdLine.hasOption("h")) {
new HelpFormatter().printHelp(" ", options);
}
// logger
ConsoleHandler h = new ConsoleHandler();
h.setFormatter(new LogFormatter());
h.setLevel(Level.INFO);
logger.setLevel(Level.INFO);
if(cmdLine.hasOption("v")) {
h.setLevel(Level.FINEST);
logger.setLevel(Level.FINEST);
}
logger.addHandler(h);
logger.setUseParentHandlers(false);
if(cmdLine.hasOption("build")) {
buildLineage(params);
} else {
new HelpFormatter().printHelp("lichee", options);
System.exit(-1);
}
}
protected static class Args {
// --- 'build' command ---
String inputFileName;
String outputFileName;
String outputDOTFileName;
String clustersFileName;
int normalSampleId = 0;
String cnvFileName;
String annFileName;
String cosmicFileName;
String tcgaFileName;
// --- 'show' command ---
String showFileNamePrefix;
int numShow = 0;
int numSave = 1;
// flags
boolean showNetwork = false;
boolean verbose = false;
boolean color = false;
}
protected static class LogFormatter extends Formatter {
public String format(LogRecord rec) {
return rec.getMessage() + "\r\n";
}
}
protected static class OptionComarator<T extends Option> implements Comparator<T> {
protected ArrayList<Option> orderedOptions;
public OptionComarator(ArrayList<Option> options) {
orderedOptions = options;
}
public int compare(T o1, T o2) {
return orderedOptions.indexOf(o1) - orderedOptions.indexOf(o2);
}
}
}