-
Notifications
You must be signed in to change notification settings - Fork 0
/
hich-pool.nf
87 lines (73 loc) · 2.46 KB
/
hich-pool.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
include { CAT } from './hickory-common.nf'
// Directory handed to the publishDir directives of the DEDUP and MERGE processes below.
params.results = 'results'
/*
 * DEDUP: run `pairtools dedup` on a single pairs file.
 *
 * Input tuple:
 *   block_id   - identifier carried through unchanged to the output tuple
 *   sample_id  - used for the task tag and as the prefix of every output file
 *   pairs      - the pairs file passed to pairtools as its positional argument
 *   timing     - label embedded in the output file names (callers in this file
 *                pass "before" and "after")
 *
 * Output tuple:
 *   block_id, sample_id,
 *   <sample_id>.dedup_<timing>.pairs.lz4   (written via --output)
 *   <sample_id>.dups_<timing>.pairs.lz4    (written via --output-dups)
 *   <sample_id>.dedup_<timing>.stats.txt   (written via --output-stats)
 *
 * All declared outputs are copied into params.results by publishDir.
 */
process DEDUP {
    tag "${sample_id}"
    publishDir params.results, mode: 'copy'
    container "bskubi/pairtools:1.0.4"

    input:
    tuple val(block_id), val(sample_id), path(pairs), val(timing)

    output:
    tuple val(block_id), val(sample_id), path("${sample_id}.dedup_${timing}.pairs.lz4"), path("${sample_id}.dups_${timing}.pairs.lz4"), path("${sample_id}.dedup_${timing}.stats.txt")

    script:
    // The file names below must stay in sync with the path(...) patterns
    // declared in the output section above.
    """
    pairtools dedup \
    --send-header-to both \
    --mark-dups \
    --output ${sample_id}.dedup_${timing}.pairs.lz4 \
    --output-dups ${sample_id}.dups_${timing}.pairs.lz4 \
    --output-stats ${sample_id}.dedup_${timing}.stats.txt \
    ${pairs}
    """
}
/*
 * MERGE: combine all pairs files that belong to one block with `pairtools merge`.
 *
 * Input tuple:
 *   block_id    - identifier of the block; names the merged output file
 *   sample_ids  - the sample ids grouped under this block (not used in the command)
 *   files       - the pairs files to merge, passed to pairtools as positional arguments
 *
 * Output tuple:
 *   block_id, <block_id>.merge.pairs.lz4
 *
 * The merged file is copied into params.results by publishDir.
 */
process MERGE {
    // Tag by block_id: this process has no `sample_id` input (only `sample_ids`),
    // so the previous "${sample_id}" tag referenced an undeclared variable.
    tag "${block_id}"
    publishDir params.results, mode: 'copy'
    container "bskubi/pairtools:1.0.4"

    input:
    tuple val(block_id), val(sample_ids), path(files)

    output:
    tuple val(block_id), path("${block_id}.merge.pairs.lz4")

    script:
    """
    pairtools merge --output "${block_id}.merge.pairs.lz4" ${files}
    """
}
/*
 * final_dedup: run DEDUP once more on merged, per-block pairs files.
 *
 * take:
 *   merge_ch - channel whose items carry a block id at index 0 and a merged
 *              pairs file at index 1 (the shape MERGE emits)
 *
 * The block id is passed in both the block_id and sample_id positions of the
 * DEDUP input, so the output files are named after the block, and the timing
 * label is set to "after".
 */
workflow final_dedup {
    take:
    merge_ch

    main:
    after_merge_ch = merge_ch.map { merged -> tuple(merged[0], merged[0], merged[1], "after") }
    DEDUP(after_merge_ch)
}
workflow {
    /*
    Design notes.

    The workflow needs to know:
      - what the input .pairs paths are;
      - how to associate .pairs paths with sample IDs;
      - what the block IDs are;
      - how to associate sample IDs with block IDs.

    These associations could plausibly come from any of the following:
      - a database with a specific schema from which we obtain a join table;
      - a join table (from CSV or a database) with FILENAME, SAMPLE_ID and
        BLOCK_ID entries;
      - a regex extracting SAMPLE_ID from FILENAME and BLOCK_ID from SAMPLE_ID;
      - JSON that stores the associations (i.e. key = BLOCK_ID,
        value = list of (FILENAME, SAMPLE_ID) pairs).

    The CSV route is the one implemented here: each row is read and stored
    in a channel.
    */

    // CAT is imported from hickory-common.nf; its output is parsed as CSV here.
    // NOTE(review): what CAT does to demo.csv is not visible in this file - confirm.
    // skip: 1 drops the first line of the CSV.
    demo_rows_ch = CAT(Channel.fromPath('demo.csv')).splitCsv(skip: 1)

    // Columns are read positionally: 0 = block id, 1 = sample id, 2 = pairs file path.
    // "before" labels this first DEDUP pass in the output file names.
    csv_ch = demo_rows_ch.map { row -> tuple(row[0], row[1], file(row[2]), "before") }

    // Distinct block ids; not consumed by anything else in this workflow.
    block_id_ch = csv_ch.map { entry -> entry[0] }.distinct()

    // Keep (block id, sample id, deduplicated pairs file) from each DEDUP result
    // and group by the first element, the block id.
    dedup_ch = DEDUP(csv_ch)
        .map { result -> tuple(result[0], result[1], result[2]) }
        .groupTuple()

    merge_ch = MERGE(dedup_ch)
    final_dedup(merge_ch)
}