forked from AmirUCR/allegro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
175 lines (145 loc) · 7.3 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# ===============================================================
# Basic Settings
# ===============================================================
# String value. Name of this experiment/run.
# The postclustering notes in this file reference it in the output path
# output/EXPERIMENT_NAME/EXPERIMENT_NAME.csv.
experiment_name: '1k_asco_again'
# ---
# ---------------------------------------------------------------
# Path Settings
# ---------------------------------------------------------------
# Ignored if the Easy Mode setting input_csv_path_with_guides (defined
# further down in this file) has a value.
#
# Only one set of these keys may be uncommented at a time (duplicate keys
# are not valid YAML); the alternatives are kept commented out for quick
# switching.
#
# Alternative preset (commented out):
# input_directory: 'data/input/example_input'
# input_species_path: 'data/input/fifty_example_input_species.csv'
#
# ACTIVE preset:
# NOTE(review): presumably input_species_path is the species CSV,
# input_species_path_column is the column in that CSV holding each species'
# file name, and input_directory is where those files live -- confirm.
input_species_path_column: 'cds_file_name'
input_directory: 'data/input/cds/delimited_cds_from_gff'
input_species_path: 'data/input/ascomycota_input_species.csv'
#
# Alternative preset (commented out):
# input_species_path_column: 'filename'
# input_directory: 'data/input/cds/'
# input_species_path: 'data/input/hundred.csv'
#
# Alternative preset (commented out):
# input_species_path_column: 'ortho_file_name'
# input_directory: 'data/input/cds/ortho_from_gff'
# input_species_path: 'data/input/fourdbs_input_species.csv'
# ---------------------------------------------------------------
# String value, default: 'track_e'
# track_a: any of the fasta records can be targeted. There will be
#          `multiplicity` targets per fasta record.
# track_e: each fasta record has to be targeted at least `multiplicity` times.
track: 'track_a'
# ---
# Integer value, default: 1
# In track_a, each species needs to be targeted at least this many times ANYWHERE.
# In track_e, EACH gene/record needs to be targeted at least this many times in that gene/record.
multiplicity: 1
# ---
# Possible values: True or False. Default: True
# When True, guides whose GC value is above gc_max or below gc_min are
# removed from consideration.
filter_by_gc: True
gc_max: 0.6 # Upper bound. Only used if filter_by_gc is True.
gc_min: 0.4 # Lower bound. Only used if filter_by_gc is True.
# ---
# ---------------------------------------------------------------
# Easy Mode - Has priority over Path Settings
# ---------------------------------------------------------------
# String value. When this has a value, the Path Settings section is ignored.
# Leave it empty ('') to use the Path Settings instead.
input_csv_path_with_guides: '' # Default: ''
# ===============================================================
# Advanced Settings
# ===============================================================
# Possible choices: 'dummy' (default), 'ucrispr'
# dummy: assigns a score of 1.0 to all gRNAs, essentially treating all guides as the same.
# ucrispr: uses a faster implementation of zhang2019 and assigns an efficacy
#          score in the range 0-100 to each guide.
scorer: 'dummy'
# Integer value, default: 0
# Only used if scorer is 'ucrispr'.
# Guides with a score under this value are removed from the calculation.
guide_score_threshold: 0
# ---
# Integer value, default: 0
beta: 0 # The final size of the gRNA set must be <= this value. Think of it as your budget.
# - Setting to 0 disables beta and causes ALLEGRO to find the smallest gRNA set IGNORING scores
#   (treats all of the gRNAs as equals).
# - If set to the number of input species, the final size of the set may be up to
#   the number of species you have (worst case, one gRNA per species).
# - If set to a number HIGHER than the number of species (track A) or genes (track E),
#   ALLEGRO finds the best #beta gRNAs.
# ---
# List of strings, Default: ['']
# ALLEGRO only outputs guides that contain none of the patterns in this list.
# - Supports individual nucleotides; e.g., 'TTTT' excludes guides with quad-T in their sequence.
# - Supports up to 5 chained IUPAC codes; e.g., 'RYSN'.
# - Exception to the 5-code limit: positional nucleotides used in conjunction
#   with 'N's. E.g., NNNNNNNCNNNNGNNNN will exclude guides with C and G in
#   those positions.
# Be careful not to place common nucleotides or IUPAC codes here, such as just 'A' or 'AG':
# you may end up excluding most or all guides from the calculation.
# Reference:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3568203/#:~:text=For%20vertebrates%2C%20as,of%20these%20species.
patterns_to_exclude: ['TTTT', 'AAAA', 'CCCC', 'GGGG']
# ---
# Boolean: True or False. Default: False
# Significantly affects running time.
# True reports gRNAs with off-targets.
output_offtargets: False
# Integer value. Reports gRNAs with <= N mismatches after the seed region.
report_up_to_n_mismatches: 1 # Allowed range: 0-3
# NOTE(review): presumably the directory containing the background fasta
# files used for the off-target check -- confirm.
input_species_offtarget_dir: 'data/input/cds/ortho_from_gff'
# The column in the input csv file with the name of the
# background fasta to check off-targets against.
input_species_offtarget_column: 'ortho_file_name'
# ---
# Boolean: True or False. Default: False
# Affects running time performance.
# Allows a guide within up to the set number of mismatches (after the seed region) of another guide
# to "inherit" the second guide's targets, essentially rendering the second guide redundant
# and reducing the total guides needed.
# Works best when unscored guides are present, as it does not consider scores.
# Uses seed_region_is_n_upstream_of_pam and mismatches_allowed_after_seed_region.
preclustering: False
# Boolean: True or False. Default: False
# Affects running time performance.
# Compresses the output gRNA set by clustering similar gRNAs.
# Adds a new column called 'cluster' to output/EXPERIMENT_NAME/EXPERIMENT_NAME.csv
# Uses seed_region_is_n_upstream_of_pam and mismatches_allowed_after_seed_region.
postclustering: False
# The two settings below are shared by preclustering and postclustering.
# NOTE(review): presumably the seed region is the N positions immediately
# upstream of the PAM -- confirm the unit and direction.
seed_region_is_n_upstream_of_pam: 12
mismatches_allowed_after_seed_region: 2 # Integer value, default: 2
# Integer value, measured in seconds, default: 60
# Stop searching for an optimal solution after this many seconds.
# Only used in solving the ILP, i.e. if there are remaining feasible guides
# with fractional values after solving the LP.
early_stopping_patience: 60
# ---
# Integer value. Default: 3
# Controls a preprocessing step that removes redundant guides.
# Use if you need to save memory: a higher value sacrifices more running time
# for lower memory consumption.
# Max value is the total number of genes if using track E, and
# the total number of species if using track A.
# NOTE(review): currently set to 0 although the documented default is 3;
# the effect of 0 is not documented here -- confirm it is intended.
mp_threshold: 0
# ---
# Boolean: True or False. Default: True
# When a problem is deemed unsolvable (e.g., Status: MPSOLVER_INFEASIBLE),
# enabling diagnostics will attempt to relax each constraint and re-solve the problem.
# If the new problem with the relaxed constraint is solvable, ALLEGRO outputs
# the culprit gene/species.
# Currently, to stop this process, you need to find the PID of
# the python process running ALLEGRO using: $ top
# and kill it manually: $ kill -SIGKILL PID
enable_solver_diagnostics: True
# ---
# left side seed
# GTGCTCAG CTTCGGCGTCAA
# allow 3mm
# guide1 ACTG-CTTCGGCGTCAA: gene A, gene B
# guide2 CCTG-CTTCGGCGTCAA: gene C, gene D
# guide3 TCTG-seed
# newdict
# CTTCGGCGTCAA: [ACTG, CCTG] # median string
# make script for finding conserved domains for high hitter guides
# preclustering replace greedy
# We need a time analysis. If you give the ILP 10 hours, it will run for 10 hours if it doesn't
# find an optimal solution before that. If it has only found a feasible solution by 10 hours, that's what
# you get.
# You can give A7 5 minutes and the whole process takes 9 minutes. or you can give 3 hours
# to A1 and it will take about 3 hours to finish. A1 does not need 3 hours...
# How long until first feasible? How long until optimal? Currently, for E1,
# it's between 10 minutes and 3 hours... when does it find the first feasible?
# Explain in the wiki how to use preclustering. If you turn it on, the csv file doesn't show that all targets are covered.
# You need to turn on the off-target finder and analyze that.