Skip to content

Commit

Permalink
Merge branch 'release/0.16.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
simonvh committed Jun 28, 2021
2 parents e4582d6 + eb4b69f commit 984399e
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 40 deletions.
17 changes: 16 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,24 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

### Fixed

### Removed

### Changed

## [0.16.1] - 2021-06-28

Bugfix release.

### Added

* Added warning when the number of sequences used for de novo motif prediction is low.

### Fixed

* Fixed bug with `gimme motif2factors`.
* Fixed "Motif does not occur in motif database when running maelstrom" (#192).
* Fixed bugs related to runs where no (significant) motifs is found.

## [0.16.0] - 2021-05-28

Expand Down
4 changes: 2 additions & 2 deletions gimmemotifs/background.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,9 +440,9 @@ def gc_bin_bedfile(
# print(col, b_start, b_end)
df_bin = df[(df[col] > b_start) & (df[col] <= b_end)]
# print(df_bin)
df_bin["start"] = df_bin["end"] - length
df_bin = df_bin[df_bin["start"] > 0]
if df_bin.shape[0] > 0:
df_bin["start"] = df_bin["end"] - length
df_bin = df_bin[df_bin["start"] > 0]
df_bin = df_bin.sample(n, replace=True, random_state=random_state)
df_bin["bin"] = "{:.2f}-{:.2f}".format(b_start, b_end)
df_bin[["chrom", "start", "end", "bin"]].to_csv(
Expand Down
14 changes: 9 additions & 5 deletions gimmemotifs/commands/motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def motifs(args):
motifs = read_motifs(pfmfile, fmt="pfm")

if args.denovo:
gimme_motifs(
denovo = gimme_motifs(
sample,
args.outdir,
params={
Expand All @@ -121,10 +121,10 @@ def motifs(args):
"use_strand": not (args.single),
},
)
denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
mc = MotifComparer()
result = mc.get_closest_match(denovo, dbmotifs=pfmfile, metric="seqcor")
match_motifs = read_motifs(pfmfile, as_dict=True)
if len(denovo) > 0:
mc = MotifComparer()
result = mc.get_closest_match(denovo, dbmotifs=pfmfile, metric="seqcor")
match_motifs = read_motifs(pfmfile, as_dict=True)
new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
base = os.path.splitext(pfmfile)[0]
map_file = base + ".motif2factors.txt"
Expand Down Expand Up @@ -153,6 +153,10 @@ def motifs(args):
else:
logger.info("skipping de novo")

if len(motifs) == 0:
logger.info("no motifs to report!")
sys.exit()

stats = [
"phyper_at_fpr",
"roc_auc",
Expand Down
4 changes: 4 additions & 0 deletions gimmemotifs/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,10 @@ def select_nonredundant_motifs(
)
result = result[motif]
redundant_motifs += [m for m in result.keys() if result[m][0] >= 0.7]

if len(keep) < 2:
return keep

logger.debug(f"Selected {len(keep)} motifs for feature elimination")

# Read motif scan results
Expand Down
54 changes: 39 additions & 15 deletions gimmemotifs/denovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,17 @@ def prepare_denovo_input_bed(inputfile, params, outdir):
pred_bedfile = os.path.join(outdir, "prediction.bed")
val_bedfile = os.path.join(outdir, "validation.bed")

with open(bedfile) as f:
n_regions = len(f.readlines())

if n_regions < 500 and fraction <= 0.2:
logger.warn(
f"You have {n_regions} input regions and only {int(fraction * n_regions)} will be used for motif prediction."
)
logger.warn(
"You may consider to increase the fraction for prediction (-f, --fraction)"
)

# Split input into prediction and validation set
logger.debug(
"Splitting %s into prediction set (%s) and validation set (%s)",
Expand Down Expand Up @@ -144,6 +155,18 @@ def prepare_denovo_input_fa(inputfile, params, outdir):
logger.info("preparing input (FASTA)")

pred_fa = os.path.join(outdir, "prediction.fa")

fa = Fasta(inputfile)
n_regions = len(fa)

if n_regions < 500 and fraction <= 0.2:
logger.warn(
f"You have {n_regions} input regions and only {int(fraction * n_regions)} will be used for motif prediction."
)
logger.warn(
"You may consider to increase the fraction for prediction (-f, --fraction)"
)

val_fa = os.path.join(outdir, "validation.fa")
loc_fa = os.path.join(outdir, "localization.fa")

Expand Down Expand Up @@ -450,21 +473,22 @@ def best_motif_in_cluster(
motifs = dict([(str(m), m) for m in motifs])

# get the statistics for those motifs that were not yet checked
clustered_motifs = []
eval_motifs = []
for clus, singles in clusters:
for motif in set([clus] + singles):
if str(motif) not in stats:
clustered_motifs.append(motifs[str(motif)])

new_stats = {}
for bg, bg_fa in background.items():
for m, s in calc_stats(
fg_file=fg_fa, bg_file=bg_fa, motifs=clustered_motifs, genome=genome
).items():
if m not in new_stats:
new_stats[m] = {}
new_stats[m][bg] = s
stats.update(new_stats)
eval_motifs.append(motifs[str(motif)])

if len(eval_motifs) > 0:
new_stats = {}
for bg, bg_fa in background.items():
for m, s in calc_stats(
fg_file=fg_fa, bg_file=bg_fa, motifs=eval_motifs, genome=genome
).items():
if m not in new_stats:
new_stats[m] = {}
new_stats[m][bg] = s
stats.update(new_stats)

rank = rank_motifs(stats, metrics)

Expand Down Expand Up @@ -620,7 +644,7 @@ def gimme_motifs(
)

if len(result.motifs) == 0:
logger.info("finished")
logger.info("de novo finished")
return []

# Write statistics
Expand All @@ -634,7 +658,7 @@ def gimme_motifs(
)
if len(motifs) == 0:
logger.info("no significant motifs")
return
return []

pfmfile = os.path.join(tmpdir, "significant_motifs.pfm")
else:
Expand Down Expand Up @@ -701,7 +725,7 @@ def gimme_motifs(
)
shutil.rmtree(tmpdir)

logger.info("finished")
logger.info("de novo finished")
logger.info("output dir: %s", outdir)
if motifs_found and cluster:
logger.info("de novo report: %s", os.path.join(outdir, "gimme.denovo.html"))
Expand Down
4 changes: 2 additions & 2 deletions gimmemotifs/maelstrom.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,12 +414,12 @@ def run_maelstrom(
m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N"
m2f["Motif"] = m2f["motif_nr"]
m2f = m2f.drop(columns=["motif_nr"])

motifs = read_motifs(pfmfile)
pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm")

with open(pfmfile, "w") as f:
for motif in motifs:
if motif.id in m2f["Motif"]:
if motif.id in selected_motifs:
f.write(f"{motif.to_pfm()}\n")

mapfile = pfmfile.replace(".pfm", ".motif2factors.txt")
Expand Down
28 changes: 16 additions & 12 deletions gimmemotifs/orthologs.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def _orthofinder(peptide_folder, threads):
"""
# run orthofinder on the primary transcripts
result = subprocess.run(
["orthofinder", "-f", peptide_folder, "-t", threads], capture_output=True
["orthofinder", "-f", peptide_folder, "-t", str(threads)], capture_output=True
)

logger.debug(f"""stdout of orthofinder:\n {result.stdout.decode("utf-8")}""")
Expand Down Expand Up @@ -559,17 +559,21 @@ def mygeneinfo(field):
# only keep aliases, gene names and HGNC symbols
for hit in hits:
for field in ["alias", "name", "symbol", "ensembl"]:
if field in hit:
if "'" not in hit[field]:
if field == "ensembl" and "gene" in hit[field]:
symbols.add(hit[field]["gene"])
else:
symbols.add(hit[field])
elif isinstance(hit[field], list):
symbols.update(
{each["gene"] for each in hit[field] if "gene" in each}
)
return list(symbols)
if field not in hit:
continue

if isinstance(hit[field], str):
hit[field] = [hit[field]]

for subhit in hit[field]:
if isinstance(subhit, str):
symbols.add(subhit)
elif isinstance(subhit, dict) and "gene" in subhit:
symbols.add(subhit["gene"])
else:
raise AssertionError()

return [symbol for symbol in symbols if "'" not in symbol]


def _has_annotation(genome):
Expand Down
14 changes: 13 additions & 1 deletion gimmemotifs/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,12 +932,24 @@ def roc_html_report(
df.rename_axis(None, inplace=True)

motifs = read_motifs(pfmfile, as_dict=True)
if use_motifs is not None and len(use_motifs) == 0:
with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
f.write("<body>No enriched motifs found.</body>")
return

if use_motifs is not None:
motifs = {k: v for k, v in motifs.items() if k in use_motifs}

idx = list(motifs.keys())
df = df.loc[idx]

df.insert(2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1])
try:
df.insert(
2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1]
)
except ZeroDivisionError:
logger.error(f"ZeroDivisionError when correcting {df['P-value']}")
df.insert(2, "corrected P-value", df["P-value"])
df.insert(3, "-log10 P-value", -np.log10(df["corrected P-value"]))
df = df[df["corrected P-value"] <= threshold]

Expand Down
5 changes: 3 additions & 2 deletions gimmemotifs/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ def set_meanstd(self, gc=False):

v = float(gc_bin.split("-")[1])
_, bstr = sorted(valid_bins, key=lambda x: abs(x[0] - v))[0]
logger.warn(f"Using {bstr}")
# logger.warn(f"Using {bstr}")
self.meanstd[gc_bin] = self.meanstd[bstr]

def set_background(
Expand Down Expand Up @@ -847,7 +847,8 @@ def set_background(
nseq = max(10000, len(gc_bins) * 1000)

if genome and fname:
raise ValueError("Need either genome or filename for background.")
logger.debug("using genome for background")
fname = None

if fname:
if not os.path.exists(fname):
Expand Down

0 comments on commit 984399e

Please sign in to comment.