Merge branch 'release/0.16.1'

vanheeringen-lab · Jun 28, 2021 · 984399e · 984399e
2 parents e4582d6 + eb4b69f
commit 984399e
Show file tree

Hide file tree

Showing 9 changed files with 104 additions and 40 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,9 +9,24 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
+### Fixed
+
 ### Removed
 
-### Changed
+
+## [0.16.1] - 2021-06-28
+
+Bugfix release.
+
+### Added
+
+* Added warning when the number of sequences used for de novo motif prediction is low.
+
+### Fixed
+
+* Fixed bug with `gimme motif2factors`.
+* Fixed "Motif does not occur in motif database when running maelstrom" (#192).
+* Fixed bugs related to runs where no (significant) motifs is found.
 
 ## [0.16.0] - 2021-05-28
 

diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py
@@ -440,9 +440,9 @@ def gc_bin_bedfile(
             # print(col, b_start, b_end)
             df_bin = df[(df[col] > b_start) & (df[col] <= b_end)]
             # print(df_bin)
+            df_bin["start"] = df_bin["end"] - length
+            df_bin = df_bin[df_bin["start"] > 0]
             if df_bin.shape[0] > 0:
-                df_bin["start"] = df_bin["end"] - length
-                df_bin = df_bin[df_bin["start"] > 0]
                 df_bin = df_bin.sample(n, replace=True, random_state=random_state)
                 df_bin["bin"] = "{:.2f}-{:.2f}".format(b_start, b_end)
                 df_bin[["chrom", "start", "end", "bin"]].to_csv(

diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
@@ -107,7 +107,7 @@ def motifs(args):
         motifs = read_motifs(pfmfile, fmt="pfm")
 
     if args.denovo:
-        gimme_motifs(
+        denovo = gimme_motifs(
             sample,
             args.outdir,
             params={
@@ -121,10 +121,10 @@ def motifs(args):
                 "use_strand": not (args.single),
             },
         )
-        denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
-        mc = MotifComparer()
-        result = mc.get_closest_match(denovo, dbmotifs=pfmfile, metric="seqcor")
-        match_motifs = read_motifs(pfmfile, as_dict=True)
+        if len(denovo) > 0:
+            mc = MotifComparer()
+            result = mc.get_closest_match(denovo, dbmotifs=pfmfile, metric="seqcor")
+            match_motifs = read_motifs(pfmfile, as_dict=True)
         new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
         base = os.path.splitext(pfmfile)[0]
         map_file = base + ".motif2factors.txt"
@@ -153,6 +153,10 @@ def motifs(args):
     else:
         logger.info("skipping de novo")
 
+    if len(motifs) == 0:
+        logger.info("no motifs to report!")
+        sys.exit()
+
     stats = [
         "phyper_at_fpr",
         "roc_auc",

diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py
@@ -940,6 +940,10 @@ def select_nonredundant_motifs(
         )
         result = result[motif]
         redundant_motifs += [m for m in result.keys() if result[m][0] >= 0.7]
+
+    if len(keep) < 2:
+        return keep
+
     logger.debug(f"Selected {len(keep)} motifs for feature elimination")
 
     # Read motif scan results

diff --git a/gimmemotifs/denovo.py b/gimmemotifs/denovo.py
@@ -106,6 +106,17 @@ def prepare_denovo_input_bed(inputfile, params, outdir):
     pred_bedfile = os.path.join(outdir, "prediction.bed")
     val_bedfile = os.path.join(outdir, "validation.bed")
 
+    with open(bedfile) as f:
+        n_regions = len(f.readlines())
+
+    if n_regions < 500 and fraction <= 0.2:
+        logger.warn(
+            f"You have {n_regions} input regions and only {int(fraction * n_regions)} will be used for motif prediction."
+        )
+        logger.warn(
+            "You may consider to increase the fraction for prediction (-f, --fraction)"
+        )
+
     # Split input into prediction and validation set
     logger.debug(
         "Splitting %s into prediction set (%s) and validation set (%s)",
@@ -144,6 +155,18 @@ def prepare_denovo_input_fa(inputfile, params, outdir):
     logger.info("preparing input (FASTA)")
 
     pred_fa = os.path.join(outdir, "prediction.fa")
+
+    fa = Fasta(inputfile)
+    n_regions = len(fa)
+
+    if n_regions < 500 and fraction <= 0.2:
+        logger.warn(
+            f"You have {n_regions} input regions and only {int(fraction * n_regions)} will be used for motif prediction."
+        )
+        logger.warn(
+            "You may consider to increase the fraction for prediction (-f, --fraction)"
+        )
+
     val_fa = os.path.join(outdir, "validation.fa")
     loc_fa = os.path.join(outdir, "localization.fa")
 
@@ -450,21 +473,22 @@ def best_motif_in_cluster(
     motifs = dict([(str(m), m) for m in motifs])
 
     # get the statistics for those motifs that were not yet checked
-    clustered_motifs = []
+    eval_motifs = []
     for clus, singles in clusters:
         for motif in set([clus] + singles):
             if str(motif) not in stats:
-                clustered_motifs.append(motifs[str(motif)])
-
-    new_stats = {}
-    for bg, bg_fa in background.items():
-        for m, s in calc_stats(
-            fg_file=fg_fa, bg_file=bg_fa, motifs=clustered_motifs, genome=genome
-        ).items():
-            if m not in new_stats:
-                new_stats[m] = {}
-            new_stats[m][bg] = s
-    stats.update(new_stats)
+                eval_motifs.append(motifs[str(motif)])
+
+    if len(eval_motifs) > 0:
+        new_stats = {}
+        for bg, bg_fa in background.items():
+            for m, s in calc_stats(
+                fg_file=fg_fa, bg_file=bg_fa, motifs=eval_motifs, genome=genome
+            ).items():
+                if m not in new_stats:
+                    new_stats[m] = {}
+                new_stats[m][bg] = s
+        stats.update(new_stats)
 
     rank = rank_motifs(stats, metrics)
 
@@ -620,7 +644,7 @@ def gimme_motifs(
     )
 
     if len(result.motifs) == 0:
-        logger.info("finished")
+        logger.info("de novo finished")
         return []
 
     # Write statistics
@@ -634,7 +658,7 @@ def gimme_motifs(
         )
         if len(motifs) == 0:
             logger.info("no significant motifs")
-            return
+            return []
 
         pfmfile = os.path.join(tmpdir, "significant_motifs.pfm")
     else:
@@ -701,7 +725,7 @@ def gimme_motifs(
         )
         shutil.rmtree(tmpdir)
 
-    logger.info("finished")
+    logger.info("de novo finished")
     logger.info("output dir: %s", outdir)
     if motifs_found and cluster:
         logger.info("de novo report: %s", os.path.join(outdir, "gimme.denovo.html"))

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
@@ -414,12 +414,12 @@ def run_maelstrom(
         m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N"
         m2f["Motif"] = m2f["motif_nr"]
         m2f = m2f.drop(columns=["motif_nr"])
-
         motifs = read_motifs(pfmfile)
         pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm")
+
         with open(pfmfile, "w") as f:
             for motif in motifs:
-                if motif.id in m2f["Motif"]:
+                if motif.id in selected_motifs:
                     f.write(f"{motif.to_pfm()}\n")
 
         mapfile = pfmfile.replace(".pfm", ".motif2factors.txt")

diff --git a/gimmemotifs/orthologs.py b/gimmemotifs/orthologs.py
@@ -154,7 +154,7 @@ def _orthofinder(peptide_folder, threads):
     """
     # run orthofinder on the primary transcripts
     result = subprocess.run(
-        ["orthofinder", "-f", peptide_folder, "-t", threads], capture_output=True
+        ["orthofinder", "-f", peptide_folder, "-t", str(threads)], capture_output=True
     )
 
     logger.debug(f"""stdout of orthofinder:\n {result.stdout.decode("utf-8")}""")
@@ -559,17 +559,21 @@ def mygeneinfo(field):
     # only keep aliases, gene names and HGNC symbols
     for hit in hits:
         for field in ["alias", "name", "symbol", "ensembl"]:
-            if field in hit:
-                if "'" not in hit[field]:
-                    if field == "ensembl" and "gene" in hit[field]:
-                        symbols.add(hit[field]["gene"])
-                    else:
-                        symbols.add(hit[field])
-                elif isinstance(hit[field], list):
-                    symbols.update(
-                        {each["gene"] for each in hit[field] if "gene" in each}
-                    )
-    return list(symbols)
+            if field not in hit:
+                continue
+
+            if isinstance(hit[field], str):
+                hit[field] = [hit[field]]
+
+            for subhit in hit[field]:
+                if isinstance(subhit, str):
+                    symbols.add(subhit)
+                elif isinstance(subhit, dict) and "gene" in subhit:
+                    symbols.add(subhit["gene"])
+                else:
+                    raise AssertionError()
+
+    return [symbol for symbol in symbols if "'" not in symbol]
 
 
 def _has_annotation(genome):

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
@@ -932,12 +932,24 @@ def roc_html_report(
     df.rename_axis(None, inplace=True)
 
     motifs = read_motifs(pfmfile, as_dict=True)
+    if use_motifs is not None and len(use_motifs) == 0:
+        with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
+            f.write("<body>No enriched motifs found.</body>")
+            return
+
     if use_motifs is not None:
         motifs = {k: v for k, v in motifs.items() if k in use_motifs}
+
     idx = list(motifs.keys())
     df = df.loc[idx]
 
-    df.insert(2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1])
+    try:
+        df.insert(
+            2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1]
+        )
+    except ZeroDivisionError:
+        logger.error(f"ZeroDivisionError when correcting {df['P-value']}")
+        df.insert(2, "corrected P-value", df["P-value"])
     df.insert(3, "-log10 P-value", -np.log10(df["corrected P-value"]))
     df = df[df["corrected P-value"] <= threshold]
 

diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
@@ -805,7 +805,7 @@ def set_meanstd(self, gc=False):
 
                 v = float(gc_bin.split("-")[1])
                 _, bstr = sorted(valid_bins, key=lambda x: abs(x[0] - v))[0]
-                logger.warn(f"Using {bstr}")
+                # logger.warn(f"Using {bstr}")
                 self.meanstd[gc_bin] = self.meanstd[bstr]
 
     def set_background(
@@ -847,7 +847,8 @@ def set_background(
             nseq = max(10000, len(gc_bins) * 1000)
 
         if genome and fname:
-            raise ValueError("Need either genome or filename for background.")
+            logger.debug("using genome for background")
+            fname = None
 
         if fname:
             if not os.path.exists(fname):