Skip to content

Commit

Permalink
Merge branch 'release/0.15.3'
Browse files Browse the repository at this point in the history
  • Loading branch information
simonvh committed Feb 1, 2021
2 parents 50abbc2 + d37b352 commit 0c7b341
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
repos:
- repo: https://github.com/ambv/black
rev: stable
rev: 20.8b1
hooks:
- id: black
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

## [0.15.3] - 2021-02-01

### Fixed

* `_non_reducing_slice` vs `non_reducing_slice` for pandas>=1.2 (#168)
* When using original region size, skip regions smaller than 10bp and warn if no
regions are left.
* Fixed a crash (`KeyError: 'Factor'`) when creating the statistics report (#170)
* Fixed bug with creating GC bins for a genome with unusual GC% (like Plasmodium).
* Fixed bug that occurs when upgrading pyarrow with an existing GimmeMotifs
cache.


## [0.15.2] - 2020-11-26

### Changed
Expand Down
25 changes: 19 additions & 6 deletions gimmemotifs/background.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import pandas as pd
import pybedtools
from genomepy import Genome
from pyarrow.lib import ArrowInvalid

# GimmeMotifs imports
from gimmemotifs import mytmpdir
Expand Down Expand Up @@ -397,12 +398,13 @@ def gc_bin_bedfile(
fname = os.path.join(
CACHE_DIR, "{}.gcfreq.{}.feather".format(os.path.basename(genome), min_bin_size)
)
if not os.path.exists(fname):
try:
df = pd.read_feather(fname)
except (ArrowInvalid, FileNotFoundError):
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
create_gc_bin_index(genome, fname, min_bin_size=min_bin_size)

df = pd.read_feather(fname)
df = pd.read_feather(fname)

if length >= min_bin_size:
col = "w{}".format(
Expand Down Expand Up @@ -477,7 +479,10 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si
try:
# pylint: disable=unexpected-keyword-arg
fields = pd.read_csv(matchfile, comment="#", nrows=10, sep="\t").shape[1]
bed = pybedtools.BedTool(matchfile)
tmp = (
pybedtools.BedTool(matchfile).filter(lambda x: len(x) >= 10).saveas().fn
)
bed = pybedtools.BedTool(tmp)
gc = np.array(
[float(x[fields + 1]) for x in bed.nucleotide_content(fi=genome_fa)]
)
Expand Down Expand Up @@ -511,9 +516,15 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si
int(np.sum((gc > round(b_start, 2)) & (gc <= round(b_end, 2))) * fraction)
)

# To make the requested number, divide remaining over
# all bins that have counts
rest = number - sum(bin_count)
for i in range(rest):
bin_count[i] += 1
i = 0
for _ in range(rest):
while bin_count[i % len(bins)] == 0:
i += 1
bin_count[i % len(bins)] += 1
i += 1

nseqs = max(bin_count) * len(bins)

Expand All @@ -533,6 +544,8 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si
pass
with open(bedfile, "a") as f:
for (b_start, b_end), n in zip(bins, bin_count):
if n == 0:
continue
# print(b_start, b_end, n)
b = "{:.2f}-{:.2f}".format(b_start, b_end)
df.loc[df["bin"] == b, ["chrom", "start", "end"]].sample(n).to_csv(
Expand Down
2 changes: 1 addition & 1 deletion gimmemotifs/motif.py
Original file line number Diff line number Diff line change
Expand Up @@ -1329,7 +1329,7 @@ def format_factors(
fmt_d = fmt_i = "{}"

if hasattr(self, "factor_info"):
fcount = Counter([x.upper() for x in self.factor_info["Factor"]])
fcount = Counter([x.upper() for x in self.factor_info.get("Factor", "")])
else:
fcount = Counter(self.factors[DIRECT_NAME] + self.factors[INDIRECT_NAME])

Expand Down
27 changes: 16 additions & 11 deletions gimmemotifs/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
from pandas.core.indexing import _non_reducing_slice

try:
from pandas.core.indexing import non_reducing_slice
except ImportError:
from pandas.core.indexing import _non_reducing_slice as non_reducing_slice

from pandas.io.formats.style import Styler
import seaborn as sns

Expand Down Expand Up @@ -121,7 +126,7 @@ def set_font(self, font_name):

def _current_index(self, subset):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)
selected = self.data.loc[subset]
idx_slice = pd.IndexSlice[
self.data.index.get_indexer(selected.index),
Expand Down Expand Up @@ -154,7 +159,7 @@ def _compute_data(self):

def _tooltip(self, tip, subset=None, part=None):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

if part is None:
part = "data"
Expand Down Expand Up @@ -202,7 +207,7 @@ def _wrap_iterable(self, it):

def _wrap(self, subset=None, axis=0):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

if axis in [0, "columns"]:
idx = self._current_index(subset)[1]
Expand All @@ -228,7 +233,7 @@ def _wrap(self, subset=None, axis=0):

def _convert_to_image(self, subset=None, height=30):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

self.display_data.loc[subset] = (
f'<div style="height:{height}px;object-fit:contain;"><img src="'
Expand Down Expand Up @@ -324,7 +329,7 @@ def align(self, subset=None, location="center", axis=0):

def to_precision_str(self, subset=None, precision=0, include_zero=True):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

def precision_str(x, precision=precision):
if (include_zero or x > 0) and x <= 10 ** -precision:
Expand All @@ -349,7 +354,7 @@ def _circle(
morph=False,
):
subset = pd.IndexSlice[:, :] if subset is None else subset
subslice = _non_reducing_slice(subset)
subslice = non_reducing_slice(subset)

if color:
palette = sns.color_palette([color])
Expand Down Expand Up @@ -502,7 +507,7 @@ def _emoji_scale(self, series, emojis=None, bins=None):

def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

idx = self._current_index(subset=subset)

Expand All @@ -515,7 +520,7 @@ def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0):

def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

idx = self._current_index(subset=subset)
result = self.display_data.iloc[idx].apply(
Expand All @@ -527,7 +532,7 @@ def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0):

def emojify(self, subset=None):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

idx = self._current_index(subset=subset)
result = self.display_data.iloc[idx].applymap(emoji.emojize)
Expand All @@ -547,7 +552,7 @@ def scaled_background_gradient(
):
if center_zero:
sub = pd.IndexSlice[:, :] if subset is None else subset
sub = _non_reducing_slice(sub)
sub = non_reducing_slice(sub)

vmax = (
self.data.loc[sub]
Expand Down
13 changes: 11 additions & 2 deletions gimmemotifs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from functools import singledispatch
from subprocess import Popen
from tempfile import NamedTemporaryFile
from shutil import copyfile

# External imports
import pyfaidx
Expand Down Expand Up @@ -207,7 +206,17 @@ def write_equalsize_bedfile(bedfile, size, outfile):
write the result to <outfile>.
Input file needs to be in BED or WIG format."""
if size is None or size <= 0:
copyfile(bedfile, outfile)
bed = pybedtools.BedTool(bedfile)
filtered_bed = pybedtools.BedTool(
bed.filter(lambda x: len(x) >= 10).saveas().fn
)

if len(bed) != len(filtered_bed):
logger.warn(
"Using original size of input file regions, however, some regions are smaller than 10nt!"
)
logger.warn("Removing all these smaller regions.")
filtered_bed.saveas(outfile)
return

BUFSIZE = 10000
Expand Down

0 comments on commit 0c7b341

Please sign in to comment.