Skip to content

Commit

Permalink
Merge branch 'release/0.15.3'
Browse files Browse the repository at this point in the history
  • Loading branch information
simonvh committed Feb 1, 2021
2 parents 50abbc2 + d37b352 commit 0c7b341
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
repos:
- repo: https://github.com/ambv/black
rev: stable
rev: 20.8b1
hooks:
- id: black
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

## [0.15.3] - 2021-02-01

### Fixed

* `_non_reducing_slice` vs `non_reducing_slice` for pandas>=1.2 (#168)
* When using original region size, skip regions smaller than 10bp and warn if no
regions are left.
* Fixed a crash (`KeyError: 'Factor'`) when creating the statistics report (#170)
* Fixed bug with creating GC bins for a genome with unusual GC% (like Plasmodium).
* Fixed bug that occurs when upgrading pyarrow with an existing GimmeMotifs
cache.


## [0.15.2] - 2020-11-26

### Changed
Expand Down
25 changes: 19 additions & 6 deletions gimmemotifs/background.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import pandas as pd
import pybedtools
from genomepy import Genome
from pyarrow.lib import ArrowInvalid

# GimmeMotifs imports
from gimmemotifs import mytmpdir
Expand Down Expand Up @@ -397,12 +398,13 @@ def gc_bin_bedfile(
fname = os.path.join(
CACHE_DIR, "{}.gcfreq.{}.feather".format(os.path.basename(genome), min_bin_size)
)
if not os.path.exists(fname):
try:
df = pd.read_feather(fname)
except (ArrowInvalid, FileNotFoundError):
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
create_gc_bin_index(genome, fname, min_bin_size=min_bin_size)

df = pd.read_feather(fname)
df = pd.read_feather(fname)

if length >= min_bin_size:
col = "w{}".format(
Expand Down Expand Up @@ -477,7 +479,10 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si
try:
# pylint: disable=unexpected-keyword-arg
fields = pd.read_csv(matchfile, comment="#", nrows=10, sep="\t").shape[1]
bed = pybedtools.BedTool(matchfile)
tmp = (
pybedtools.BedTool(matchfile).filter(lambda x: len(x) >= 10).saveas().fn
)
bed = pybedtools.BedTool(tmp)
gc = np.array(
[float(x[fields + 1]) for x in bed.nucleotide_content(fi=genome_fa)]
)
Expand Down Expand Up @@ -511,9 +516,15 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si
int(np.sum((gc > round(b_start, 2)) & (gc <= round(b_end, 2))) * fraction)
)

# To make the requested number, divide remaining over
# all bins that have counts
rest = number - sum(bin_count)
for i in range(rest):
bin_count[i] += 1
i = 0
for _ in range(rest):
while bin_count[i % len(bins)] == 0:
i += 1
bin_count[i % len(bins)] += 1
i += 1

nseqs = max(bin_count) * len(bins)

Expand All @@ -533,6 +544,8 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si
pass
with open(bedfile, "a") as f:
for (b_start, b_end), n in zip(bins, bin_count):
if n == 0:
continue
# print(b_start, b_end, n)
b = "{:.2f}-{:.2f}".format(b_start, b_end)
df.loc[df["bin"] == b, ["chrom", "start", "end"]].sample(n).to_csv(
Expand Down
2 changes: 1 addition & 1 deletion gimmemotifs/motif.py
Original file line number Diff line number Diff line change
Expand Up @@ -1329,7 +1329,7 @@ def format_factors(
fmt_d = fmt_i = "{}"

if hasattr(self, "factor_info"):
fcount = Counter([x.upper() for x in self.factor_info["Factor"]])
fcount = Counter([x.upper() for x in self.factor_info.get("Factor", "")])
else:
fcount = Counter(self.factors[DIRECT_NAME] + self.factors[INDIRECT_NAME])

Expand Down
27 changes: 16 additions & 11 deletions gimmemotifs/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
from pandas.core.indexing import _non_reducing_slice

try:
from pandas.core.indexing import non_reducing_slice
except ImportError:
from pandas.core.indexing import _non_reducing_slice as non_reducing_slice

from pandas.io.formats.style import Styler
import seaborn as sns

Expand Down Expand Up @@ -121,7 +126,7 @@ def set_font(self, font_name):

def _current_index(self, subset):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)
selected = self.data.loc[subset]
idx_slice = pd.IndexSlice[
self.data.index.get_indexer(selected.index),
Expand Down Expand Up @@ -154,7 +159,7 @@ def _compute_data(self):

def _tooltip(self, tip, subset=None, part=None):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

if part is None:
part = "data"
Expand Down Expand Up @@ -202,7 +207,7 @@ def _wrap_iterable(self, it):

def _wrap(self, subset=None, axis=0):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

if axis in [0, "columns"]:
idx = self._current_index(subset)[1]
Expand All @@ -228,7 +233,7 @@ def _wrap(self, subset=None, axis=0):

def _convert_to_image(self, subset=None, height=30):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

self.display_data.loc[subset] = (
f'<div style="height:{height}px;object-fit:contain;"><img src="'
Expand Down Expand Up @@ -324,7 +329,7 @@ def align(self, subset=None, location="center", axis=0):

def to_precision_str(self, subset=None, precision=0, include_zero=True):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

def precision_str(x, precision=precision):
if (include_zero or x > 0) and x <= 10 ** -precision:
Expand All @@ -349,7 +354,7 @@ def _circle(
morph=False,
):
subset = pd.IndexSlice[:, :] if subset is None else subset
subslice = _non_reducing_slice(subset)
subslice = non_reducing_slice(subset)

if color:
palette = sns.color_palette([color])
Expand Down Expand Up @@ -502,7 +507,7 @@ def _emoji_scale(self, series, emojis=None, bins=None):

def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

idx = self._current_index(subset=subset)

Expand All @@ -515,7 +520,7 @@ def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0):

def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

idx = self._current_index(subset=subset)
result = self.display_data.iloc[idx].apply(
Expand All @@ -527,7 +532,7 @@ def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0):

def emojify(self, subset=None):
subset = pd.IndexSlice[:, :] if subset is None else subset
subset = _non_reducing_slice(subset)
subset = non_reducing_slice(subset)

idx = self._current_index(subset=subset)
result = self.display_data.iloc[idx].applymap(emoji.emojize)
Expand All @@ -547,7 +552,7 @@ def scaled_background_gradient(
):
if center_zero:
sub = pd.IndexSlice[:, :] if subset is None else subset
sub = _non_reducing_slice(sub)
sub = non_reducing_slice(sub)

vmax = (
self.data.loc[sub]
Expand Down
13 changes: 11 additions & 2 deletions gimmemotifs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from functools import singledispatch
from subprocess import Popen
from tempfile import NamedTemporaryFile
from shutil import copyfile

# External imports
import pyfaidx
Expand Down Expand Up @@ -207,7 +206,17 @@ def write_equalsize_bedfile(bedfile, size, outfile):
write the result to <outfile>.
Input file needs to be in BED or WIG format."""
if size is None or size <= 0:
copyfile(bedfile, outfile)
bed = pybedtools.BedTool(bedfile)
filtered_bed = pybedtools.BedTool(
bed.filter(lambda x: len(x) >= 10).saveas().fn
)

if len(bed) != len(filtered_bed):
logger.warn(
"Using original size of input file regions, however, some regions are smaller than 10nt!"
)
logger.warn("Removing all these smaller regions.")
filtered_bed.saveas(outfile)
return

BUFSIZE = 10000
Expand Down

0 comments on commit 0c7b341

Please sign in to comment.