Skip to content

Commit

Permalink
feat: add bed file option inference (#156)
Browse files Browse the repository at this point in the history
  • Loading branch information
tshauck committed Jun 22, 2024
1 parent 26df013 commit 6fa6f30
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 29 deletions.
10 changes: 10 additions & 0 deletions python/tests/data/test-three.bedd
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
chr1 11873 12227
chr1 12612 12721
chr1 13220 14409
chr1 14361 14829
chr1 14969 15038
chr1 15795 15947
chr1 16606 16765
chr1 16857 17055
chr1 17232 17368
chr1 17605 17742
36 changes: 30 additions & 6 deletions python/tests/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test the session context."""

from pathlib import Path
import importlib
Expand Down Expand Up @@ -64,7 +65,8 @@ def test_read_fastq_with_qs_to_list():

df = session.sql(
f"""
SELECT quality_scores_to_list(quality_scores) quality_score_list, locate_regex(sequence, '[AC]AT') locate
SELECT quality_scores_to_list(quality_scores) quality_score_list,
locate_regex(sequence, '[AC]AT') locate
FROM fastq_scan('{fastq_path}')
"""
).to_polars()
Expand Down Expand Up @@ -247,15 +249,24 @@ def test_read_fasta_fa_no_options():
"""Test reading a fasta file."""
session = connect()

# Test reading a fasta file with no options no compression
fasta_path = DATA / "test.fa"
df = session.read_fasta_file(str(fasta_path)).to_polars()

assert len(df) == 2

# Test reading a fasta file with no options with compression
fasta_path = DATA / "test.fa.gz"
df = session.read_fasta_file(str(fasta_path)).to_polars()

assert len(df) == 2
# Test conflicting options
fasta_path = DATA / "test.fa"
df = session.read_fasta_file(
str(fasta_path),
options=FASTAReadOptions(file_extension="fasta"),
).to_polars()

assert len(df) == 0


@pytest.mark.skipif(
Expand Down Expand Up @@ -543,8 +554,6 @@ def test_gff_reader_polars():
not importlib.util.find_spec("polars"), reason="polars not installed"
)
def test_gff_attr_struct():
import polars as pl

session = new_session()

reader = session.read_gff_file((DATA / "test.gff").as_posix())
Expand Down Expand Up @@ -602,8 +611,6 @@ def test_gtf_reader_to_polars():
not importlib.util.find_spec("polars"), reason="polars not installed"
)
def test_gtf_attr_struct():
import polars as pl

session = new_session()

result = session.read_gtf_file((DATA / "test.gtf").as_posix())
Expand Down Expand Up @@ -700,6 +707,23 @@ def test_bed_reader():
assert result.to_polars().shape == (10, 12)


def test_bed_reader_no_options():
    """Read a three-column BED file with a non-standard extension, with and without options."""
    session = new_session()
    bed_path = (DATA / "test-three.bedd").as_posix()

    # Explicit field count: the frame exposes exactly the three requested columns.
    explicit = session.read_bed_file(bed_path, options=BEDReadOptions(n_fields=3))
    assert explicit.to_polars().shape == (10, 3)

    # No options supplied: n_fields falls back to its default of 12, so the
    # resulting frame is 12 columns wide even though the file has three.
    inferred = session.read_bed_file(bed_path)
    assert inferred.to_polars().shape == (10, 12)


def test_bed_three():
session = new_session()

Expand Down
63 changes: 41 additions & 22 deletions src/datasources/bed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,52 +15,71 @@
use exon::datasources::bed::table_provider::ListingBEDTableOptions;
use pyo3::{pyclass, pymethods};

use crate::{error::BioBearResult, file_options::FileOptions, FileCompressionType};

#[pyclass]
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
/// Options for reading BED files.
pub struct BEDReadOptions {
/// The type of compression used in the file.
file_compression_type: crate::FileCompressionType,
file_compression_type: Option<FileCompressionType>,

/// The number of fields in the file.
n_fields: usize,
n_fields: Option<usize>,

/// The file extension.
file_extension: String,
}

impl Default for BEDReadOptions {
fn default() -> Self {
Self {
file_compression_type: crate::FileCompressionType::UNCOMPRESSED,
n_fields: 12,
file_extension: "bed".to_string(),
}
}
file_extension: Option<String>,
}

#[pymethods]
impl BEDReadOptions {
#[new]
#[pyo3(signature = (/, file_compression_type = None, n_fields = None, file_extension = None))]
fn new(
file_compression_type: Option<crate::FileCompressionType>,
file_compression_type: Option<FileCompressionType>,
n_fields: Option<usize>,
file_extension: Option<String>,
) -> Self {
Self {
file_compression_type: file_compression_type
.unwrap_or(crate::FileCompressionType::UNCOMPRESSED),
n_fields: n_fields.unwrap_or(12),
file_extension: file_extension.unwrap_or("bed".to_string()),
file_compression_type,
n_fields,
file_extension,
}
}
}

impl BEDReadOptions {
    /// Fills in any option the caller left unset with the value carried by `file_options`.
    ///
    /// Values that were set explicitly always win: the file extension and the
    /// compression type are only copied over when the corresponding field is
    /// still `None`. `n_fields` is not touched here.
    ///
    /// # Errors
    ///
    /// Returns an error if the compression type reported by `file_options`
    /// cannot be converted into a [`FileCompressionType`]. The conversion is
    /// only attempted when no compression type was set explicitly.
    pub(crate) fn update_from_file_options(
        &mut self,
        file_options: &FileOptions,
    ) -> BioBearResult<()> {
        match file_options.file_extension() {
            Some(extension) if self.file_extension.is_none() => {
                self.file_extension = Some(extension.to_string());
            }
            _ => {}
        }

        match file_options.file_compression_type() {
            Some(compression) if self.file_compression_type.is_none() => {
                self.file_compression_type = Some(FileCompressionType::try_from(compression)?);
            }
            _ => {}
        }

        Ok(())
    }
}

impl From<BEDReadOptions> for ListingBEDTableOptions {
fn from(options: BEDReadOptions) -> Self {
ListingBEDTableOptions::new(options.file_compression_type.into())
.with_n_fields(options.n_fields)
.with_file_extension(options.file_extension)
let file_compression_type = options
.file_compression_type
.unwrap_or(FileCompressionType::UNCOMPRESSED);
let n_fields = options.n_fields.unwrap_or(12);
let file_extension = options.file_extension.unwrap_or_default();

ListingBEDTableOptions::new(file_compression_type.into())
.with_n_fields(n_fields)
.with_file_extension(file_extension)
}
}
4 changes: 3 additions & 1 deletion src/session_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,9 @@ impl BioBearSessionContext {
options: Option<crate::datasources::bed::BEDReadOptions>,
py: Python,
) -> PyResult<ExecutionResult> {
let options = options.unwrap_or_default();
let file_options = FileOptions::from(file_path);
let mut options = options.unwrap_or_default();
options.update_from_file_options(&file_options)?;

let result = self.ctx.read_bed(file_path, options.into());
let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;
Expand Down

0 comments on commit 6fa6f30

Please sign in to comment.