Skip to content
This repository has been archived by the owner on Dec 21, 2022. It is now read-only.

Commit

Permalink
First pass at adding options to parsing.
Browse files Browse the repository at this point in the history
  • Loading branch information
tshauck committed Nov 22, 2018
1 parent ef1911a commit af35e4e
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 8 deletions.
22 changes: 20 additions & 2 deletions gcgc/cli.py
Expand Up @@ -29,22 +29,40 @@ def version():


def to_path(ctx, param, value) -> pathlib.Path:
    """
    Click callback that wraps the raw parameter value in a ``pathlib.Path``.

    ``ctx`` and ``param`` are part of the callback signature but are not used here;
    only ``value`` is converted and returned.
    """
    as_path = pathlib.Path(value)
    return as_path


@main.command()
@click.argument("filename", callback=to_path)
@click.argument("format")
def convert_file_to_tf_records(filename, format):
@click.option("-e", "--encapsulate", is_flag=True)
@click.option("-c", "--conform-to-length", "conform_to", type=int)
@click.option("-p", "--pad-to-length", "pad_to", type=int)
def convert_file_to_tf_records(filename, format, encapsulate, conform_to, pad_to):

output_file = filename.with_suffix(".tf_records")
output_file = filename.with_suffix(".tf-records")
logger.info(f"Reading from {filename} in format {format} and writing to {output_file}.")

writer = tf.python_io.TFRecordWriter(str(output_file))
try:
with open(filename, "rU") as handle:
for seq_record in SeqIO.parse(handle, format):

encoded_seq = EncodedSeq.from_seq(seq_record.seq)

if encapsulate:
encoded_seq = encoded_seq.encapsulate()

if conform_to:
encoded_seq = encoded_seq.conform(conform_to)

if pad_to:
encoded_seq = encoded_seq.pad(pad_to)

example = record.to_tensorflow_record(encoded_seq)
writer.write(example.SerializeToString())
finally:
Expand Down
1 change: 0 additions & 1 deletion gcgc/encoded_seq/encoded_seq.py
Expand Up @@ -85,7 +85,6 @@ def __add__(self, other) -> "EncodedSeq":
return self.from_seq(added_seq)

def __getitem__(self, index) -> "EncodedSeq":

got_item = super().__getitem__(index)
if isinstance(index, int):
return got_item
Expand Down
Expand Up @@ -8,7 +8,6 @@
import tensorflow as tf

from gcgc.encoded_seq import EncodedSeq
from gcgc.alphabet.iupac import ExtendedIUPACDNAEncoding
from gcgc.third_party.tensorflow_utils import record as gcgc_record
from gcgc.tests.fixtures import P53_HUMAN

Expand Down
13 changes: 9 additions & 4 deletions gcgc/third_party/tensorflow_utils/record.py
@@ -1,13 +1,13 @@
# (c) Copyright 2018 Trent Hauck
# All Rights Reserved

from typing import NamedTuple, Sequence
from typing import NamedTuple

import tensorflow as tf
import numpy as np


def to_tensorflow_record(encoded_seq):
def to_tensorflow_record(encoded_seq) -> tf.train.Example:
"""
Convert the sequence to a tensorflow record.
"""
Expand All @@ -28,13 +28,18 @@ def to_tensorflow_record(encoded_seq):
return example


# TODO(trent): Fully functional EncodedSeq?
# TODO: Should this be SequenceRecordExample? We probably want to keep the actual id of the
# sequence alongside the encoded data so it is available for later use.
class ParsedEncodedSequence(NamedTuple):
    """
    Immutable pair of arrays describing one parsed sequence.

    This is the declared return type of ``from_tensorflow_example``, which describes its
    result as the integer encoding together with the alphabet.
    """

    # Integer-encoded form of the sequence.
    # NOTE(review): presumably int64, matching the parsing feature spec -- confirm.
    integer_encoded: np.ndarray
    # Letters of the alphabet used for the encoding.
    # NOTE(review): presumably an array of (byte) strings -- confirm.
    alphabet_letters: np.ndarray


def from_tensorflow_example(example):
def from_tensorflow_example(example: tf.train.Example) -> ParsedEncodedSequence:
"""
Given the example return an encoded sequence with the integer encoding and alphabet.
"""

features = {
"integer_encoded": tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True),
"alphabet_letters": tf.FixedLenSequenceFeature((), tf.string, allow_missing=True),
Expand Down
2 changes: 2 additions & 0 deletions mypy.ini
@@ -0,0 +1,2 @@
[mypy]
ignore_missing_imports = True

0 comments on commit af35e4e

Please sign in to comment.