diff --git a/setup.cfg b/setup.cfg index bd92128..f0c25bd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,8 +53,6 @@ install_requires = cachetools==5.0.0 click==7.1.2 freezegun==1.1.0 - numpy==1.22.3 - pandas==1.3.0 pydantic==1.9.1 pynmrstar==3.3.1 pyparsing==3.0.9 diff --git a/src/nef_pipelines/__init__.py b/src/nef_pipelines/__init__.py new file mode 100644 index 0000000..e451f10 --- /dev/null +++ b/src/nef_pipelines/__init__.py @@ -0,0 +1,16 @@ +import sys + +if sys.version_info[:2] >= (3, 8): + # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` + from importlib.metadata import PackageNotFoundError, version # pragma: no cover +else: + from importlib_metadata import PackageNotFoundError, version # pragma: no cover + +try: + # Change here if project is renamed and does not equal the package name + dist_name = __name__ + __version__ = version(dist_name) +except PackageNotFoundError: # pragma: no cover + __version__ = "unknown" +finally: + del version, PackageNotFoundError diff --git a/src/nef_pipelines/__main__.py b/src/nef_pipelines/__main__.py new file mode 100644 index 0000000..40e2b01 --- /dev/null +++ b/src/nef_pipelines/__main__.py @@ -0,0 +1,4 @@ +from .main import main + +if __name__ == "__main__": + main() diff --git a/src/nef_pipelines/lib/__init__.py b/src/nef_pipelines/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nef_pipelines/lib/constants.py b/src/nef_pipelines/lib/constants.py new file mode 100644 index 0000000..e9b34f3 --- /dev/null +++ b/src/nef_pipelines/lib/constants.py @@ -0,0 +1,11 @@ +NEF_PIPELINES = "NEFPipelines" + +NEF_PIPELINES_VERSION = "0.0.1" + +NEF_VERSION = "1.1" + +NEF_META_DATA = "nef_nmr_meta_data" + +NEF_UNKNOWN = "." + +EXIT_ERROR = 1 diff --git a/src/nef_pipelines/lib/header_lib.py b/src/nef_pipelines/lib/header_lib.py new file mode 100644 index 0000000..87d5062 --- /dev/null +++ b/src/nef_pipelines/lib/header_lib.py @@ -0,0 +1,40 @@ +import datetime +from random import randint + +from pynmrstar import Loop, Saveframe + +from nef_pipelines.lib.constants import NEF_PIPELINES, NEF_VERSION + + +def get_creation_time(): + return datetime.datetime.now().isoformat() + + +def get_uuid(name, creation_time): + random_value = "".join(["{}".format(randint(0, 9)) for _ in range(10)]) + return f"{name}-{creation_time}-{random_value}" + + +def create_header_frame(program_name, program_version, script_name): + frame = Saveframe.from_scratch("nef_nmr_meta_data", "nef_nmr_meta_data") + + frame.add_tag("sf_category", "nef_nmr_meta_data") + frame.add_tag("sf_framecode", "nef_nmr_meta_data") + frame.add_tag("format_name", "nmr_exchange_format") + frame.add_tag("nef_nmr_meta_data.format_version", NEF_VERSION) + frame.add_tag("program_name", program_name) + frame.add_tag("script_name", script_name) + frame.add_tag("program_version", program_version) + + creation_time = get_creation_time() + uuid = get_uuid(NEF_PIPELINES, creation_time) + frame.add_tag("creation_date", creation_time) + frame.add_tag("uuid", uuid) + + loop = Loop.from_scratch("nef_run_history") + frame.add_loop(loop) + + history_tags = "run_number", "program_name", "program_version", "script_name" + loop.add_tag(history_tags) + + return frame diff --git a/src/nef_pipelines/lib/nef_lib.py b/src/nef_pipelines/lib/nef_lib.py new file mode 100644 index 0000000..1eec31e --- /dev/null +++ b/src/nef_pipelines/lib/nef_lib.py @@ -0,0 +1,266 @@ +import sys +from argparse import Namespace +from enum import auto +from fnmatch import fnmatch +from pathlib import 
Path +from typing import Dict, Iterator, List, Union + +# from pandas import DataFrame +from pynmrstar import Entry, Loop, Saveframe +from strenum import LowercaseStrEnum + +from nef_pipelines.lib.util import ( + cached_stdin, + exit_error, + is_float, + is_int, + running_in_pycharm, +) + +NEF_CATEGORY_ATTR = "__NEF_CATEGORY__" + +UNUSED = "." + + +# what the selection type is for Star SaveFrames +class SelectionType(LowercaseStrEnum): + NAME = auto() + CATEGORY = auto() + ANY = auto() + + # see https://github.com/irgeek/StrEnum/issues/9 + def _cmp_values(self, other): + return self.value, str(other).upper() + + +# currently disabled as they add a dependency on pandas and numpy +# def loop_to_dataframe(loop: Loop) -> DataFrame: +# """ +# convert a pynmrstar Loop to a pandas DataFrame. Note the Loop category is +# saved in the dataframe's attrs['__NEF_CATEGORY__'] +# +# :param loop: the pynmrstar Loop +# :return: a pandas DataFrame +# """ +# tags = loop.tags +# data = DataFrame() +# +# # note this strips the preceding _ +# data.attrs[NEF_CATEGORY_ATTR] = loop.category[1:] +# +# for tag in tags: +# if tag != "index": +# data[tag] = loop.get_tag(tag) +# +# return data +# +# +# def dataframe_to_loop(frame: DataFrame, category: str = None) -> Loop: +# """ +# convert a pandas DataFrame to a pynmrstar Loop +# +# :param frame: the pandas DataFrame +# :param category: the star category note this will override any category stored in attrs +# :return: the new pynmrstar Loop +# """ +# loop = Loop.from_scratch(category=category) +# loop_data = {} +# for column in frame.columns: +# loop.add_tag(column) +# loop_data[column] = list(frame[column]) +# +# loop.add_data(loop_data) +# +# if NEF_CATEGORY_ATTR in frame.attrs and not category: +# loop.set_category(frame.attrs[NEF_CATEGORY_ATTR]) +# elif category: +# loop.set_category(category) +# +# return loop + + +def select_frames_by_name( + frames: Union[List[Saveframe], Entry], + name_selectors: Union[List[str], str], + exact=False, +) -> List[Saveframe]: + """ + select frames by names and wild cards, to avoid typing *s on the command line the match is greedy by default + if an exact match is not found for one of the frames first time we search will all the name selectors turned into + wild cards by surrounding them with the * as a fence so name_selector-> *name_selector* + + :param frames: the list of frames or entry to search + :param name_selectors: a single string or list of strings to use to match frame names, selectors can contain + wild cards used by pythons fnmatch + :param exact: do exact matching and don't search again with wildcards added if no frames are selected + :return: a list or matching frames + """ + + def match_frames(frames, name_selectors): + + result = {} + + for frame in frames: + for selector in name_selectors: + + if fnmatch(frame.name, selector): + # frames aren't hashable and so can't be saved in a set + # but names should be unique + result[frame.name] = frame + + return result + + if isinstance(name_selectors, str): + name_selectors = [ + name_selectors, + ] + + result = match_frames(frames, name_selectors) + + if not exact and len(result) == 0: + + name_selectors = [f"*{selector}*" for selector in name_selectors] + result = match_frames(frames, name_selectors) + + return list(result.values()) + + +# refactor to two functions one of which gets a TextIO +def create_entry_from_stdin_or_exit() -> Entry: + + """ + read a star file entry from stdin or exit withan error message + :return: a star file entry + """ + + try: + if not 
sys.stdin.isatty() or running_in_pycharm():
+            stdin = cached_stdin()
+            if stdin is None:
+                lines = ""
+            else:
+                lines = "".join(stdin)
+
+            if len(lines.strip()) == 0:
+                raise Exception("stdin is empty")
+            else:
+                entry = Entry.from_string(lines)
+        else:
+            exit_error("you appear to be reading from an empty stdin")
+    except Exception as e:
+        exit_error(
+            f"failed to read nef entry from stdin because the NEF parser replied: {e}",
+            e,
+        )
+
+    return entry
+
+
+# TODO we should examine columns for types not individual row entries
+def loop_row_dict_iter(
+    loop: Loop, convert: bool = True
+) -> Iterator[Dict[str, Union[str, int, float]]]:
+    """
+    create an iterator that loops over the rows in a star file Loop as dictionaries, by default sensible
+    conversions from strings to ints and floats are made
+    :param loop: the Loop
+    :param convert: try to convert values to ints or floats if possible [default is True]
+    :return: iterator of rows as dictionaries
+    """
+    for row in loop:
+        row = {tag: value for tag, value in zip(loop.tags, row)}
+
+        if convert:
+            for key in row:
+                row[key] = do_reasonable_type_conversions(row[key])
+
+        yield row
+
+
+def do_reasonable_type_conversions(value: str) -> Union[str, float, int]:
+    """
+    do reasonable type conversion from str to int or float
+    :param value: the string to convert
+    :return: value converted from str to int or float if possible
+    """
+    if is_int(value):
+        value = int(value)
+    elif is_float(value):
+        value = float(value)
+    return value
+
+
+def loop_row_namespace_iter(loop: Loop, convert: bool = True) -> Iterator[Namespace]:
+    """
+    create an iterator that loops over the rows in a star file Loop as Namespaces, by default sensible
+    conversions from strings to ints and floats are made
+    :param loop: the Loop
+    :param convert: try to convert values to ints or floats if possible [default is True]
+    :return: iterator of rows as Namespaces
+    """
+    for row in loop_row_dict_iter(loop, convert=convert):
+        yield Namespace(**row)
+
+
+# TODO this partially overlaps with select_frames_by_name in this file, combine and simplify!
+def select_frames(
+    entry: Entry, selector_type: SelectionType, filters: List[str]
+) -> List[Saveframe]:
+    """
+    select a list of frames by either name or category
+
+    :param entry: the entry in which frames are looked for
+    :param selector_type: the matching type frame.name or frame.category or both [default, search order
+                          frame.name frame.category]
+    :param filters: a list of strings to use as filters as defined by fnmatch
+    :return: a list of selected saveframes
+    """
+
+    star_filters = [f"*{filter}*" for filter in filters]
+    filters = list(filters)
+    filters.extend(star_filters)
+
+    result = []
+    for frame in entry.frame_dict.values():
+
+        accept_frame_category = any(
+            [fnmatch(frame.category, filter) for filter in filters]
+        )
+        accept_frame_name = any([fnmatch(frame.name, filter) for filter in filters])
+
+        if (
+            selector_type in (SelectionType.NAME, SelectionType.ANY)
+            and not accept_frame_name
+        ):
+            continue
+
+        if (
+            selector_type in (SelectionType.CATEGORY, SelectionType.ANY)
+            and not accept_frame_category
+        ):
+            continue
+
+        result.append(frame)
+
+    return result
+
+
+def read_entry_from_file_or_stdin_or_exit_error(file: Path) -> Entry:
+    """
+    read a star entry from stdin or a file or exit.
+    note 1. the stdin stream is cached
+         2. 
this exits with an error if stdin can't be read because its a terminal + + :param file: + :return: + """ + if file is None: + entry = create_entry_from_stdin_or_exit() + else: + try: + with open(file) as fh: + entry = Entry.from_file(fh) + + except IOError as e: + exit_error(f"couldn't read from the file {file}", e) + return entry diff --git a/src/nef_pipelines/lib/sequence_lib.py b/src/nef_pipelines/lib/sequence_lib.py new file mode 100644 index 0000000..b03de30 --- /dev/null +++ b/src/nef_pipelines/lib/sequence_lib.py @@ -0,0 +1,530 @@ +import string +import sys +from dataclasses import replace +from textwrap import dedent +from typing import Dict, Iterable, List, Optional, Tuple, Union + +from ordered_set import OrderedSet +from pynmrstar import Entry, Loop, Saveframe + +from nef_pipelines.lib.constants import NEF_UNKNOWN +from nef_pipelines.lib.nef_lib import loop_to_dataframe +from nef_pipelines.lib.structures import Linking, SequenceResidue +from nef_pipelines.lib.util import ( + cached_stdin, + chunks, + exit_error, + is_int, + running_in_pycharm, +) + +NEF_CHAIN_CODE = "chain_code" + + +def chain_code_iter( + user_chain_codes: Union[str, List[str]] = "", exclude: Union[str, List[str]] = () +) -> Iterable[str]: + """ + split input string into chain codes separated by .s, and yield them. + Then yield any remaining letters of the upper case alphabet till they run out + + Args: + user_chain_codes (str): string of dot separated chain codes + + Returns: + Iterable[str] single codes + """ + + ascii_uppercase = list(string.ascii_uppercase) + + if isinstance(user_chain_codes, str): + if "." in user_chain_codes: + user_chain_codes = user_chain_codes.split(".") + + if isinstance(exclude, str): + if "." in exclude: + exclude = exclude.split(".") + + chain_codes = user_chain_codes if user_chain_codes else ascii_uppercase + + seen_codes = set() + + for chain_code in chain_codes: + seen_codes.add(chain_code) + if chain_code not in exclude: + yield chain_code + + for chain_code in ascii_uppercase: + if chain_code not in seen_codes and chain_code not in exclude: + yield chain_code + + raise ValueError("run out of chain names!") + + +def get_linking( + target_sequence: List[SequenceResidue], + no_chain_start=List[str], + no_chain_end=List[str], +) -> List[SequenceResidue]: + + """ + get the correct linking for residue in a standard sequenced chain + + Args: + target_index (int): index in the sequence + target_sequence (List[SequenceResidue]) : the sequence + no_chain_start (List[str]): if the chain isn't in the list cap it at the start + no_end (List[str]): if the chain isn't in the list cap it at the end + + Returns: + List[SequenceResidue]: correctly linked residues + """ + + by_chain = {} + for residue in target_sequence: + by_chain.setdefault(residue.chain_code, []).append((residue, "middle")) + + result = [] + for chain, residues_and_linkages in by_chain.items(): + + if chain not in no_chain_start: + residues_and_linkages[0] = (residues_and_linkages[0][0], "start") + + if chain not in no_chain_end: + residues_and_linkages[-1] = (residues_and_linkages[-1][0], "end") + + result.extend(residues_and_linkages) + + return result + + +def sequence_to_nef_frame( + sequences: List[SequenceResidue], + no_chain_start: Union[List[str], Tuple[str]] = (), + no_chain_end: Union[List[str], Tuple[str]] = (), +) -> Saveframe: + """ + + Args: + sequences (List[SequenceResidue]): the sequence as a list of sequence residues [will be sorted] + no_chain_start (List[str]): if the chain isn't in the list cap it at the 
start + no_chain_end (List[str]): if the chain isn't in the list cap it at the end + + Returns: + Saveframe: a NEF saveframe + """ + + sequences = sorted(sequences) + + category = "nef_molecular_system" + + frame_code = f"{category}" + + nef_frame = Saveframe.from_scratch(frame_code, category) + + nef_frame.add_tag("sf_category", category) + nef_frame.add_tag("sf_framecode", frame_code) + + nef_loop = Loop.from_scratch("nef_sequence") + nef_frame.add_loop(nef_loop) + + tags = ( + "index", + "chain_code", + "sequence_code", + "residue_name", + "linking", + "residue_variant", + "cis_peptide", + ) + + nef_loop.add_tag(tags) + + # TODO need tool to set ionisation correctly + residue_and_linkages = get_linking(sequences, no_chain_start, no_chain_end) + + for index, (sequence_residue, linking) in enumerate(residue_and_linkages): + + nef_loop.add_data_by_tag("index", index + 1) + nef_loop.add_data_by_tag("chain_code", sequence_residue.chain_code) + nef_loop.add_data_by_tag("sequence_code", sequence_residue.sequence_code) + nef_loop.add_data_by_tag("residue_name", sequence_residue.residue_name.upper()) + nef_loop.add_data_by_tag("linking", linking) + nef_loop.add_data_by_tag("residue_variant", NEF_UNKNOWN) + nef_loop.add_data_by_tag("cis_peptide", NEF_UNKNOWN) + + return nef_frame + + +TRANSLATIONS_3_1 = { + "ALA": "A", + "ARG": "R", + "ASN": "N", + "ASP": "D", + "CYS": "C", + "GLU": "E", + "GLN": "Q", + "GLY": "G", + "HIS": "H", + "ILE": "I", + "LEU": "L", + "LYS": "K", + "MET": "M", + "PHE": "F", + "PRO": "P", + "SER": "S", + "THR": "T", + "TRP": "W", + "TYR": "Y", + "VAL": "V", +} +TRANSLATIONS_1_3 = {value: key for (key, value) in TRANSLATIONS_3_1.items()} + + +class BadResidue(Exception): + """ + Bad residue found in a sequence + """ + + pass + + +def translate_1_to_3( + sequence: str, + translations: Dict[str, str] = TRANSLATIONS_1_3, + unknown: Optional[str] = None, +) -> List[str]: + """ + Translate a 1 letter sequence to a 3 letter sequence + Args: + sequence (str): 1 letter sequence + translations (Dict[str, str]): a list of translations for single amino acid codes to 3 letter residue names + unknown (Optional[str]): optional name for residues if they are unknown, if set no error is raised if a + 1 letter residue name is not recognised + Returns List[str]: + a list of 3 residue codes + + """ + result = [] + for i, residue_name_1let in enumerate(sequence): + residue_name_1let = residue_name_1let.upper() + if residue_name_1let in translations: + result.append(translations[residue_name_1let]) + else: + if unknown: + result.append(unknown) + else: + msg = f"""\ + unknown residue {residue_name_1let} at residue {i+1} + sequence: {sequence} + {(' ' * i) + '^'} + """ + msg = dedent(msg) + raise BadResidue(msg) + + return result + + +def translate_3_to_1( + sequence: List[str], translations: Dict[str, str] = TRANSLATIONS_3_1 +) -> List[str]: + """ + + Translate a 3 letter sequence to a 1 letter sequence + Args: + sequence (str): 3 letter sequence + translations (Dict[str, str]): a list of translations for single amino acid codes to 3 letter residue names + + Returns List[str]: + a list of 1 residue codes + + :param sequence: + :return: + """ + result = [] + for i, resn in enumerate(sequence, start=1): + resn = resn.upper() + if resn in translations: + result.append(translations[resn]) + else: + msg = f""" + unknown residue {resn} + in sequence {chunks(' '.join(sequence), 10)} + at residue number {i} + """ + exit_error(msg) + + return result + + +def sequence_3let_to_sequence_residues( + 
sequence_3let: List[str], chain_code: str = "A" +) -> List[SequenceResidue]: + """ + Translate a list of 3 residue sequence codes to SequenceResidues + Args: + sequence_3let (List([str]): list of 3 letter residue names + chain_code (str): the chain code [defaults to A] + offset (int): the sequence offset [defaults to 0] + + Returns List[SequenceResidue]: + the sequence as a list of SequenceResidues + + """ + + return [ + SequenceResidue(chain_code, i, residue) + for (i, residue) in enumerate(sequence_3let, start=1) + ] + + +def sequence_residues_to_sequence_3let( + sequence: List[SequenceResidue], chain_code: str = "A" +) -> List[str]: + """ + Translate a list of SequenceResidues to 3 letter sequence codes t + Args: + sequence (List([SequenceResidues]): list of residues + chain_code (str): the chain code [defaults to A] only residues fromthis chain will be converted + + Returns List[str]: + the sequence as a list of 3 letter residues + + """ + return [ + residue.residue_name for residue in sequence if residue.chain_code == chain_code + ] + + +def frame_to_chains(sequence_frame: Saveframe) -> List[str]: + """ + given a list of nef molecular systems list the chains found + + :param sequence_frames: list of nef molecular system save frames + :return: a list of chain_codes + """ + + chains = set() + for loop in sequence_frame.loop_dict.values(): + data_frame = loop_to_dataframe(loop) + chains.update(data_frame[NEF_CHAIN_CODE].unique()) + + if NEF_UNKNOWN in chains: + chains.remove(NEF_UNKNOWN) + + chains = sorted(chains) + + return chains + + +def count_residues(sequence_frame: Saveframe, chain_code: str) -> Dict[str, int]: + """ + given a nef molecular system list and a chain list the number of residues + + :param sequence_frames: list of nef molecular system save frames + :return: a list of chain_codes + """ + + result = {} + sequence_dataframe = loop_to_dataframe(list(sequence_frame.loop_dict.values())[0]) + + rows_with_chain = sequence_dataframe[sequence_dataframe.chain_code == chain_code] + residues = rows_with_chain.residue_name.unique() + + for residue in sorted(residues): + residue_selector = rows_with_chain.residue_name == residue + count = rows_with_chain.residue_name[residue_selector].count() + result[residue] = count + + return result + + +def get_sequence_or_exit() -> Optional[List[SequenceResidue]]: + """ + read a sequence from a nef molecular system on stdin or exit + + :return: a list of parsed residues and chains, in the order they were read + """ + try: + result = get_sequence() + except Exception as e: + msg = "couldn't read sequence from nef input stream, either no stream or no nef molecular system frame" + exit_error(msg, e) + + if len(result) == 0: + msg = "no sequence read from nef molecular system in nef input stream" + exit_error(msg) + + return result + + +def get_sequence() -> List[SequenceResidue]: + """ + read a sequence from a nef molecular system on stdin and return a list of residues + + can raise Exceptions + + :return:a list of parsed residues and chains, in the order they were read + """ + + result = [] + + if sys.stdin.isatty(): + exit_error( + "trying to read sequence from stdin but stdin is not a stream [did you forget to add a nef file to your " + "pipeline?]" + ) + + if running_in_pycharm(): + exit_error( + "reading from stdin doesn't work in pycharm debug environment as there is no shell..." 
+ ) + + stream = cached_stdin() + + if stream: + text = "".join(stream) + + entry = Entry.from_string(text) + + if entry is not None: + frames = entry.get_saveframes_by_category("nef_molecular_system") + + if frames: + result = sequence_from_frame(frames[0]) + + return result + + +def sequence_from_frame(frame: Saveframe) -> List[SequenceResidue]: + + """ + read sequences from a nef molecular system save frame + can raise Exceptions + + :param frame: the save frame to read residues from, must have category nef molecular system + :return: a list of parsed residues and chains, in the order they were read + """ + residues = OrderedSet() + + if frame.category != "nef_molecular_system": + raise Exception( + f"sequences can only be read from nef molecular system frames, the category of the provided frame " + f"was {frame.category}" + ) + + loop = frame.loops[0] + + chain_code_index = loop.tag_index("chain_code") + sequence_code_index = loop.tag_index("sequence_code") + residue_name_index = loop.tag_index("residue_name") + linking_index = loop.tag_index("linking") + + for line in loop: + chain_code = line[chain_code_index] + sequence_code = line[sequence_code_index] + residue_name = line[residue_name_index] + linking = ( + Linking[line[linking_index].upper()] + if line[linking_index] != NEF_UNKNOWN + else None + ) + residue = SequenceResidue( + chain_code=chain_code, + sequence_code=sequence_code, + residue_name=residue_name, + linking=linking, + ) + residues.append(residue) + + return list(residues) + + +def sequence_3let_to_res( + sequence_3_let: List[str], chain_code: str, start: int = 1 +) -> List[SequenceResidue]: + """ + convert a list of 3 letter residue names to a list of sequence residues + :param sequence_3_let: 3 letter names + :param chain_code: chain code to use + :param start: start of the chain [default is 1] + :return: a list of sequeence residues + """ + result = set() + for res_number, residue_name in enumerate(sequence_3_let, start=start): + result.add(SequenceResidue(chain_code, res_number, residue_name)) + + return list(result) + + +def get_chain_starts(residues: List[SequenceResidue]) -> Dict[str, int]: + """ + from a list of residues get the lowest residue number for each chain ignoring any sequence codes that + can't be convetrted to an integer + + :param residues: a list of residues from one ore more chains + :return: a dictionary of chain starts by chain_code + """ + + chain_residue_numbers = {} + + for residue in residues: + + if is_int(residue_number := residue.sequence_code): + chain_residue_numbers.setdefault(residue.chain_code, []).append( + residue_number + ) + + return { + chain_code: min(residue_numbers) + for chain_code, residue_numbers in chain_residue_numbers.items() + } + + +def offset_chain_residues( + residues: List[SequenceResidue], chain_residue_offsets: Dict[str, int] +) -> List[SequenceResidue]: + """ + take a list of residues and offset residue sequence_codes by any offsets in chains if the + sequence_code can be converted to an integer and the chain_code matches + + :param residues: a list of residues from one ore more chains + :return: a list of residues + """ + + result = [] + for residue in residues: + if residue.chain_code in chain_residue_offsets and is_int( + residue.sequence_code + ): + offset = chain_residue_offsets[residue.chain_code] + result.append( + replace(residue, sequence_code=residue.sequence_code + offset) + ) + else: + result.append(residue) + + return result + + +def make_chunked_sequence_1let( + sequence_1_let: List[str], 
sub_chunk: int = 10, line_length: int = 100
+) -> List[str]:
+    """
+    convert a list of strings into a chunked list of strings with (by default) every 100th string being
+    separated by a new line [line_length] and sub sequences of strings being separated by a space
+    :param sequence_1_let: a list of strings typically 1 single letter
+    :param sub_chunk: how often to separate strings by a space
+    :param line_length: how many strings need to be seen before a new line is added
+    :return: a list of formatted strings one per line
+    """
+
+    rows = chunks(sequence_1_let, line_length)
+
+    row_strings = []
+    for row in rows:
+        row_chunks = list(chunks(row, sub_chunk))
+        row_string = ["".join(chunk) for chunk in row_chunks]
+        row_strings.append(" ".join(row_string))
+
+    return row_strings
diff --git a/src/nef_pipelines/lib/shift_lib.py b/src/nef_pipelines/lib/shift_lib.py
new file mode 100644
index 0000000..76c5c1b
--- /dev/null
+++ b/src/nef_pipelines/lib/shift_lib.py
@@ -0,0 +1,192 @@
+import dataclasses
+from typing import Dict, List
+
+from pynmrstar import Loop, Saveframe
+
+from nef_pipelines.lib.nef_lib import UNUSED, loop_row_namespace_iter
+from nef_pipelines.lib.structures import (
+    AtomLabel,
+    SequenceResidue,
+    ShiftData,
+    ShiftList,
+)
+
+
+def nef_frames_to_shifts(frames: List[Saveframe]) -> List[ShiftData]:
+    """
+    convert the rows of the chemical shift loops in a list of saveframes to ShiftData
+
+    :param frames: the saveframes to read shifts from
+    :return: a list of ShiftData
+    """
+    shifts = []
+
+    residue_field_names = [field.name for field in dataclasses.fields(SequenceResidue)]
+    atom_field_names = [
+        field.name for field in dataclasses.fields(AtomLabel) if field.name != "residue"
+    ]
+    for frame in frames:
+        for loop in frame:
+            for row in loop_row_namespace_iter(loop):
+
+                residue_fields = {
+                    name: value
+                    for name, value in vars(row).items()
+                    if name in residue_field_names
+                }
+                atom_fields = {
+                    name: value
+                    for name, value in vars(row).items()
+                    if name in atom_field_names
+                }
+                residue = SequenceResidue(**residue_fields)
+                label = AtomLabel(residue, **atom_fields)
+                shifts.append(ShiftData(label, row.value, row.value_uncertainty))
+
+    return shifts
+
+
+def shifts_to_nef_frame(shift_list: ShiftList, frame_name: str) -> Saveframe:
+    """
+    build a nef_chemical_shift_list saveframe from a ShiftList
+
+    :param shift_list: the shifts to write
+    :param frame_name: the name of the new frame
+    :return: the new saveframe
+    """
+    category = "nef_chemical_shift_list"
+
+    frame_code = f"{category}_{frame_name}"
+
+    frame = Saveframe.from_scratch(frame_code, category)
+
+    frame.add_tag("sf_category", category)
+    frame.add_tag("sf_framecode", frame_code)
+
+    loop = Loop.from_scratch()
+    frame.add_loop(loop)
+
+    tags = (
+        "chain_code",
+        "sequence_code",
+        "residue_name",
+        "atom_name",
+        "value",
+        "value_uncertainty",
+        "element",
+        "isotope_number",
+    )
+
+    loop.set_category("nef_chemical_shift")
+    loop.add_tag(tags)
+
+    for shift in shift_list.shifts:
+
+        loop.add_data_by_tag("chain_code", shift.atom.residue.chain_code)
+        loop.add_data_by_tag("sequence_code", shift.atom.residue.sequence_code)
+        loop.add_data_by_tag("residue_name", shift.atom.residue.residue_name)
+        loop.add_data_by_tag("atom_name", shift.atom.atom_name)
+        loop.add_data_by_tag("value", shift.shift)
+        if shift.error is not None:
+            loop.add_data_by_tag("value_uncertainty", shift.error)
+        else:
+            loop.add_data_by_tag("value_uncertainty", UNUSED)
+        loop.add_data_by_tag("element", UNUSED)
+        loop.add_data_by_tag("isotope_number", UNUSED)
+
+    return frame
+
+
+def collapse_common_shifts(shift_list: ShiftList) -> ShiftList:
+    """
+    replace entries in the shift list which are in the same residue and are related by symmetry
+    and have the same shift by the reduced form e.g.
+    HA1 1.000 HA2 1.000 -> HA* 1.000
+    :param shift_list: shift list to collapse
+    :return: collapsed chemical shifts
+    """
+
+    result = []
+    by_residue = _cluster_shifts_by_residue(shift_list)
+    for residue_shifts in by_residue.values():
+        result.extend(_cluster_by_shift(residue_shifts))
+
+    return ShiftList(result)
+
+
+def _cluster_shifts_by_residue(
+    shift_list: ShiftList,
+) -> Dict[SequenceResidue, List[ShiftData]]:
+    """
+    put all the shifts from the same residue in an entry in a dictionary keyed by residue
+
+    :param shift_list: the ShiftList to cluster
+    :return: the dictionary of shifts with the same residue
+    """
+    by_residue = {}
+    for shift in shift_list.shifts:
+        atom = shift.atom
+        residue = SequenceResidue(
+            atom.residue.chain_code, atom.residue.sequence_code, atom.residue.residue_name
+        )
+        by_residue.setdefault(residue, []).append(shift)
+
+    return by_residue
+
+
+def _cluster_by_shift(shifts: List[ShiftData]) -> List[ShiftData]:
+    """
+    build a list of lists where each child list contains ShiftData entries
+    which all have the same chemical shifts
+    :param shifts: a list of shifts to cluster
+    :return: ShiftData clustered by shifts
+    """
+    shift_map = {}
+
+    for shift in shifts:
+        shift_map.setdefault(shift.shift, []).append(shift)
+
+    result = []
+    for entry in shift_map.values():
+
+        if len(entry) > 1:
+            result.extend(_collapse_cluster(entry))
+        else:
+            result.append(entry[0])
+
+    return result
+
+
+def _collapse_cluster(shifts: List[ShiftData]) -> List[ShiftData]:
+    """
+    collapse a cluster of atoms with the same shift which are symmetry related (currently in the nef style)
+    HA1 / HA2 -> HA*, HG11, HG12, HG13, HG21, HG22, HG23 -> HG* [not HG**] Note all symmetry related shifts
+    should have the same shift else information will be lost
+
+    :param shifts: a list of shifts to cluster
+    :return: the clustered shifts as a new list
+    """
+
+    by_stem = {}
+    unstemmed = []
+    for shift in shifts:
+        last_character = shift.atom.atom_name[-1]
+        if last_character.isnumeric() or last_character == "*":
+            by_stem.setdefault(shift.atom.atom_name[:-1], []).append(shift)
+        else:
+            # atoms whose names have no numeric or * suffix can't be collapsed,
+            # pass them through unchanged rather than dropping them
+            unstemmed.append(shift)
+
+    result = list(unstemmed)
+    for stem, stem_shifts in by_stem.items():
+        if len(stem_shifts) > 1:
+            first_shift = stem_shifts[0]
+            first_atom = first_shift.atom
+
+            residue = SequenceResidue(
+                first_atom.residue.chain_code,
+                first_atom.residue.sequence_code,
+                first_atom.residue.residue_name,
+            )
+            new_atom = AtomLabel(residue, f"{stem}*")
+            new_shift = ShiftData(new_atom, first_shift.shift, first_shift.error)
+
+            result.append(new_shift)
+        else:
+            result.append(stem_shifts[0])
+
+    if len(result) != len(shifts):
+        result = _collapse_cluster(result)
+
+    return result
diff --git a/src/nef_pipelines/lib/structures.py b/src/nef_pipelines/lib/structures.py
new file mode 100644
index 0000000..967f450
--- /dev/null
+++ b/src/nef_pipelines/lib/structures.py
@@ -0,0 +1,130 @@
+from dataclasses import dataclass
+from enum import auto
+from typing import Dict, List, Optional, Union
+
+from strenum import StrEnum
+
+
+class Linking(StrEnum):
+    START = auto()
+    MIDDLE = auto()
+    END = auto()
+
+
+@dataclass(frozen=True, order=True)
+class SequenceResidue:
+    chain_code: str
+    sequence_code: Union[int, str]
+    residue_name: str
+    is_cis: bool = False
+    linking: Optional[Linking] = None
+    variant: Optional[str] = None
+
+
+# should contain a residue and have constructors?
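+# a hedged illustration (example values invented, not from the original code):
+# an AtomLabel pairs a SequenceResidue with an atom name, so the alpha proton
+# of alanine 2 in chain A would be built roughly as
+#
+#     residue = SequenceResidue(chain_code="A", sequence_code=2, residue_name="ALA")
+#     label = AtomLabel(residue, "HA")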
+@dataclass(frozen=True, order=True)
+class AtomLabel:
+    residue: SequenceResidue
+    atom_name: str
+
+
+@dataclass
+class PeakAxis:
+    atom_labels: List[AtomLabel]
+    ppm: float
+    merit: str
+    # comment: str
+
+
+@dataclass
+class PeakValues:
+    serial: int
+
+    height: Optional[float] = None
+    height_uncertainty: Optional[float] = None
+
+    volume: Optional[float] = None
+    volume_uncertainty: Optional[float] = None
+
+    deleted: Optional[bool] = False
+    comment: Optional[str] = ""
+
+    width: Optional[float] = None  # HWHH ppm
+    # bound: float
+
+    # merit: Optional[str] = None
+
+    # flag0: str
+
+
+# assignment has a tuple of dimensions
+
+
+@dataclass
+class Assignments:
+    assignments: Dict[str, List[AtomLabel]]
+
+
+@dataclass
+class Peak:
+    id: int
+    values: PeakValues
+
+    # move these to axis_values?
+    positions: Dict[str, float]
+
+    # assignment has a list of one or more assignments
+    # each Assignment will have one value for each axis, this may be either
+    # 0. a list with no AtomLabels - unassigned
+    # 1. a list with a single AtomLabel - this axis is definitively assigned
+    # 2. a list with multiple AtomLabels - this axis has multiple putative assignments
+    # Note if there are multiple unique assignments each of these should be a top level
+    # assignment of the peak
+    assignments: List[Assignments]
+
+    position_uncertainties: Optional[Dict[str, float]] = None
+
+
+@dataclass
+class PeakListData:
+    num_axis: int
+    axis_labels: List[str]
+    # isotopes: List[str]
+    data_set: str
+    sweep_widths: List[float]
+    spectrometer_frequencies: List[float]
+
+
+# TODO: are axes indexed by names or by integer...
+@dataclass
+class PeakList:
+    peak_list_data: PeakListData
+    peaks: List[Dict[Union[int, str], Peak]]
+
+
+@dataclass
+class LineInfo:
+    file_name: str
+    line_no: int
+    line: str
+
+
+@dataclass
+class ShiftData:
+    atom: AtomLabel
+    shift: float
+    error: Optional[float] = None
+
+
+@dataclass
+class ShiftList:
+    shifts: List[ShiftData]
+
+
+@dataclass(order=True)
+class RdcRestraint:
+    atom_1: AtomLabel
+    atom_2: AtomLabel
+    rdc: float
+    rdc_error: float
+    weight: Optional[float] = None
diff --git a/src/nef_pipelines/lib/test_lib.py b/src/nef_pipelines/lib/test_lib.py
new file mode 100644
index 0000000..7c8072e
--- /dev/null
+++ b/src/nef_pipelines/lib/test_lib.py
@@ -0,0 +1,264 @@
+import sys
+import traceback
+from fnmatch import fnmatch
+from io import StringIO
+from itertools import zip_longest
+from pathlib import Path
+from typing import IO, AnyStr, List, Optional
+
+from click.testing import Result
+from pynmrstar import Entry
+from typer import Typer
+from typer.testing import CliRunner
+
+
+def run_and_read_pytest(args):
+
+    from pytest import main
+
+    original_output = sys.stdout
+    original_error = sys.stderr
+    sys.stdout = StringIO()
+    sys.stderr = StringIO()
+
+    retcode = main(args)
+
+    output = sys.stdout.getvalue()
+    error_output = sys.stderr.getvalue()
+
+    sys.stdout.close()
+    sys.stderr.close()
+    sys.stdout = original_output
+    sys.stderr = original_error
+
+    return retcode, output, error_output
+
+
+def _split_test_spec(spec):
+    spec_parts = spec.split("::")
+    spec_parts[0] = Path(spec_parts[0])
+
+    return spec_parts
+
+
+def select_matching_tests(tests, selectors):
+    results = []
+
+    for selector in selectors:
+
+        selector_parts = _split_test_spec(selector)
+        num_selector_parts = len(selector_parts)
+
+        if num_selector_parts == 1:
+            selector = f"*::{selector_parts[0]}"
+            selector_parts = _split_test_spec(selector)
+            num_selector_parts = len(selector_parts)
+
+        if 
num_selector_parts == 2 and selector_parts[0] == Path(""): + selector = f"*::{selector_parts[1]}" + selector_parts = _split_test_spec(selector) + num_selector_parts = len(selector_parts) + + if num_selector_parts == 2 and selector_parts[1] == "": + selector = f"{selector_parts[0]}::*" + selector_parts = _split_test_spec(selector) + num_selector_parts = len(selector_parts) + + selector_path_parts = selector_parts[0].parts + num_selector_path_parts = len(selector_path_parts) + + for test in tests: + + # here we ensure we are looking for a .py file... + if not selector_path_parts[-1].endswith("py"): + selector_path_parts = list(selector_path_parts) + selector_path_parts[-1] = f"{selector_path_parts[-1]}.py" + selector_path_parts = tuple(selector_path_parts) + + test_parts = _split_test_spec(test) + num_test_parts = len(test_parts) + test_path_parts = test_parts[0].parts + num_test_path_parts = len(test_path_parts) + + paths_equal = False + if num_selector_path_parts <= num_test_path_parts: + selector_path = selector_path_parts[-num_selector_path_parts:] + test_path = test_path_parts[-num_selector_path_parts:] + paths_equal = selector_path == test_path + + path = str(test_parts[0]) + path_test = str(selector_parts[0]) + + if not paths_equal: + paths_equal = fnmatch(path, path_test) + + test_names_equal = False + if (num_selector_parts == 2) and (num_test_parts == 2): + test_names_equal = fnmatch(test_parts[-1], selector_parts[-1]) + + if paths_equal and test_names_equal: + results.append(test) + return results + + +def assert_lines_match( + expected: str, reported: str, squash_spaces: bool = True, ignore_empty=True +): + """ + compare two multi line strings line by line with stripping and raise an assertion if they don't match + note: empty lines are ignoresd by default, and multiple spaces are squashed + Args: + expected (str): the expected string + reported (str): the input string + squash_spaces (bool): remove duplicate spaces before comparison + + Returns: + None + """ + lines_expected = expected.split("\n") + lines_reported = reported.split("\n") + + if ignore_empty: + lines_expected = [line for line in lines_expected if len(line.strip()) != 0] + lines_reported = [line for line in lines_reported if len(line.strip()) != 0] + + zip_lines = zip_longest(lines_expected, lines_reported, fillvalue="") + for i, (expected_line, reported_line) in enumerate(zip_lines): + + expected_line_stripped = expected_line.strip() + reported_line_stripped = reported_line.strip() + + if squash_spaces: + expected_line_stripped = " ".join(expected_line_stripped.split()) + reported_line_stripped = " ".join(reported_line_stripped.split()) + + if reported_line_stripped != expected_line_stripped: + + print(f"exp|{i}| |{expected_line_stripped}|") + print(f"rep|{i}| |{reported_line_stripped}|") + + assert reported_line_stripped == expected_line_stripped + + +def isolate_frame(target: str, name: str) -> Optional[str]: + """ + Extract one frame from a NEF file by name as a string + Args: + target (Entry): target NEF entry + name (str): name of the save frame + + Returns: + Optional[str]: the entry or a None if not found + """ + + entry = Entry.from_string(target) + entry = str(entry.get_saveframe_by_name(name)) + + return entry + + +def path_in_parent_directory(root: str, target: str) -> str: + """ + given a root and a relative path find the relative file path + + Args: + root (str): root of the path + target (str): the relative path from the root + + Returns: + str: the target paths as a string + """ + parent_path = 
Path(root).parent
+    return str(Path(parent_path, target).absolute())
+
+
+def root_path(initial_path: str):
+    """given a path work up the directory structure till you find the
+    directory containing the nef executable
+
+    initial_path (str): the path to start searching from
+    """
+
+    target = Path(initial_path)
+
+    belt_and_braces = 100  # noqa: F841 flake8 wrongly flags this as unused
+    while (
+        not (Path(target.root) == target)
+        and not (target / "src" / "nef" / "main.py").is_file()
+    ):
+        target = target.parent
+        belt_and_braces -= 1
+        if belt_and_braces < 0:
+            msg = f"""\
+                Error: while searching for the root of the path {initial_path} I walked up 100
+                directories, this looks like a bug!
+            """
+            raise Exception(msg)
+
+    return target
+
+
+def path_in_test_data(root: str, file_name: str, local: bool = True) -> str:
+    """
+    given a root and a file name find the path to the file
+    in the parent's test_data directory
+
+    Args:
+        root (str): root of the path
+        file_name (str): the name of the file
+        local (bool): whether to use the directory of the tool
+                      or read from the global test data
+
+    Returns:
+        str: the target path as a string
+    """
+
+    if not local:
+        test_data = root_path(root) / "tests" / "test_data"
+    else:
+        test_data = path_in_parent_directory(root, "test_data")
+
+    return str(Path(test_data, file_name).absolute())
+
+
+def run_and_report(
+    typer_app: Typer,
+    args: List[str],
+    input: IO[AnyStr] = None,
+    expected_exit_code: int = 0,
+) -> Result:
+    """
+    run a typer app in the typer test harness and report exceptions and stdout to screen if there is an error
+    :param typer_app: the typer app hosting the application
+    :param args: command line arguments for the app
+    :param input: an input stream if required
+    :param expected_exit_code: what exit code to expect if the app is expected to end with an error
+    :return: results object
+    """
+
+    runner = CliRunner()
+    result = runner.invoke(typer_app, args, input=input)
+
+    if result.exit_code != expected_exit_code:
+        print("\n", "-" * 40, "-stdout-", "-" * 40)
+        print(result.stdout)
+        if result.exception:
+            print("-" * 40, "exception", "-" * 40)
+            formatted = list(traceback.TracebackException(*result.exc_info).format())
+
+            # this is a hack, I would love to have a better solution!
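+            # what the hack appears to do: when the app ends via sys.exit the
+            # captured traceback finishes with a chained SystemExit, so the
+            # lines from the "During handling of the above exception" marker
+            # onwards are trimmed to leave only the original error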
+ if "SystemExit" in formatted[-1]: + for i, line in enumerate(reversed(formatted)): + if ( + "During handling of the above exception, another exception occurred" + in line + ): + break + formatted = formatted[: -i - 2] + print("".join(formatted)) + + print("-" * 40, "-" * 9, "-" * 40) + + assert result.exit_code == expected_exit_code + + return result diff --git a/src/nef_pipelines/lib/translation/__init__.py b/src/nef_pipelines/lib/translation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nef_pipelines/lib/typer_utils.py b/src/nef_pipelines/lib/typer_utils.py new file mode 100644 index 0000000..3094c48 --- /dev/null +++ b/src/nef_pipelines/lib/typer_utils.py @@ -0,0 +1,10 @@ +from argparse import Namespace +from inspect import currentframe, getargvalues, getouterframes + + +def get_args(): + frame = currentframe() + outer_frames = getouterframes(frame) + caller_frame = outer_frames[1][0] + args = getargvalues(caller_frame).locals + return Namespace(**args) diff --git a/src/nef_pipelines/lib/util.py b/src/nef_pipelines/lib/util.py new file mode 100644 index 0000000..eadc17d --- /dev/null +++ b/src/nef_pipelines/lib/util.py @@ -0,0 +1,492 @@ +import io +import os +import sys +import traceback +from argparse import Namespace +from pathlib import Path +from typing import Dict, Iterator, List, Optional, TextIO, TypeVar, Union + +from cacheable_iter import iter_cache +from pynmrstar import Entry, Loop, Saveframe + +from nef_pipelines.lib.constants import ( + EXIT_ERROR, + NEF_META_DATA, + NEF_PIPELINES, + NEF_PIPELINES_VERSION, + NEF_UNKNOWN, +) +from nef_pipelines.lib.header_lib import ( + create_header_frame, + get_creation_time, + get_uuid, +) +from nef_pipelines.lib.structures import LineInfo + + +def _get_loop_by_category_or_none(frame: Saveframe, category: str) -> Loop: + + result = None + if f"_{category}" in frame.loop_dict.keys(): + result = frame.get_loop(category) + + return result + + +def loop_add_data_tag_dict(loop: Loop, data: Dict[str, object]) -> None: + tagged_data = [] + for category_tag in loop.get_tag_names(): + _, tag = category_tag.split(".") + if tag in data: + tagged_data.append(data[tag]) + else: + tagged_data.append(NEF_UNKNOWN) + + loop.add_data(tagged_data) + + +# TODO: there is no space for scrript arguments! 
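+# a sketch of the intended behaviour (example values invented): if the entry
+# already has a nef_nmr_meta_data frame recording program_name "tool_a", then
+#
+#     fixup_metadata(entry, "tool_b", "0.2", "tool_b.py")
+#
+# installs tool_b as the current program and pushes the tool_a details onto
+# the nef_run_history loop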
+def fixup_metadata(entry: Entry, name: str, version: str, script: str):
+    """
+    add a new entry to a nef metadata frame
+
+    Args:
+        entry (Entry): the target entry
+        name (str): the name of the program used
+        version (str): the version of the program used
+        script (str): the script used
+
+    Returns:
+        Entry: the modified entry
+    """
+
+    if entry is not None and NEF_META_DATA in entry.category_list:
+        meta_frame = entry[NEF_META_DATA]
+
+        last_program = meta_frame.get_tag("program_name")[0]
+        last_program_version = meta_frame.get_tag("program_version")[0]
+        last_script_name = meta_frame.get_tag("script_name")[0]
+
+        meta_frame.add_tag("program_name", name, update=True)
+        meta_frame.add_tag("program_version", version, update=True)
+        meta_frame.add_tag("script_name", script, update=True)
+        creation_time = get_creation_time()
+        meta_frame.add_tag("creation_date", creation_time, update=True)
+        uuid = get_uuid(NEF_PIPELINES, creation_time)
+        meta_frame.add_tag("uuid", uuid, update=True)
+
+        run_history_loop = _get_loop_by_category_or_none(meta_frame, "nef_run_history")
+        if run_history_loop is not None:
+            if run_history_loop.get_tag("run_number"):
+                run_number_tags = run_history_loop.get_tag("run_number")
+                run_numbers = [int(run_number) for run_number in run_number_tags]
+                last_run_number = max(run_numbers)
+                next_run_number = last_run_number + 1
+            else:
+                next_run_number = 1
+
+            data = {
+                "run_number": next_run_number,
+                "program_name": last_program,
+                "program_version": last_program_version,
+                "script_name": last_script_name,
+            }
+            loop_add_data_tag_dict(run_history_loop, data)
+        else:
+            run_history_loop = Loop.from_scratch("nef_run_history")
+            run_history_loop.add_tag(
+                ["run_number", "program_name", "program_version", "script_name"]
+            )
+            run_history_loop.add_data(["."] * 4)
+            meta_frame.add_loop(run_history_loop)
+    else:
+        header = create_header_frame(NEF_PIPELINES, NEF_PIPELINES_VERSION, script)
+        entry.add_saveframe(header)
+
+
+class StringIteratorIO(io.TextIOBase):
+
+    # https://stackoverflow.com/questions/12593576/adapt-an-iterator-to-behave-like-a-file-like-object-in-python
+    # second answer with more votes!
+
+    def __init__(self, iter):
+        self._iter = iter
+        self._left = ""
+
+    def readable(self):
+        return True
+
+    def _read1(self, n=None):
+        while not self._left:
+            try:
+                self._left = next(self._iter)
+            except StopIteration:
+                break
+        ret = self._left[:n]
+        self._left = self._left[len(ret) :]
+        return ret
+
+    def read(self, n=None):
+        line = []
+        if n is None or n < 0:
+            while True:
+                m = self._read1()
+                if not m:
+                    break
+                line.append(m)
+        else:
+            while n > 0:
+                m = self._read1(n)
+                if not m:
+                    break
+                n -= len(m)
+                line.append(m)
+        return "".join(line)
+
+    def readline(self):
+        line = []
+        while True:
+            i = self._left.find("\n")
+            if i == -1:
+                line.append(self._left)
+                try:
+                    self._left = next(self._iter)
+                except StopIteration:
+                    self._left = ""
+                    break
+            else:
+                line.append(self._left[: i + 1])
+                self._left = self._left[i + 1 :]
+                break
+        return "".join(line)
+
+
+# https://stackoverflow.com/questions/6657820/how-to-convert-an-iterable-to-a-stream/20260030#20260030
+def iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
+    """
+    Lets you use an iterable (e.g. a generator) that yields strings as a read-only
+    input stream.
+
+    The stream implements Python 3's newer I/O API (available in Python 2's io module).
+    For efficiency, the stream is buffered.
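+
+    A minimal usage sketch (illustrative, not from the original docs; note the
+    adapter as implemented here is string rather than byte based):
+
+        stream = iterable_to_stream(iter(["line one\n", "line two\n"]))
+        assert stream.readline() == "line one\n"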
+ """ + + return StringIteratorIO(iterable) + + +def running_in_pycharm(): + return "PYCHARM_HOSTED" in os.environ + + +def get_pipe_file(args: Namespace) -> Optional[TextIO]: + """ + get an input on stdin or from an argument called pipe + + Args: + args (Namespace): command line arguments + + Returns: + TextIO: an input stream or None + + """ + + result = None + if "pipe" in args and args.pipe: + result = iter(cached_file_stream(args.pipe)) + # pycharm doesn't treat stdstreams correcly and hangs + elif not sys.stdin.isatty() and not running_in_pycharm(): + result = cached_stdin() + + return StringIteratorIO(result) if result else None + + +def get_pipe_file_or_exit(args: Namespace) -> Optional[TextIO]: + + try: + result = get_pipe_file(args) + except Exception as e: + exit_error("couldn't read from stdin or -pipe file", e) + + if isinstance(result, StringIteratorIO): + + result = "\n".join(list(result)) + + if len(result) == 0: + result = None + else: + result = io.StringIO(result) + + if result is None: + exit_error("couldn't read from stdin and no -pipe in command line arguments") + + return result + + +@iter_cache +def cached_stdin(): + return sys.stdin + + +class cached_file_stream: + def __init__(self, file_name): + self._file_name = file_name + + def __enter__(self): + return _cached_file_stream(self._file_name) + + def __exit__(self, *args): + pass + + def __iter__(self): + return _cached_file_stream(self._file_name) + + +@iter_cache +def _cached_file_stream(file_name): + try: + with open(file_name, "r") as lines: + result = lines.readlines() + except IOError as e: + exit_error(f"couldn't open stream {file_name} because {e}") + return result + + +def script_name(file: str) -> Path: + """ + get the name of the script + + Args: + file (str): the name of the file + + Returns: + Path: path to the file + """ + return Path(file).name + + +def exit_error(msg, exception=None): + """ + print an error message and exit error + + Args: + msg: the message + """ + + if exception is not None: + exc_info = sys.exc_info() + traceback.print_exception(*exc_info, file=sys.stderr) + print(file=sys.stderr) + + print(f"ERROR: {msg}", file=sys.stderr) + print("exiting...", file=sys.stderr) + sys.exit(EXIT_ERROR) + + +def process_stream_and_add_frames( + frames: List[Saveframe], input_args: Namespace +) -> Entry: + """ + take a set of save frames and either add them to a stream from stdin / a pipe-file or create + a new stream with a proper NEF metadata header + + Args: + frames: a set of save frames + input_args: command line arguments for an entry_name and pipe file source + + Returns: + a new entry containing the frames + """ + + try: + stream = get_pipe_file(input_args) + except Exception as e: + exit_error(f"failed to load pipe file because {e}") + + if stream is not None: + lines = "".join(stream.readlines()) + else: + lines = "" + + if len(lines.strip()) == 0: + stream = None + + new_entry = ( + Entry.from_string(lines) + if stream + else Entry.from_scratch(input_args.entry_name) + ) + + fixup_metadata( + new_entry, NEF_PIPELINES, NEF_PIPELINES_VERSION, script_name(__file__) + ) + + # TODO: check if frame exists + for frame in frames: + new_entry.add_saveframe(frame) + + return new_entry + + +def is_int(value: str) -> bool: + """ + Check if a string is an integer + Args: + value (str): the putative integer + + Returns bool: + true is if its an integer + + """ + result = False + try: + int(value) + result = True + except ValueError: + pass + + return result + + +def is_float(value: str) -> bool: + 
""" + Check if a string is an float + Args: + value (str): the putative float + + Returns bool: + true is if its an integer + + """ + result = False + try: + float(value) + result = True + except ValueError: + pass + + return result + + +T = TypeVar("T") + + +# https://stackoverflow.com/questions/312443/how-do-i-split-a-list-into-equally-sized-chunks +def chunks(input: Iterator[T], n: int) -> Iterator[List[T]]: + """Yield successive n-sized chunks from lst. + lst: the list to chunk + n: the chunk size + + Returns: + an iterator of chunks from lst of length T + + """ + lst = list(input) + for i in range(0, len(lst), n): + yield lst[i : i + n] + + +def _build_int_float_error_message(message, line_info, field, format): + messages = [] + if message: + messages.append(message) + if line_info: + message = f""" + couldn't convert {line_info.line} to format + + at {line_info.line_no + 1} in f{line_info.file_name} + + line value was + + {line_info.line} + + field was + + {field} + """ + messages.append(message) + message = "\n\n".join(messages) + return message + + +def read_float_or_exit( + string: str, line_info: LineInfo = None, field="unknown", message=None +) -> float: + """ + convert a string to an float or exit with error including line information + + Args: + string: string to parse + line_info: the line the string came from + field: the name of the field defininbg the string (ignored if there is no line info) + message: any other messages to show + + Returns: + a float + """ + + format = "float" + try: + result = float(string) + except Exception: + message = _build_int_float_error_message(message, line_info, field, format) + + exit_error(message) + + return result + + +def read_integer_or_exit( + string: str, line_info: LineInfo = None, field="unknown", message=None +) -> int: + """ + convert a string to an int or exit with error including line information + + Args: + string: string to parse + line_info: the line the string came from + field: the name of the field defining the string (ignored if there is no line info) + message: any other messages to show + + Returns: + an integer + """ + + format = "integer" + try: + result = int(string) + except Exception: + + message = _build_int_float_error_message(message, line_info, field, format) + + exit_error(message) + + return result + + +# TODO test this! +def parse_comma_separated_options( + lists: Union[List[List[str]], List[str], str] +) -> List[str]: + """ + Take a mixed list of strings or strings that can be parsed as a comma separated list of strings + and make a list of all the items + + e.g. 
+
+    test = [['1','2','3'], 'a,b,c', ['4','5']]
+
+    gives
+
+    ['1','2','3','a','b','c','4','5']
+
+    :param lists: a mixture of lists of strings or comma separated strings
+    :return: a list of items
+    """
+    result = []
+    for item in lists:
+        if isinstance(item, str):
+            item = item.strip(",")
+            result.extend(item.split(","))
+        else:
+            result.extend(item)
+
+    return result
diff --git a/src/nef_pipelines/main.py b/src/nef_pipelines/main.py
new file mode 100644
index 0000000..d4ddb62
--- /dev/null
+++ b/src/nef_pipelines/main.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# for pynmrstar to avoid WARNING:root:Loop with no data on line: xx
+import logging
+import sys
+import traceback
+from textwrap import dedent
+
+try:
+    import future  # noqa: F401
+except Exception:
+    pass
+
+MINIMUM_VERSION = (3, 7)
+if not sys.version_info >= MINIMUM_VERSION:
+    print(
+        "Error: minimum version required is %i.%i [and python 2 is not supported]"
+        % MINIMUM_VERSION,
+        file=sys.stderr,
+    )
+    print("exiting...", file=sys.stderr)
+    sys.exit(1)
+
+
+logging.getLogger().setLevel(logging.ERROR)
+
+EXIT_ERROR = 1
+
+
+def do_exit_error(msg, trace_back=True, exit_code=EXIT_ERROR):
+    msg = dedent(msg)
+    if trace_back:
+        traceback.print_exc()
+    print(msg, file=sys.stderr)
+    print("exiting...", file=sys.stderr)
+    sys.exit(exit_code)
+
+
+def main():
+    try:
+        import typer
+        from click import ClickException
+
+    except Exception:
+
+        msg = """\
+
+        Initialisation error: one of the core libraries [typer/click] is missing from your environment
+        please make sure they are installed and try again
+        exiting..."""
+        do_exit_error(msg)
+
+    try:
+        from nef_pipelines import nef_app
+
+        nef_app.app = typer.Typer(no_args_is_help=True)
+        app = nef_app.app  # noqa: F841
+
+    except Exception:
+        msg = """\
+
+        Initialisation error: failed to start the typer app, message the developer
+        exiting..."""
+        do_exit_error(msg)
+
+    try:
+        # import components which will self register, this could and will be automated
+        import nef_pipelines.tools.chains  # noqa: F401
+        import nef_pipelines.tools.frames  # noqa: F401
+        import nef_pipelines.tools.header  # noqa: F401
+        import nef_pipelines.tools.stream  # noqa: F401
+        import nef_pipelines.tools.test  # noqa: F401
+        import nef_pipelines.transcoders.fasta  # noqa: F401
+        import nef_pipelines.transcoders.mars  # noqa: F401
+        import nef_pipelines.transcoders.nmrpipe  # noqa: F401
+        import nef_pipelines.transcoders.nmrview  # noqa: F401
+        import nef_pipelines.transcoders.pales  # noqa: F401
+        import nef_pipelines.transcoders.pdb  # noqa: F401
+
+    except Exception:
+        msg = """\
+
+        Initialisation error: failed to load a plugin, remove the plugin or contact the developer
+        """
+
+        do_exit_error(msg)
+
+    try:
+
+        command = typer.main.get_command(nef_app.app)
+
+        command(standalone_mode=False)
+
+    except ClickException as e:
+        e.show()
+        do_exit_error(
+            f"inputs: {' '.join(sys.argv[1:])}", trace_back=False, exit_code=e.exit_code
+        )
+
+    except Exception as e:
+
+        msg = f"""\
+
+        Runtime error: failed to process the data using the plugin and commands, check your inputs or report a bug
+        inputs: {' '.join(sys.argv[1:])}
+        message: {e}
+        """
+
+        do_exit_error(msg)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/nef_pipelines/nef_app/__init__.py b/src/nef_pipelines/nef_app/__init__.py
new file mode 100644
index 0000000..21c0e7f
--- /dev/null
+++ b/src/nef_pipelines/nef_app/__init__.py
@@ -0,0 +1,3 @@
+# the root of the typer app hierarchy
+
+app = None
diff --git a/src/nef_pipelines/tools/chains/__init__.py b/src/nef_pipelines/tools/chains/__init__.py
new file mode 100644
index 0000000..1f0790f
--- /dev/null
+++ b/src/nef_pipelines/tools/chains/__init__.py
@@ -0,0 +1,16 @@
+import typer
+
+from nef_pipelines import nef_app
+
+chains_app = typer.Typer()
+
+
+if nef_app.app:
+    nef_app.app.add_typer(
+        chains_app, name="chains", help="- carry out operations on chains"
+    )
+
+    # import of specific importers must be after app creation to avoid circular imports
+    import nef_pipelines.tools.chains.clone  # noqa: F401
+    import nef_pipelines.tools.chains.list  # noqa: F401
+    import nef_pipelines.tools.chains.rename  # noqa: F401
diff --git a/src/nef_pipelines/tools/chains/clone.py b/src/nef_pipelines/tools/chains/clone.py
new file mode 100644
index 0000000..69374ff
--- /dev/null
+++ b/src/nef_pipelines/tools/chains/clone.py
@@ -0,0 +1,89 @@
+from dataclasses import replace
+from itertools import islice
+from typing import List
+
+import typer
+from pynmrstar import Entry
+from typer import Argument, Option
+
+from nef_pipelines.lib.sequence_lib import (
+    chain_code_iter,
+    frame_to_chains,
+    sequence_from_frame,
+    sequence_to_nef_frame,
+)
+from nef_pipelines.lib.util import (
+    exit_error,
+    get_pipe_file_or_exit,
+    parse_comma_separated_options,
+)
+from nef_pipelines.tools.chains import chains_app
+
+app = typer.Typer()
+
+
+# noinspection PyUnusedLocal
+@chains_app.command()
+def clone(
+    target: str = Argument("A", help="chain to clone"),
+    count: int = Argument(1, help="how many copies to make"),
+    chain_codes: List[str] = Option(
+        None,
+        "-c",
+        "--chains",
+        help="new chain codes to add, otherwise defaults to the next available ascii upper case letter,"
+        " can be called multiple times or be a comma separated list",
+    ),
+):
+    """- duplicate chains one or more times"""
+
+    if not count > 0:
+        exit_error(f"clone count must be > 0, I got: {count}")
+
+    chain_codes = parse_comma_separated_options(chain_codes)
+
+    lines = "".join(get_pipe_file_or_exit([]).readlines())
+
+    entry = Entry.from_string(lines)
+
+    molecular_system = entry.get_saveframes_by_category("nef_molecular_system")
+
+    if len(molecular_system) < 1:
+        exit_error("couldn't find a molecular system frame in the stream")
+
+    if len(molecular_system) > 1:
+        exit_error(
+            "found more than one molecular system frame, this is not a valid nef file!"
+        )
+
+    molecular_system = molecular_system[0]
+
+    sequence = sequence_from_frame(molecular_system)
+
+    # this should come from a sequence not a frame...
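+    # a possible refactoring sketch (sequence_to_chains is hypothetical and
+    # does not exist in this codebase): chains = sequence_to_chains(sequence)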
+    existing_chain_codes = frame_to_chains(molecular_system)
+
+    if target not in existing_chain_codes:
+        exit_error(
+            f"couldn't find target chain {target} in {', '.join(existing_chain_codes)}"
+        )
+
+    useable_chain_codes = chain_code_iter(chain_codes, [target, *existing_chain_codes])
+    new_chain_codes = islice(useable_chain_codes, count)
+
+    target_residues = [residue for residue in sequence if residue.chain_code == target]
+
+    for new_chain_code in new_chain_codes:
+        for residue in target_residues:
+            new_residue = replace(residue, chain_code=new_chain_code)
+            sequence.append(new_residue)
+
+    sequence = sorted(sequence)
+
+    molecular_system_frame = sequence_to_nef_frame(sequence)
+
+    entry.remove_saveframe("nef_molecular_system")
+
+    entry.add_saveframe(molecular_system_frame)
+
+    print(entry)
diff --git a/src/nef_pipelines/tools/chains/list.py b/src/nef_pipelines/tools/chains/list.py
new file mode 100644
index 0000000..f24c586
--- /dev/null
+++ b/src/nef_pipelines/tools/chains/list.py
@@ -0,0 +1,48 @@
+from argparse import Namespace
+from pathlib import Path
+
+import typer
+from pynmrstar import Entry
+from typer import Option
+
+from nef_pipelines.lib.sequence_lib import frame_to_chains
+from nef_pipelines.lib.util import get_pipe_file
+from nef_pipelines.tools.chains import chains_app
+
+app = typer.Typer()
+
+# TODO: it would be nice to put the chains with the first molecular system frame
+
+
+# noinspection PyUnusedLocal
+@chains_app.command()
+def list(
+    pipe: Path = typer.Option(
+        None,
+        metavar="|PIPE|",
+        help="pipe to read NEF data from, for testing [overrides stdin !use stdin instead!]",
+    ),
+    comment: bool = Option(False, "-c", "--comment", help="prepend comment to chains"),
+    verbose: bool = Option(False, "-v", "--verbose", help="print verbose info"),
+    stream: bool = Option(False, "-s", "--stream", help="stream file after comment"),
+):
+    """- list the chains in the molecular system"""
+    lines = "".join(get_pipe_file(Namespace(pipe=pipe)).readlines())
+    entry = Entry.from_string(lines)
+    sequence_frames = entry.get_saveframes_by_category("nef_molecular_system")
+
+    chains = []
+    if len(sequence_frames) > 0:
+        chains = frame_to_chains(sequence_frames[0])
+
+    result = " ".join(chains)
+    # count the chains before the name 'chains' is reused for the plural label
+    num_chains = len(chains)
+    chains = "chain" if num_chains == 1 else "chains"
+
+    verbose = f"{num_chains} {chains}: " if verbose else ""
+
+    comment = "# " if comment else ""
+
+    print(f"{comment}{verbose}{result}")
+
+    if stream:
+        print(lines)
diff --git a/src/nef_pipelines/tools/chains/rename.py b/src/nef_pipelines/tools/chains/rename.py
new file mode 100644
index 0000000..55dd4ab
--- /dev/null
+++ b/src/nef_pipelines/tools/chains/rename.py
@@ -0,0 +1,87 @@
+import sys
+from fnmatch import fnmatch
+from typing import List
+
+import typer
+from ordered_set import OrderedSet
+from pynmrstar import Entry
+from typer import Argument, Option
+
+from nef_pipelines.lib.util import chunks, get_pipe_file
+from nef_pipelines.tools.chains import chains_app
+
+app = typer.Typer()
+
+# TODO: it would be nice to put the chains with the first molecular system frame
+
+
+# noinspection PyUnusedLocal
+@chains_app.command()
+def rename(
+    old: str = Argument(..., help="old chain code"),
+    new: str = Argument(..., help="new chain code"),
+    comment: bool = Option(False, "--comment", help="prepend comment to chains"),
+    verbose: bool = Option(False, "-v", "--verbose", help="print verbose info"),
+    use_category: bool = Option(
+        False,
+        "-c",
+        "--category",
+        help="select frames to rename chains in by category rather than name",
+    ),
+    frames: List[str] = Option(
+        [],
+        "-f",
+        "--frame",
+        help="limit changes to a particular frame by name [or category if --category is set], note: wildcards are "
+        "allowed, repeated uses add more frames",
+    ),
+):
+    """- change the name of chains across one or multiple frames"""
+
+    lines = "".join(get_pipe_file([]).readlines())
+    entry = Entry.from_string(lines)
+
+    changes = 0
+    changed_frames = OrderedSet()
+    for save_frame in entry:
+        process_frame = True
+        for frame_selector in frames:
+
+            if use_category:
+                if not fnmatch(save_frame.category, f"*{frame_selector}*"):
+                    process_frame = False
+            else:
+                if not fnmatch(save_frame.name, f"*{frame_selector}*"):
+                    process_frame = False
+
+        if process_frame:
+            for loop in save_frame.loop_iterator():
+                for tag in loop.get_tag_names():
+                    tag_parts = tag.split(".")
+                    if tag_parts[-1].startswith("chain_code"):
+                        tag_values = loop[tag]
+                        for i, row in enumerate(tag_values):
+                            if row == old:
+                                tag_values[i] = new
+                                changes += 1
+                                changed_frames.add(save_frame.name)
+
+                        loop[tag] = tag_values
+
+    if verbose:
+        comment = "# " if comment else ""
+        out = sys.stderr if not comment else sys.stdout
+        if changes >= 1:
+            print(
+                f"{comment}rename chain: {changes} changes made in the following frames",
+                file=out,
+            )
+            for chunk in chunks(changed_frames, 5):
+                print(f'{comment}    {", ".join(chunk)}', file=out)
+
+        else:
+            print(f"{comment}rename chain: no changes made", file=out)
+        print(file=out)
+
+    print(entry)
diff --git a/src/nef_pipelines/tools/frames/__init__.py b/src/nef_pipelines/tools/frames/__init__.py
new file mode 100644
index 0000000..7534587
--- /dev/null
+++ b/src/nef_pipelines/tools/frames/__init__.py
@@ -0,0 +1,20 @@
+import typer
+
+import nef_pipelines
+from nef_pipelines import nef_app
+
+frames_app = typer.Typer()
+
+
+if nef_app.app:
+    nef_app.app.add_typer(
+        frames_app,
+        name="frames",
+        help="- carry out operations on the frames in nef files",
+    )
+
+    # import of specific importers must be after app creation to avoid circular imports
+    import nef_pipelines.tools.frames.delete  # noqa: F401
+    import nef_pipelines.tools.frames.insert  # noqa: F401
+    import nef_pipelines.tools.frames.list  # noqa: F401
+    import nef_pipelines.tools.frames.tabulate  # noqa: F401
diff --git a/src/nef_pipelines/tools/frames/delete.py b/src/nef_pipelines/tools/frames/delete.py
new file mode 100644
index 0000000..864bb88
--- /dev/null
+++ b/src/nef_pipelines/tools/frames/delete.py
@@ -0,0 +1,100 @@
+import inspect
+import sys
+from fnmatch import fnmatch
+from typing import List
+
+import typer
+from pynmrstar import Entry
+
+from nef_pipelines.lib.util import cached_stdin, exit_error, running_in_pycharm
+from nef_pipelines.tools.frames import frames_app
+
+UNDERSCORE = "_"
+
+parser = None
+
+
+# noinspection PyUnusedLocal
+@frames_app.command()
+def delete(
+    use_categories: bool = typer.Option(
+        False,
+        "-c",
+        "--category",
+        help="if selected use the category of the frame to select it for deletion rather than its name",
+    ),
+    exact: bool = typer.Option(
+        False, "-e", "--exact", help="don't treat name as a wild card"
+    ),
+    selectors: List[str] = typer.Argument(
+        ...,
+        help="a list of frames to delete by type or name, names can be wildcards, names have leading _'s removed and "
+        "surrounding back quotes ` removed",
+    ),
+):
+    """- delete frames in the current input by type or name"""
+
+    entry = _create_entry_from_stdin_or_exit(current_function())
+
+    to_delete = []
+    # a single pass over the frames; each frame is added to the deletion list
+    # at most once, however many selectors it matches
+    for frame in entry:
+        frame_full_name = frame.name
+        frame_category = frame.category
+        frame_name = frame_full_name[len(frame_category) :].lstrip("_").strip("`")
+
+        for selector in selectors:
+            if not exact:
+                selector = f"*{selector}*"
+
+            if use_categories:
+                if fnmatch(frame_category, selector):
+                    to_delete.append(frame)
+                    break
+            else:
+                if fnmatch(frame_name, selector):
+                    to_delete.append(frame)
+                    break
+
+    entry.remove_saveframe(to_delete)
+
+    print(entry)
+
+
+def current_function():
+
+    return inspect.stack()[1][3]
+
+
+def calling_function():
+
+    return inspect.stack()[2][3]
+
+
+def _create_entry_from_stdin_or_exit(command_name: str):
+
+    try:
+
+        if sys.stdin.isatty():
+            exit_error(
+                f"the command {command_name} reads from stdin and there is no stream..."
+            )
+
+        if running_in_pycharm():
+            exit_error("you can't read from stdin in pycharm...")
+
+        result = cached_stdin()
+
+        # result is an iterable as well as an iter, but may have been read already making the iter empty?
+        # hence the need to call iter?
+        lines = list(iter(result))
+
+        if len(lines) == 0:
+            exit_error(
+                f"the command {command_name} reads from stdin and the stream is empty..."
+            )
+
+        entry = Entry.from_string("".join(lines))
+
+    except Exception as e:
+        exit_error(f"failed to read nef entry from stdin because {e}", e)
+
+    return entry
diff --git a/src/nef_pipelines/tools/frames/insert.py b/src/nef_pipelines/tools/frames/insert.py
new file mode 100644
index 0000000..868b042
--- /dev/null
+++ b/src/nef_pipelines/tools/frames/insert.py
@@ -0,0 +1,136 @@
+import inspect
+import sys
+from enum import auto
+from fnmatch import fnmatch
+from pathlib import Path
+from typing import List
+
+import typer
+from pynmrstar import Entry
+from strenum import LowercaseStrEnum
+
+from nef_pipelines.lib.util import (
+    cached_stdin,
+    exit_error,
+    parse_comma_separated_options,
+    running_in_pycharm,
+)
+from nef_pipelines.tools.frames import frames_app
+
+UNDERSCORE = "_"
+
+parser = None
+
+
+class PRIORITY(LowercaseStrEnum):
+    STDIN = auto()
+    FILE = auto()
+
+
+SELECT_HELP = """a list of frames to merge from the inserted file, selected by full name or category; frame names can
+    be joined into lists with , or the option can be called multiple times [default is all frames]. if --category is
+    set categories are used rather than names, and unless --exact is set wild cards can be used"""
+
+
+# noinspection PyUnusedLocal
+@frames_app.command()
+def insert(
+    exact: bool = typer.Option(
+        False, "-e", "--exact", help="don't treat name as a wild card"
+    ),
+    use_categories: bool = typer.Option(
+        False,
+        "-c",
+        "--category",
+        help="select frames by category rather than name",
+    ),
+    select: List[str] = typer.Option(None, "-s", "--select", help=SELECT_HELP),
+    priority: PRIORITY = typer.Option(
+        PRIORITY.FILE,
+        "-p",
+        "--priority",
+        help=f"source of frames to prioritise, one of {PRIORITY.STDIN} and {PRIORITY.FILE}",
+    ),
+    nef_file_paths: List[Path] = typer.Argument(
+        ..., help="nef files from which to insert frames into the current stream"
+    ),
+):
+    """- insert frames from another nef file into the current stream"""
+
+    select = parse_comma_separated_options(select)
+    select_all = len(select) == 0
+
+    stream_entry = _create_entry_from_stdin_or_exit(current_function())
+
+    for nef_file_path in nef_file_paths:
+        external_entry = Entry.from_file(nef_file_path.open())
+        for external_frame in external_entry:
+
+            ok_external_frame = False
+
+            if select_all:
+                ok_external_frame = True
+            if use_categories and exact and external_frame.category in select:
+                ok_external_frame = True
+            if not use_categories and exact and external_frame.name in select:
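+                # the frame's full name matched a selector exactly; the two
+                # branches below handle the wild card versions of the name and
+                # category tests when --exact is not set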
+                ok_external_frame = True
+            if use_categories and not exact:
+                for category in select:
+                    if fnmatch(external_frame.category, f"*{category}*"):
+                        ok_external_frame = True
+            if not use_categories and not exact:
+                for name in select:
+                    # match on the frame name here, not the category
+                    if fnmatch(external_frame.name, f"*{name}*"):
+                        ok_external_frame = True
+
+            if ok_external_frame:
+                if external_frame in stream_entry:
+                    if priority == PRIORITY.STDIN:
+                        continue
+                    elif priority == PRIORITY.FILE:
+                        del stream_entry[external_frame.name]
+                        stream_entry.add_saveframe(external_frame)
+                else:
+                    stream_entry.add_saveframe(external_frame)
+
+    print(stream_entry)
+
+
+def current_function():
+
+    return inspect.stack()[1][3]
+
+
+def calling_function():
+
+    return inspect.stack()[2][3]
+
+
+def _create_entry_from_stdin_or_exit(command_name: str):
+
+    try:
+
+        if sys.stdin.isatty():
+            exit_error(
+                f"the command {command_name} reads from stdin and there is no stream..."
+            )
+
+        if running_in_pycharm():
+            exit_error("you can't read from stdin in pycharm...")
+
+        result = cached_stdin()
+
+        # result is an iterable as well as an iter, but may have been read already making the iter empty?
+        # hence the need to call iter?
+        lines = list(iter(result))
+
+        if len(lines) == 0:
+            exit_error(
+                f"the command {command_name} reads from stdin and the stream is empty..."
+            )
+
+        entry = Entry.from_string("".join(lines))
+
+    except Exception as e:
+        exit_error(f"failed to read nef entry from stdin because {e}", e)
+
+    return entry
diff --git a/src/nef_pipelines/tools/frames/list.py b/src/nef_pipelines/tools/frames/list.py
new file mode 100644
index 0000000..e79c2dc
--- /dev/null
+++ b/src/nef_pipelines/tools/frames/list.py
@@ -0,0 +1,243 @@
+import os
+from math import floor
+from pathlib import Path
+from textwrap import dedent
+from typing import List, Optional
+
+import typer
+from pynmrstar import Entry
+from tabulate import tabulate
+
+from nef_pipelines.lib.nef_lib import SelectionType, select_frames
+from nef_pipelines.lib.sequence_lib import count_residues, frame_to_chains
+from nef_pipelines.lib.typer_utils import get_args
+from nef_pipelines.lib.util import chunks, exit_error, get_pipe_file
+from nef_pipelines.tools.frames import frames_app
+
+UNDERSCORE = "_"
+
+parser = None
+
+
+def _if_is_nef_file_load_as_entry(file_path):
+    entry = None
+    try:
+        entry = Entry.from_file(file_path)
+    except Exception:
+        pass
+
+    return entry
+
+
+# noinspection PyUnusedLocal
+@frames_app.command()
+def list(
+    pipe: Path = typer.Option(
+        None,
+        "-i",
+        "--in",
+        metavar="NEF-FILE",
+        help="read NEF data from a file instead of stdin",
+    ),
+    selector_type: SelectionType = typer.Option(
+        SelectionType.ANY,
+        "-t",
+        "--selector-type",
+        help=f"force how to select frames, can be one of {', '.join(SelectionType.__members__)}",
+    ),
+    number: bool = typer.Option(False, "-n", "--number", help="number entries"),
+    verbose: int = typer.Option(
+        False,
+        "-v",
+        "--verbose",
+        count=True,
+        help="print verbose information, more verbose options give more information",
+    ),
+    filters: Optional[List[str]] = typer.Argument(
+        None, help="filter strings for entry names and categories to list"
+    ),
+):
+    """- list the frames in the current input"""
+
+    entry = None
+    if len(filters) > 0:
+        entry = _if_is_nef_file_load_as_entry(filters[0])
+        if entry is not None:
+            if pipe:
+                msg = f"""\
+                    two nef file paths supplied...
+                    path 1: {pipe} [from --in]
+                    path 2: {filters[0]} [from args]"""
+                msg = dedent(msg)
+                exit_error(msg)
+            else:
+                pipe = filters[0]
+                filters = filters[1:]
+                if not filters:
+                    filters = [
+                        "*",
+                    ]
+
+    args = get_args()
+
+    if entry is None:
+        pipe_file = get_pipe_file(args)
+        raw_lines = pipe_file.readlines()
+        lines = "".join(raw_lines)
+
+        try:
+            entry = Entry.from_string(lines)
+        except Exception as e:
+            exit_error(f"failed to read nef file {args.pipe} because", e)
+
+    if entry is None:
+        if args.pipe is not None:
+            exit_error(f"couldn't read a nef stream from the file: {args.pipe}")
+        elif len(filters) > 0:
+            exit_error(f"couldn't read a nef stream from either {filters[0]} or stdin")
+        else:
+            exit_error("couldn't read a nef stream from stdin")
+
+    print(f"entry {entry.entry_id}")
+    if verbose:
+
+        import hashlib
+
+        md5 = hashlib.md5(lines.encode("ascii")).hexdigest()
+        print(f"    lines: {len(raw_lines)} frames: {len(entry)} checksum: {md5} [md5]")
+        print()
+
+    frames = select_frames(entry, selector_type, filters)
+
+    if verbose == 0:
+        frame_names = [frame.name for frame in frames]
+        if number:
+            frame_names = [
+                f"{i}. {frame_name}"
+                for i, frame_name in enumerate(frame_names, start=1)
+            ]
+
+        frame_list = _string_list_to_tabulation(frame_names)
+        print(tabulate(frame_list, tablefmt="plain"))
+
+    else:
+
+        # iterate over the frames themselves; the previous version enumerated
+        # here and then treated the (index, frame) tuple as a frame
+        for frame in frames:
+
+            print(f"    category: {frame.category}")
+            if len(frame.name) != len(frame.category):
+                print(
+                    f"    name: {frame.name[len(frame.category):].lstrip(UNDERSCORE)}"
+                )
+
+            loop_lengths = []
+            for loop in frame.loops:
+                loop_lengths.append(str(len(loop)))
+
+            if len(loop_lengths) == 1:
+                loops = f" [length: {loop_lengths[0]}]"
+            else:
+                loops = f' [lengths: {", ".join(loop_lengths)}]'
+
+            print(f"    loops: {len(frame.loop_dict)}{loops}")
+
+            frame_standard = frame.name[: len("nef")]
+            is_standard_frame = frame_standard == "nef"
+            print(f"    is nef frame: {is_standard_frame}")
+
+            # ccpn_compound_name
+            if verbose == 2 and frame.category == "nef_molecular_system":
+                chains = frame_to_chains(frame)
+
+                print(f'    chains: {len(chains)} [{", ".join(chains)}]')
+
+                residue_counts = {}
+                for chain in chains:
+                    residue_counts[chain] = count_residues(frame, chain)
+
+                residue_count_per_chain = {}
+                for chain in chains:
+                    residue_count_per_chain[chain] = sum(
+                        residue_counts[chain].values()
+                    )
+
+                output = [
+                    f"{chain} {num_residues}"
+                    for chain, num_residues in residue_count_per_chain.items()
+                ]
+                print(f'    residues: {", ".join(output)}')
+
+                for chain in chains:
+                    counts_and_percentages = []
+
+                    for residue, count in residue_counts[chain].items():
+                        percentage = (
+                            f"{count / residue_count_per_chain[chain] * 100:5.2f}"
+                        )
+                        counts_and_percentages.append(
+                            f"{residue}: {count} [{percentage}%]"
+                        )
+
+                    pre_string = f"      {chain}. 
" + pre_string_width = len(pre_string) + + tabulation = _string_list_to_tabulation( + counts_and_percentages, pre_string_width + ) + table = tabulate(tabulation, tablefmt="plain") + + print(_indent_with_prestring(table, pre_string)) + + print() + + +def _indent_with_prestring(text_block, pre_string): + raw_result = [] + empty_prestring = " " * len(pre_string) + for i, string in enumerate(text_block.split("\n")): + if i == 0: + raw_result.append(f"{pre_string}{string}") + else: + raw_result.append(f"{empty_prestring}{string}") + + return "\n".join(raw_result) + + +def _string_list_to_tabulation(frame_names, used_columns=0): + try: + width, _ = os.get_terminal_size() + except Exception: + width = 100 + + width -= used_columns + + # apply a sensible minimum width + if width < 20: + width = 20 + + if len(frame_names) > 0: + frame_name_widths = [len(frame_name) for frame_name in frame_names] + max_frame_name_width = max(frame_name_widths) + + columns = int(floor(width / (max_frame_name_width + 1))) + column_width = int(floor(width / columns)) + + columns = 1 if columns == 0 else columns + + frame_names = [frame_name.rjust(column_width) for frame_name in frame_names] + frame_name_list = chunks(frame_names, columns) + else: + frame_name_list = [[]] + + return frame_name_list diff --git a/src/nef_pipelines/tools/frames/tabulate.py b/src/nef_pipelines/tools/frames/tabulate.py new file mode 100644 index 0000000..0fae47a --- /dev/null +++ b/src/nef_pipelines/tools/frames/tabulate.py @@ -0,0 +1,277 @@ +import fnmatch +import sys +from argparse import Namespace +from collections import Counter +from pathlib import Path +from typing import Dict, List + +import typer +from pynmrstar import Entry, Saveframe +from tabulate import tabulate as tabulate_formatter +from tabulate import tabulate_formats + +from nef_pipelines.lib.nef_lib import UNUSED +from nef_pipelines.lib.typer_utils import get_args +from nef_pipelines.lib.util import exit_error, get_pipe_file, read_integer_or_exit +from nef_pipelines.tools.frames import frames_app + +ALL_LOOPS = "ALL" + +UNDERSCORE = "_" + +parser = None + +EXIT_ERROR = 1 + +FORMAT_HELP = f'format for the table, [possible formats are: {", ".join(tabulate_formats)}: those provided by tabulate]' +FRAMES_HELP = ( + "selectors for loops to tabulate. 
note: wild cards are allowed and specific loops can be chosen"
    " (see above)"
)

ABBREVIATED_HEADINGS = {
    "index": "ind",
    "chain_code": "chain",
    "sequence_code": "seq",
    "residue_name": "resn",
    "linking": "link",
    "cis_peptide": "cis",
    "ccpn_compound_name": "ccpn_name",
    "ccpn_chain_role": "ccpn_role",
    "ccpn_comment": "comment",
    "ccpn_dataset_serial": "data_serial",
    "program_name": "name",
    "program_version": "version",
    "script_name": "script",
    "peak_id": "id",
    "volume": "vol",
    "volume_uncertainty": "vol-err",
    "height_uncertainty": "height-err",
    "position": "pos",
    "position_uncertainty": "pos-err",
    "ccpn_figure_of_merit": "merit",
    "ccpn_annotation": "ann",
    "ccpn_peak_list_serial": "ccpn-serial",
}


# noinspection PyUnusedLocal
@frames_app.command(no_args_is_help=True)
def tabulate(
    pipe: Path = typer.Option(
        None,
        metavar="|PIPE|",
        help="pipe to read NEF data from, for testing [overrides stdin !use stdin instead!]",
    ),
    format: str = typer.Option(
        "plain", "-f", "--format", help=FORMAT_HELP, metavar="format"
    ),
    verbose: bool = typer.Option(
        False, "-v", "--verbose", help="include frame names etc in output"
    ),
    full: bool = typer.Option(
        False, "--full", help="don't suppress empty columns [default: False]"
    ),
    no_abbreviations: bool = typer.Option(
        False, "--no-abbreviations", help="don't abbreviate column headings"
    ),
    frame_selectors: List[str] = typer.Argument(
        None, help=FRAMES_HELP, metavar="..."
    ),
):
    """- tabulate loops from frames in a NEF file. notes: 1. using the name of a frame will tabulate all loops,
    using frame.loop_index [e.g. molecular_system.1] will tabulate a specific loop. 2. wild cards can be used for
    frame names, e.g. mol would select molecular_system and any other frame whose name contains mol 3. by default
    empty columns are ignored"""

    args = get_args()

    pipe_file = get_pipe_file(Namespace(pipe=pipe))

    if pipe_file:

        raw_lines = pipe_file.readlines()
        lines = "".join(raw_lines)

        entry = Entry.from_string(lines)

        tabulate_frames(entry, args)


def tabulate_frames(entry, args):
    frames_to_tabulate = _get_frames_to_tabulate(entry, args.frame_selectors)
    frame_indices = _select_loop_indices(args.frame_selectors)

    for frame_name, frame_data in frames_to_tabulate.items():
        category = frame_data.category

        category_length = len(category)

        frame_id = frame_data.name[category_length:].lstrip("_")
        frame_id = frame_id.strip()
        frame_id = frame_id if len(frame_id) > 0 else ""

        for i, loop in enumerate(frame_data.loops, start=1):
            if _should_output_loop(i, frame_name, frame_indices):
                _output_loop(loop, frame_id, category, args)


def _select_loop_indices(frames):
    frame_indices = {}
    for frame_selector in frames:
        frame_selector_index = frame_selector.split(".")
        if len(frame_selector_index) == 1:
            frame_name_selector = frame_selector_index[0]
            frame_indices.setdefault(frame_name_selector, set()).add(ALL_LOOPS)
        elif len(frame_selector_index) == 2:
            frame_name_selector = frame_selector_index[0]
            index_string = frame_selector_index[1]
            message = f"[frames tabulate] expected an index got '{index_string}' from {'.'.join(frame_selector_index)}"
            index = read_integer_or_exit(index_string, message=message)
            frame_indices.setdefault(frame_name_selector, set()).add(index)
        else:
            exit_error(
                f"[frames tabulate] too many fields in the frame selector {frame_selector}"
            )
    for frame_selector, indices in frame_indices.items():
        if ALL_LOOPS in indices and len(indices) > 1:
            indices.remove(ALL_LOOPS)
            index_strings = ", ".join([str(index) for index in indices])
            message = (
                "[frames tabulate] incompatible selections, you selected all loops and a specific index for "
                + f"frame: {frame_selector} indices were: {index_strings}"
            )
            exit_error(message)

    return frame_indices


def _remove_empty_columns(tabulation, headers):

    counter = Counter()
    for row in tabulation:
        for i, header in enumerate(headers):
            if row[i] != UNUSED:
                counter[i] += 1

    empty_columns = []
    for column_index, heading in enumerate(headers):
        if not counter[column_index]:
            empty_columns.append(column_index)

    empty_columns = list(reversed(empty_columns))

    for row in tabulation:
        for column_index in empty_columns:
            del row[column_index]

    for column_index in empty_columns:
        del headers[column_index]

    return tabulation, headers


def _output_loop(loop, frame_id, category, args):

    if args.verbose:
        print()
        if frame_id:
            print(f"{frame_id}: {category}/{loop.category[1:]}")
        else:
            print(f"{category}/{loop.category[1:]}")
        print()
    table = []
    headers = loop.tags
    for line in loop.data:
        row = list(line)
        table.append(row)

    if not args.full:
        table, headers = _remove_empty_columns(table, headers)

    if not args.no_abbreviations:
        headers = _abbreviate_headers(headers, ABBREVIATED_HEADINGS)

    print(tabulate_formatter(table, headers=headers, tablefmt=args.format))


def _abbreviate_headers(headers: List[str], abbreviations: Dict[str, str]) -> List[str]:

    result = []

    for header in headers:
        for stem in abbreviations:

            if header.startswith(stem):

                header = f"{abbreviations[stem]}{header[len(stem):]} "

        result.append(header.replace("_", "-"))

    return result


def 
_should_output_loop(index, frame_name, frame_indices): + do_output = True + for frame_selector in frame_indices: + if fnmatch.fnmatch(frame_name, f"*{frame_selector}*"): + indices = frame_indices[frame_selector] + if index not in indices and ALL_LOOPS not in indices: + do_output = False + return do_output + + +def _count_loop_rows(save_frame: Saveframe) -> int: + count = 0 + for loop in save_frame.loops: + count += len(loop) + + return count + + +def _get_frames_to_tabulate(entry, frame_selectors): + + frames_with_empty_loops = _get_loops_with_empty_frames_and_errors(entry) + + frames_to_tabulate = _select_chosen_frames(entry, frame_selectors) + + _remove_empty_frames_and_warn(frames_to_tabulate, frames_with_empty_loops) + + return frames_to_tabulate + + +def _remove_empty_frames_and_warn(frames_to_tabulate, frames_with_empty_loops): + for frame_name in frames_to_tabulate: + if frame_name in frames_with_empty_loops: + print(frames_with_empty_loops[frame_name], file=sys.stderr) + for frame_name in frames_with_empty_loops: + if frame_name in frames_to_tabulate: + del frames_to_tabulate[frame_name] + + +def _select_chosen_frames(entry, frame_selectors): + frames_to_tabulate = {} + for frame_name, data in entry.frame_dict.items(): + if len(frame_selectors) > 0: + for frame_selector in frame_selectors: + frame_selector = frame_selector.split(".")[0] + + if fnmatch.fnmatch(frame_name, f"*{frame_selector}*"): + frames_to_tabulate[frame_name] = data + else: + + frames_to_tabulate[frame_name] = data + return frames_to_tabulate + + +def _get_loops_with_empty_frames_and_errors(entry): + frames_with_empty_loops = {} + for frame_name, data in entry.frame_dict.items(): + if len(data.loops) == 0: + msg = f"WARNING: frame {frame_name} has no loops and the frame will be ignored" + frames_with_empty_loops[frame_name] = msg + + if frame_name not in frames_with_empty_loops and _count_loop_rows(data) == 0: + msg = f"WARNING: frame {frame_name} has loops but they contain no data and the frame will be ignored" + frames_with_empty_loops[frame_name] = msg + return frames_with_empty_loops diff --git a/src/nef_pipelines/tools/header.py b/src/nef_pipelines/tools/header.py new file mode 100644 index 0000000..6d86caa --- /dev/null +++ b/src/nef_pipelines/tools/header.py @@ -0,0 +1,36 @@ +import typer +from pynmrstar import Entry + +from nef_pipelines import nef_app +from nef_pipelines.lib.constants import NEF_PIPELINES, NEF_PIPELINES_VERSION +from nef_pipelines.lib.header_lib import create_header_frame +from nef_pipelines.lib.typer_utils import get_args + + +# noinspection PyUnusedLocal +@nef_app.app.command() +def header( + name: str = typer.Argument("nef", help="name for the entry", metavar="") +): + """- add a header to the stream""" + args = get_args() + + entry = build_meta_data(args) + + print(entry) + + +def build_meta_data(args): + from lib.util import script_name + + result = Entry.from_scratch(args.name) + header_frame = create_header_frame( + NEF_PIPELINES, NEF_PIPELINES_VERSION, script_name(__file__) + ) + result.add_saveframe(header_frame) + + return result + + +if __name__ == "__main__": + typer.run(header) diff --git a/src/nef_pipelines/tools/offset_shifts.py b/src/nef_pipelines/tools/offset_shifts.py new file mode 100644 index 0000000..4e46422 --- /dev/null +++ b/src/nef_pipelines/tools/offset_shifts.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# Importing libraries +import argparse +import fcntl +import os +import sys +from fnmatch import fnmatch + +from pynmrstar import Entry + +CHAIN_CODE = 
"chain_code" + +SEQUENCE_CODE = "sequence_code" + +parser = None + +EXIT_ERROR = 1 + + +def exit_error(msg): + if parser: + parser.print_help() + print() + print(f"ERROR: {msg}") + print("exiting...") + sys.exit(EXIT_ERROR) + + +target_frames = { + "nef_chemical_shift_list": ["_nef_chemical_shift"], + "nef_nmr_spectrum": ["_nef_peak"], +} + + +def offset_chemical_shifts(entry, args): + + if not args.target: + exit_error( + "WARNING: i need a target nucleus or atom name provided by -t / --target, such as *CB* or 13C" + ) + + target_loops = [] + for frame_data in entry.frame_dict.values(): + if (frame_category := frame_data.category) in target_frames: + for loop in frame_data.loops: + if (loop_category := loop.category) in target_frames[frame_category]: + target_loops.append(((frame_category, loop_category), loop)) + + for loop_type, target_loop in target_loops: + if loop_type == ("nef_chemical_shift_list", "_nef_chemical_shift"): + chain_index = target_loop.tag_index("chain_code") + value_index = target_loop.tag_index("value") + element_index = target_loop.tag_index("element") + isotope_index = target_loop.tag_index("isotope_number") + atom_name_index = target_loop.tag_index("atom_name") + + for line in target_loop: + isotope = line[isotope_index] + line[element_index] + chain = line[chain_index] + value = line[value_index] + atom_name = line[atom_name_index] + + if chain == args.chain_code: + if args.target == isotope or fnmatch(atom_name, args.target): + value = float(value) + args.offset + line[value_index] = value + + if loop_type == ("nef_nmr_spectrum", "_nef_peak"): + exit_error("Error: not implemented for peak lists yet") + + # for loop_data in frame_data.loop_dict.values(): + # print(loop_data.category) + # if SEQUENCE_CODE in loop_data.tags: + # sequence_index = loop_data.tag_index(SEQUENCE_CODE) + # chain_index = loop_data.tag_index(CHAIN_CODE) + # for line in loop_data: + # if line[chain_index] == chain: + # sequence = line[sequence_index] + # if '@' not in sequence: + # try: + # sequence = int(sequence) + # except ValueError: + # pass + # if isinstance(sequence, int): + # sequence = sequence + offset + # line[sequence_index] = str(sequence) + + +def read_args(): + global parser + parser = argparse.ArgumentParser(description="Offset chemical shifts") + parser.add_argument( + "-c", + "--chain", + metavar="CHAIN", + type=str, + default="A", + dest="chain", + help="chain in which to offset shifts", + ) + parser.add_argument( + "-o", + "--offset", + metavar="OFFSET", + type=float, + default=0.0, + dest="offset", + help="offset to add to shifts", + ) + parser.add_argument( + "-t", + "--target", + metavar="TARGET", + type=str, + default=None, + dest="target", + help="nucleus or atom to offset", + ) + parser.add_argument( + "-s", + "--spectra", + metavar="OFFSET", + type=str, + default="*", + dest="spectra", + action="append", + nargs="+", + help="nucleus or atom to offset", + ) + parser.add_argument(metavar="FILES", nargs=argparse.REMAINDER, dest="files") + + return parser.parse_args() + + +def check_stream(): + # make stdin a non-blocking file + fd = sys.stdin.fileno() + fl = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK) + + try: + result = sys.stdin.read() + except Exception: + result = "" + + return result + + +if __name__ == "__main__": + + args = read_args() + + print(args) + + is_tty = sys.stdin.isatty() + + if is_tty and len(args.files) == 0: + parser.print_help() + print() + msg = """I require at least 1 argument or input stream with a 
chemical_shift_list frame"""

        exit_error(msg)

    entries = []
    try:
        if len(args.files) == 0:
            lines = check_stream()
            if len(lines) != 0:
                entries.append(Entry.from_string(lines))
            else:
                exit_error("Error: input appears to be empty")
        else:
            for file in args.files:
                entries.append(Entry.from_file(file))
    except OSError as e:
        msg = f"couldn't open target nef file because {e}"
        exit_error(msg)

    for entry in entries:
        offset_chemical_shifts(entry, args)

        print(entry)
diff --git a/src/nef_pipelines/tools/renumber_residues.py b/src/nef_pipelines/tools/renumber_residues.py
new file mode 100644
index 0000000..bc108b6
--- /dev/null
+++ b/src/nef_pipelines/tools/renumber_residues.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# Importing libraries
+import argparse
+import fcntl
+import os
+import sys
+
+from pynmrstar import Entry
+
+CHAIN_CODE = "chain_code"
+
+SEQUENCE_CODE = "sequence_code"
+
+parser = None
+
+EXIT_ERROR = 1
+
+
+def exit_error(msg):
+    if parser:
+        parser.print_help()
+    print()
+    print(f"ERROR: {msg}")
+    print("exiting...")
+    sys.exit(EXIT_ERROR)
+
+
+def offset_residue_numbers(entry, chain, offset):
+    for frame_data in entry.frame_dict.values():
+        for loop_data in frame_data.loop_dict.values():
+            if SEQUENCE_CODE in loop_data.tags:
+                sequence_index = loop_data.tag_index(SEQUENCE_CODE)
+                chain_index = loop_data.tag_index(CHAIN_CODE)
+                for line in loop_data:
+                    if line[chain_index] == chain:
+                        sequence = line[sequence_index]
+                        if "@" not in sequence:
+                            try:
+                                sequence = int(sequence)
+                            except ValueError:
+                                pass
+                            if isinstance(sequence, int):
+                                sequence = sequence + offset
+                                line[sequence_index] = str(sequence)
+
+
+def read_args():
+    global parser
+    parser = argparse.ArgumentParser(
+        description="Renumber the residues in a chain of a NEF file"
+    )
+    parser.add_argument(
+        "-c",
+        "--chain",
+        metavar="CHAIN",
+        type=str,
+        default="A",
+        dest="chain",
+        help="chain in which to offset residue numbers",
+    )
+    parser.add_argument(
+        "-o",
+        "--offset",
+        metavar="OFFSET",
+        type=int,
+        default=0,
+        dest="offset",
+        help="offset to add to residue numbers",
+    )
+    parser.add_argument(metavar="FILES", nargs=argparse.REMAINDER, dest="files")
+
+    return parser.parse_args()
+
+
+def check_stream():
+    # make stdin a non-blocking file
+    fd = sys.stdin.fileno()
+    fl = fcntl.fcntl(fd, fcntl.F_GETFL)
+    fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
+
+    try:
+        result = sys.stdin.read()
+    except Exception:
+        result = ""
+
+    return result
+
+
+if __name__ == "__main__":
+
+    args = read_args()
+
+    is_tty = sys.stdin.isatty()
+
+    if is_tty and len(args.files) == 0:
+        parser.print_help()
+        print()
+        msg = """I require at least 1 argument or input stream with a chemical_shift_list frame"""
+
+        exit_error(msg)
+
+    entries = []
+    try:
+        if len(args.files) == 0:
+            lines = check_stream()
+            if len(lines) != 0:
+                entries.append(Entry.from_string(lines))
+            else:
+                exit_error("Error: input appears to be empty")
+        else:
+            for file in args.files:
+                entries.append(Entry.from_file(file))
+    except OSError as e:
+        msg = f"couldn't open target nef file because {e}"
+        exit_error(msg)

    for entry in entries:
        offset_residue_numbers(entry, args.chain, args.offset)

        # write the renumbered entry to stdout so the tool works as a filter
        print(entry)
diff --git a/src/nef_pipelines/tools/res_assign.py_unused b/src/nef_pipelines/tools/res_assign.py_unused
new file mode 100644
index 0000000..5a06aac
--- /dev/null
+++ b/src/nef_pipelines/tools/res_assign.py_unused
@@ -0,0 +1,318 @@
+#!/usr/bin/env python3
+import argparse
+import operator
+import sys
+from 
dataclasses import dataclass + +from numpy import average +from pynmrstar import Entry + +parser = None + +EXIT_ERROR = 1 + + +def exit_error(msg): + + print(f"ERROR: {msg}") + print("exiting...") + sys.exit(EXIT_ERROR) + + +@dataclass(frozen=True) +class Assignment: + assignment: int + fixed: bool = False + merit: float = 0.0 + + +def read_args(): + global parser + parser = argparse.ArgumentParser( + description="Assign a NEF file based on output from an assignment in NEF format" + ) + parser.add_argument( + "-t", + "--target", + metavar="TARGET_NEF", + type=str, + default=None, + dest="target", + help="target nef file to assign", + ) + parser.add_argument(metavar="ASSIGNMENT", nargs=1, dest="assignment") + + return parser.parse_args() + + +if __name__ == "__main__": + + args = read_args() + + try: + nef_target_data = Entry.from_file(args.target) + except OSError as e: + msg = f"couldn't open target nef file because {e}" + exit_error(msg) + + try: + res_assign = Entry.from_file(args.assignment[0]) + except OSError as e: + msg = f"couldn't open residue assignments file because {e}" + exit_error(msg) + + shift_list_frames = [] + for frame_name in nef_target_data.frame_dict: + if "nef_chemical_shift_list" in frame_name: + shift_list_frames.append(frame_name) + + if "ccpn_residue_assignments" not in res_assign.frame_dict: + exit_error("res_assign doesn't contain a ccpn_residue_assignments frame") + + residue_assignments = {} + residue_types = {} + + # TODO need constants for names + # TODO need a check of required columns + for loop in res_assign["ccpn_residue_assignments"].loop_dict.values(): + + loop_tags = {tag: i for i, tag in enumerate(loop.tags)} + + for line_number, line in enumerate(loop): + + fixed = line[loop_tags["fixed"]].lower() in ("true", ".") + + # need functions to check + chain = line[loop_tags["chain_code"]] + residue_number = int(line[loop_tags["residue_number"]]) + residue_type = line[loop_tags["residue_type"]] + + residue_key = chain, residue_number + residue_types[residue_key] = residue_type + + if not fixed: + assignment = int(line[loop_tags["assignment"]]) + assigned_residue = int(line[loop_tags["assignment"]]) + merit = float(line[loop_tags["merit"]]) + + residue_assignments.setdefault(residue_key, []).append( + Assignment(assignment, False, merit) + ) + + if residue_key in residue_types: + if (bad_residue_type := residue_types[residue_key]) != residue_type: + msg = f"""more than one residue type for assignment + line {line_number} + file {sys.argv[2]} + + residue types {bad_residue_type} {residue_type} + """ + + min_merit = 0.6 + filtered_residue_assignments = {} + for residue_key, assignments in residue_assignments.items(): + assignments.sort(key=operator.attrgetter("merit")) + + assignment = assignments[0] + if assignment.merit > min_merit: + filtered_residue_assignments[residue_key] = assignment + + +assignment_to_residue = {} +for residue, assignment in filtered_residue_assignments.items(): + assignment_to_residue[assignment] = residue + + +nmr_residue_to_residue = {} +for assignment, residue in assignment_to_residue.items(): + + nmr_residue = "@%i" % assignment.assignment + nmr_residue_minus_1 = "@%i-1" % assignment.assignment + + nmr_residue_to_residue[nmr_residue] = (*residue, residue_types[residue]) + + residue_minus_1 = (residue[0], residue[1] - 1) + nmr_residue_to_residue[nmr_residue_minus_1] = ( + *residue_minus_1, + residue_types[residue_minus_1], + ) + +# for key, value in nmr_residue_to_residue.items(): +# print(key,value) + + +class TagIndexKey: + def 
__init__(self, tag_indices): + self._tags_indices = tag_indices + + def __call__(self, data): + result = [] + + for tag_index in self._tags_indices: + value = data[tag_index] + + try: + value = int(value) + except ValueError: + pass + + if isinstance(value, int): + result.append((0, value)) + elif isinstance(value, str): + result.append((1, value)) + + return tuple(result) + + +target_frames = ["nef_chemical_shift_list", "nef_nmr_spectrum"] +for frame_name, frame_data in nef_target_data.frame_dict.items(): + + target_type = None + for target_frame in target_frames: + if frame_name.startswith(target_frame): + target_type = target_frame + break + # print(frame_name, target_type) + + if target_type == "nef_chemical_shift_list": + + for loop_name, loop_data in frame_data.loop_dict.items(): + + sequence_code_index = loop_data.tag_index("sequence_code") + chain_code_index = loop_data.tag_index("chain_code") + residue_name_index = loop_data.tag_index("residue_name") + atom_name_index = loop_data.tag_index("atom_name") + + value_index = loop_data.tag_index("value") + value_uncertainty_index = loop_data.tag_index("value_uncertainty") + + tagKey = TagIndexKey( + [chain_code_index, sequence_code_index, atom_name_index] + ) + for line_number, line in enumerate(loop_data): + nmr_residue = line[sequence_code_index] + + if nmr_residue in nmr_residue_to_residue: + + chain_code, residue_number, residue_type = nmr_residue_to_residue[ + nmr_residue + ] + + line[chain_code_index] = chain + line[sequence_code_index] = residue_number + line[residue_name_index] = residue_type + + line_by_assignment = {} + for line_number, line in enumerate(loop_data): + key = ( + line[chain_code_index], + line[sequence_code_index], + line[atom_name_index], + ) + line_by_assignment.setdefault(key, []).append(line) + + for key, lines in line_by_assignment.items(): + if len(lines) > 1: + value = average([float(line[value_index]) for line in lines]) + uncertainty = average( + [float(line[value_uncertainty_index]) for line in lines] + ) + + lines[0][value_index] = value + lines[0][value_uncertainty_index] = uncertainty + for line in lines[1:]: + del loop_data.data[loop_data.data.index(line)] + + # elif '@' in nmr_residue: + # print(f'WARNING: shift list, nmr residue {nmr_residue} not found in file skipping') + + loop_data.sort_rows( + tags=["chain_code", "sequence_code", "atom_name"], key=tagKey + ) # print(line) + + # print (nmr_residue_to_residue) + if target_type == "nef_nmr_spectrum": + + loop_types = {} + + for loop_name, loop_data in frame_data.loop_dict.items(): + + if loop_name == "_nef_peak": + + loop_tags = {tag: i for i, tag in enumerate(loop_data.tags)} + + # print(loop_tags) + + active_indices = [] + for i in range(1, 15): + test_name = "atom_name_%i" % i + if test_name in loop_tags: + active_indices.append(i) + + for data in loop_data: + for active_index in active_indices: + # print(data) + chain_index = loop_tags["chain_code_%i" % active_index] + atom_index = loop_tags["atom_name_%i" % active_index] + residue_number_index = loop_tags[ + "sequence_code_%i" % active_index + ] + residue_type_index = loop_tags["residue_name_%i" % active_index] + + residue_number = data[residue_number_index] + if residue_number in nmr_residue_to_residue: + ( + chain_code, + residue_number, + residue_type, + ) = nmr_residue_to_residue[residue_number] + + data[chain_index] = chain_code + data[residue_number_index] = residue_number + data[residue_type_index] = residue_type + + elif "@" in nmr_residue: + print( + f"WARNING: spectrum, nmr 
residue {nmr_residue} not found in file skipping" + ) + + # print(data) + # print(nef_target_data) + # for active_index in active_indices: + + # tag_indices = set() + # for tag in loop_tags: + # if tag.startswith('position'): + # tag_index = tag.split('_')[-1] + # # print(tag_index) + # tag_indices.add(tag_index) + # # print('indices',tag_indices) + # print(loop_data.category) + + # print(loop.tag_index) + + # sequence_code_index = loop_tags['sequence_code'] + # chain_code_index = loop_tags['chain_code'] + # residue_name_index = loop_tags['residue_name'] + # for line_number, line in enumerate(loop_data): + # if (nmr_residue := line[sequence_code_index]) in nmr_residue_to_residue: + # chain_code, residue_number, residue_type = nmr_residue_to_residue[nmr_residue] + + # line[chain_code_index] = chain + # line[sequence_code_index] = residue_number + # line[residue_name_index] = residue_type + + # line[sequence] + +# should sort rows but need a schema +# for loop in nef_target_data['nef_chemical_shift_list_default']: +# loop.sort_rows(tags =['chain_code', 'sequence_code']) +print(nef_target_data) + +# for frame_name, frame_data in nef_target_data.frame_dict.items(): +# +# +# +# +# for loop_name, loop_data in frame_data.loop_dict.items(): +# print(id(loop_data)) diff --git a/src/nef_pipelines/tools/stream.py b/src/nef_pipelines/tools/stream.py new file mode 100644 index 0000000..157e53f --- /dev/null +++ b/src/nef_pipelines/tools/stream.py @@ -0,0 +1,18 @@ +from pathlib import Path + +import typer + +from nef_pipelines import nef_app + +if nef_app: + # noinspection PyUnusedLocal + @nef_app.app.command() + def stream( + file_name: Path = typer.Argument( + ..., help="nef file to stream [- indicates stdin]", metavar="" + ) + ): + "- stream a nef file" + with open(file_name) as file_h: + for line in file_h: + print(line, end="") diff --git a/src/nef_pipelines/tools/test.py b/src/nef_pipelines/tools/test.py new file mode 100644 index 0000000..82bb237 --- /dev/null +++ b/src/nef_pipelines/tools/test.py @@ -0,0 +1,95 @@ +import os +import sys +from pathlib import Path +from textwrap import dedent +from typing import List + +import typer + +from nef_pipelines.lib.test_lib import run_and_read_pytest, select_matching_tests +from nef_pipelines.lib.util import exit_error +from nef_pipelines.nef_app import app + +TARGET_HELP = """ + targets are selected as path::name, globs can be used, + missing paths and names are replaced by * eg path:: is equivalent to path::* + note: filenames do not require a trailing .py + """ +TARGET_HELP = dedent(TARGET_HELP) + + +# TODO add verbosity control +# TODO add flag to enable warnings + + +@app.command() +def test( + warnings: bool = typer.Option( + False, "-w", "--warnings", help="include all warnings" + ), + targets: List[str] = typer.Argument(None, help=TARGET_HELP), +): + """- run the test suite""" + + from pytest import main + + dir_path = Path(os.path.dirname(os.path.realpath(__file__))).parent + + root_path = str(dir_path.parent / "..") + + os.chdir(Path(root_path).absolute()) + + tests = _find_pytest_commands(root_path, targets) + + if not targets or (targets and len(tests) != 0): + command = ["-vvv", "--full-trace", *tests] + + if not warnings: + command = ["--disable-warnings", *command] + main(command) + + +def _find_pytest_commands(root_path, targets): + if not targets: + targets = "*" + + ret_code, stdout, stderr = run_and_read_pytest(["--collect-only", "-qq", root_path]) + + print(stdout) + + if ret_code != 0: + output = f""" + return code: {ret_code} + 
__________________________________________________________________stdout_______________________________________________________________ + {stdout} + _______________________________________________________________________________________________________________________________________ + __________________________________________________________________stderr_______________________________________________________________ + {stderr} + _______________________________________________________________________________________________________________________________________ + """ + print(output, file=sys.stderr) + _exit_if_stderr(stdout) + + tests = stdout.split("\n")[:-3] + + return [ + Path.cwd() / test_path for test_path in select_matching_tests(tests, targets) + ] + + +def _exit_if_stderr(stderr): + if stderr.strip(): + _exit_pytest_error(stderr) + + +def _exit_pytest_error(output): + msg = f""" + couldn't collect tests because of an error + + ----------------------- pytest errors ----------------------- + {output} + ----------------------- pytest errors ----------------------- + """ + msg = dedent(msg) + msg = f"{msg}" + exit_error(msg) diff --git a/src/nef_pipelines/transcoders/__init__.py b/src/nef_pipelines/transcoders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nef_pipelines/transcoders/fasta/__init__.py b/src/nef_pipelines/transcoders/fasta/__init__.py new file mode 100644 index 0000000..a0ea63d --- /dev/null +++ b/src/nef_pipelines/transcoders/fasta/__init__.py @@ -0,0 +1,19 @@ +import typer + +import nef_pipelines +from nef_pipelines import nef_app + +app = typer.Typer() +import_app = typer.Typer() +export_app = typer.Typer() + +if nef_app.app: + + nef_app.app.add_typer(app, name="fasta", help="- read and write fasta sequences") + + app.add_typer(import_app, name="import", help="- import fasta sequences") + app.add_typer(export_app, name="export", help="- export fasta sequences") + + # import of specific importers must be after app creation to avoid circular imports + import nef_pipelines.transcoders.fasta.exporters.sequence # noqa: F401 + import nef_pipelines.transcoders.fasta.importers.sequence # noqa: F401 diff --git a/src/nef_pipelines/transcoders/fasta/exporters/__init__.py b/src/nef_pipelines/transcoders/fasta/exporters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nef_pipelines/transcoders/fasta/exporters/sequence.py b/src/nef_pipelines/transcoders/fasta/exporters/sequence.py new file mode 100644 index 0000000..3952cb2 --- /dev/null +++ b/src/nef_pipelines/transcoders/fasta/exporters/sequence.py @@ -0,0 +1,109 @@ +from pathlib import Path +from typing import Dict, List + +import typer + +from nef_pipelines.lib.nef_lib import read_entry_from_file_or_stdin_or_exit_error +from nef_pipelines.lib.sequence_lib import ( + make_chunked_sequence_1let, + sequence_from_frame, + translate_3_to_1, +) +from nef_pipelines.lib.structures import SequenceResidue +from nef_pipelines.lib.util import exit_error +from nef_pipelines.transcoders.fasta import export_app + +app = typer.Typer() + + +# noinspection PyUnusedLocal +@export_app.command() +def sequence( + chain_codes: str = typer.Option( + [], + "-c", + "--chain_code", + help="chains to export, to add multiple chains use repeated calls [default: 'A']", + metavar="", + ), + in_file: Path = typer.Option( + None, "-i", "--in", help="file to read nef data from", metavar="" + ), +): + """- convert nef sequence to fasta""" + + # frame_selectors = parse_comma_separated_options(chain_codes) + + 
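+    # Illustrative example (chain and residue values are hypothetical): for a
+    # chain A whose first residue is 12, the records built below come out as
+    #
+    #     >CHAIN: A | START RESIDUE: 12
+    #     MKTAYIAKQR...
+    #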
chain_codes = ["A"] if not chain_codes else chain_codes + + entry = read_entry_from_file_or_stdin_or_exit_error(in_file) + + fasta_records = fasta_records_from_entry(entry, chain_codes) + + for record in fasta_records.values(): + print("\n".join(record)) + + +def fasta_records_from_entry(entry, chain_codes): + + molecular_system = molecular_system_from_entry_or_exit(entry) + + residues = sequence_from_frame(molecular_system) + + fasta_records = nef_to_fasta_records(residues, chain_codes) + + return fasta_records + + +def molecular_system_from_entry_or_exit(entry): + + # noinspection PyUnboundLocalVariable + molecular_systems = entry.get_saveframes_by_category("nef_molecular_system") + if not molecular_systems: + exit_error( + "Couldn't find a molecular system frame it should be labelled 'save_nef_molecular_system'" + ) + + if len(molecular_systems) > 1: + exit_error("There can only be one molecular system in a NEF file") + + return molecular_systems[0] + + +def nef_to_fasta_records( + residues: List[SequenceResidue], chain_codes: List[str] +) -> Dict[str, List[str]]: + + residues_by_chain = {} + for chain_code in chain_codes: + residues_by_chain[chain_code] = [ + residue for residue in residues if residue.chain_code == chain_code + ] + + chain_starts = {} + for chain_code, residues in residues_by_chain.items(): + chain_starts[chain_code] = residues[0].sequence_code + + residue_sequences = {} + for chain_code in chain_codes: + residue_sequences[chain_code] = [ + residue.residue_name for residue in residues_by_chain[chain_code] + ] + + residue_sequences = { + chain: translate_3_to_1(residue_sequence) + for chain, residue_sequence in residue_sequences.items() + } + residue_sequences = { + chain: make_chunked_sequence_1let(residue_sequence) + for chain, residue_sequence in residue_sequences.items() + } + + result = {} + for chain_code, residue_sequence in residue_sequences.items(): + result[chain_code] = [ + f">CHAIN: A | START RESIDUE: {chain_starts[chain_code]}", + "\n".join(residue_sequence), + ] + + return result diff --git a/src/nef_pipelines/transcoders/fasta/importers/__init__.py b/src/nef_pipelines/transcoders/fasta/importers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nef_pipelines/transcoders/fasta/importers/sequence.py b/src/nef_pipelines/transcoders/fasta/importers/sequence.py new file mode 100644 index 0000000..db4846c --- /dev/null +++ b/src/nef_pipelines/transcoders/fasta/importers/sequence.py @@ -0,0 +1,156 @@ +from itertools import chain, cycle, islice, zip_longest +from pathlib import Path +from typing import Iterable, List + +import typer +from Bio.SeqIO.FastaIO import SimpleFastaParser +from ordered_set import OrderedSet + +from nef_pipelines.lib.sequence_lib import ( + chain_code_iter, + offset_chain_residues, + sequence_3let_to_res, + sequence_to_nef_frame, + translate_1_to_3, +) +from nef_pipelines.lib.structures import SequenceResidue +from nef_pipelines.lib.typer_utils import get_args +from nef_pipelines.lib.util import ( + exit_error, + parse_comma_separated_options, + process_stream_and_add_frames, +) +from nef_pipelines.transcoders.fasta import import_app + +app = typer.Typer() + +NO_CHAIN_START_HELP = """don't include the start chain link type on a chain for the first residue [linkage will be + middle] for the named chains. Either use a comma joined list of chains [e.g. 
A,B] or call this + option multiple times to set chain starts for multiple chains""" +NO_CHAIN_END_HELP = """don't include the end chain link type on a chain for the last residue [linkage will be + middle] for the named chains. Either use a comma joined list of chains [e.g. A,B] or call this + option multiple times to set chain ends for multiple chains""" + + +# todo add comment to other sequences etc +@import_app.command() +def sequence( + chain_codes: List[str] = typer.Option( + [], + "--chains", + help="chain codes to use for the exported chains, can be a a comma sepatared list or can be called " + "multiple times", + metavar="", + ), + starts: List[str] = typer.Option( + [], + "--starts", + help="first residue number of sequences can be a comma separated list or ", + metavar="", + ), + no_chain_starts: List[str] = typer.Option( + [], "--no-chain-start", help=NO_CHAIN_START_HELP + ), + no_chain_ends: List[str] = typer.Option( + [], "--no-chain-end", help=NO_CHAIN_END_HELP + ), + entry_name: str = typer.Option("fasta", help="a name for the entry if required"), + pipe: List[Path] = typer.Option( + None, + metavar="|PIPE|", + help="pipe to read NEF data from, for testing [overrides stdin !use stdin instead!]", + ), + file_names: List[Path] = typer.Argument( + ..., help="the file to read", metavar="" + ), +): + """- convert fasta sequence to nef""" + + chain_codes = parse_comma_separated_options(chain_codes) + no_chain_starts = parse_comma_separated_options(no_chain_starts) + no_chain_ends = parse_comma_separated_options(no_chain_ends) + starts = [int(elem) for elem in parse_comma_separated_options(starts)] + + args = get_args() + + process_sequences(args) + + +def process_sequences(args): + fasta_frames = [] + + chain_code_iterator = chain_code_iter(args.chain_codes) + fasta_sequences = OrderedSet() + for file_path in args.file_names: + fasta_sequences.update(read_sequences(file_path, chain_code_iterator)) + + read_chain_codes = residues_to_chain_codes(fasta_sequences) + + offsets = _get_sequence_offsets(read_chain_codes, args.starts) + + fasta_sequences = offset_chain_residues(fasta_sequences, offsets) + + fasta_sequences = sorted( + fasta_sequences, key=lambda x: (x.chain_code, x.sequence_code) + ) + + fasta_frames.append(sequence_to_nef_frame(fasta_sequences)) + + entry = process_stream_and_add_frames(fasta_frames, args) + + print(entry) + + +def _get_sequence_offsets(chain_codes: List[str], starts: List[int]): + + offsets = [start - 1 for start in starts] + cycle_starts = chain( + offsets, + cycle( + [ + 0, + ] + ), + ) + offsets = list(islice(cycle_starts, len(chain_codes))) + + return {chain_code: offset for chain_code, offset in zip(chain_codes, offsets)} + + +def residues_to_chain_codes(residues: List[SequenceResidue]) -> List[str]: + return list(OrderedSet([residue.chain_code for residue in residues])) + + +# could do with taking a list of offsets +# noinspection PyUnusedLocal +def read_sequences(path: Path, chain_codes: Iterable[str]) -> List[SequenceResidue]: + + residues = OrderedSet() + try: + with open(path) as handle: + try: + sequences = list(SimpleFastaParser(handle)) + except Exception as e: + # check if relative to os.getcwd + exit_error(f"Error reading fasta file {str(path)}", e) + + number_sequences = len(sequences) + + # read as many chain codes as there are sequences + # https://stackoverflow.com/questions/16188270/get-a-fixed-number-of-items-from-a-generator + + chain_codes = list(islice(chain_codes, number_sequences)) + + for (meta_data, sequence), chain_code in 
zip_longest( + sequences, chain_codes + ): + + sequence_3_let = translate_1_to_3(sequence) + chain_residues = sequence_3let_to_res(sequence_3_let, chain_code) + + residues.update(chain_residues) + + except IOError as e: + exit_error(f"couldn't open {path} because:\n{e}", e) + + return residues diff --git a/src/nef_pipelines/transcoders/mars/__init__.py b/src/nef_pipelines/transcoders/mars/__init__.py new file mode 100644 index 0000000..68aaf1e --- /dev/null +++ b/src/nef_pipelines/transcoders/mars/__init__.py @@ -0,0 +1,18 @@ +import typer + +import nef_pipelines +from nef_pipelines import nef_app + +app = typer.Typer() +export_app = typer.Typer() + +if nef_app.app: + + nef_app.app.add_typer(app, name="mars", help="- export mars [shifts and sequences]") + + app.add_typer( + export_app, name="export", help="- export mars [shifts and sequences]" + ) + + # import of specific importers must be after app creation to avoid circular imports + import nef_pipelines.transcoders.mars.exporters.shifts # noqa: F401 diff --git a/src/nef_pipelines/transcoders/mars/exporters/__init__.py b/src/nef_pipelines/transcoders/mars/exporters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nef_pipelines/transcoders/mars/exporters/shifts.py b/src/nef_pipelines/transcoders/mars/exporters/shifts.py new file mode 100644 index 0000000..0f1777d --- /dev/null +++ b/src/nef_pipelines/transcoders/mars/exporters/shifts.py @@ -0,0 +1,161 @@ +# original implementation by esther +from dataclasses import dataclass +from string import digits +from typing import List + +import typer +from tabulate import tabulate + +from nef_pipelines.lib.nef_lib import ( + create_entry_from_stdin_or_exit, + select_frames_by_name, +) +from nef_pipelines.lib.shift_lib import nef_frames_to_shifts +from nef_pipelines.lib.util import exit_error, is_float, is_int +from nef_pipelines.transcoders.mars import export_app + +app = typer.Typer() + +# TODO: name translations +# TODO: correct weights +# TODO: move utuilities to lib +# TODO: support multiple chains + +REMOVE_DIGITS = str.maketrans("", "", digits) + + +def has_numbers(input: str) -> bool: + return any(char.isdigit() for char in input) + + +# noinspection PyUnusedLocal +@export_app.command() +def shifts( + shift_frames: List[str] = typer.Option( + [], + "-f", + "--frame", + help="selector for the shift restraint frames to use, can be called multiple times and include wild cards", + ), + chains: str = typer.Option( + [], + "-c", + "--chain", + help="chains to export, to add mutiple chains use repeated calls [default: 'A']", + metavar="", + ), +): + """- convert nef chemical shifts to mars""" + + if len(shift_frames) == 0: + shift_frames = ["*"] + + entry = create_entry_from_stdin_or_exit() + + frames = entry.get_saveframes_by_category("nef_chemical_shift_list") + + frames = select_frames_by_name(frames, shift_frames) + + if len(frames) == 0: + exit_error("no shift frames selected") + + shifts = nef_frames_to_shifts(frames) + + @dataclass(frozen=True, order=True) + class PseudoAtom: + sequence_code: int + residue_name: str + negative_offset: int + atom_name: str + + pseudo_atom_shifts = {} + for shift in shifts: + + # first deal with completely unassigned residues chain_code @- and sequence_code @xxxx where xxx + # is the pseudo residue + chain_code = shift.atom.chain_code + sequence_code = shift.atom.sequence_code + atom_name = shift.atom.atom_name + if chain_code.startswith("@-") and sequence_code.startswith("@"): + sequence_code = sequence_code.lstrip("@") + 
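+            # e.g. (illustrative values): a mars pseudo residue label '@127'
+            # parses to sequence_code 127 with offset 0, while '@127-1' parses
+            # to sequence_code 127 with a negative_offset of 1, i.e. the
+            # preceding residue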
diff --git a/src/nef_pipelines/transcoders/mars/__init__.py b/src/nef_pipelines/transcoders/mars/__init__.py
new file mode 100644
index 0000000..68aaf1e
--- /dev/null
+++ b/src/nef_pipelines/transcoders/mars/__init__.py
@@ -0,0 +1,18 @@
import typer

import nef_pipelines
from nef_pipelines import nef_app

app = typer.Typer()
export_app = typer.Typer()

if nef_app.app:

    nef_app.app.add_typer(app, name="mars", help="- export mars [shifts and sequences]")

    app.add_typer(
        export_app, name="export", help="- export mars [shifts and sequences]"
    )

    # import of specific importers must be after app creation to avoid circular imports
    import nef_pipelines.transcoders.mars.exporters.shifts  # noqa: F401
diff --git a/src/nef_pipelines/transcoders/mars/exporters/__init__.py b/src/nef_pipelines/transcoders/mars/exporters/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/nef_pipelines/transcoders/mars/exporters/shifts.py b/src/nef_pipelines/transcoders/mars/exporters/shifts.py
new file mode 100644
index 0000000..0f1777d
--- /dev/null
+++ b/src/nef_pipelines/transcoders/mars/exporters/shifts.py
@@ -0,0 +1,161 @@
# original implementation by esther
from dataclasses import dataclass
from string import digits
from typing import List

import typer
from tabulate import tabulate

from nef_pipelines.lib.nef_lib import (
    create_entry_from_stdin_or_exit,
    select_frames_by_name,
)
from nef_pipelines.lib.shift_lib import nef_frames_to_shifts
from nef_pipelines.lib.util import exit_error, is_float, is_int
from nef_pipelines.transcoders.mars import export_app

app = typer.Typer()

# TODO: name translations
# TODO: correct weights
# TODO: move utilities to lib
# TODO: support multiple chains

REMOVE_DIGITS = str.maketrans("", "", digits)


def has_numbers(input: str) -> bool:
    return any(char.isdigit() for char in input)


# noinspection PyUnusedLocal
@export_app.command()
def shifts(
    shift_frames: List[str] = typer.Option(
        [],
        "-f",
        "--frame",
        help="selector for the shift restraint frames to use, can be called multiple times and include wild cards",
    ),
    chains: List[str] = typer.Option(
        [],
        "-c",
        "--chain",
        help="chains to export, to add multiple chains use repeated calls [default: 'A']",
        metavar="",
    ),
):
    """- convert nef chemical shifts to mars"""

    if len(shift_frames) == 0:
        shift_frames = ["*"]

    entry = create_entry_from_stdin_or_exit()

    frames = entry.get_saveframes_by_category("nef_chemical_shift_list")

    frames = select_frames_by_name(frames, shift_frames)

    if len(frames) == 0:
        exit_error("no shift frames selected")

    shifts = nef_frames_to_shifts(frames)

    @dataclass(frozen=True, order=True)
    class PseudoAtom:
        sequence_code: int
        residue_name: str
        negative_offset: int
        atom_name: str

    pseudo_atom_shifts = {}
    for shift in shifts:

        # first deal with completely unassigned residues: chain_code @- and sequence_code @xxxx
        # where xxxx is the pseudo residue
        chain_code = shift.atom.chain_code
        sequence_code = shift.atom.sequence_code
        atom_name = shift.atom.atom_name
        if chain_code.startswith("@-") and sequence_code.startswith("@"):
            sequence_code = sequence_code.lstrip("@")
            sequence_code_fields = sequence_code.split("-")

            if len(sequence_code_fields) > 2:
                continue

            sequence_code = sequence_code_fields[0]
            if not is_int(sequence_code):
                continue
            sequence_code = int(sequence_code)

            negative_offset = (
                sequence_code_fields[1] if len(sequence_code_fields) == 2 else 0
            )

            if not is_int(negative_offset):
                continue
            else:
                negative_offset = int(negative_offset)

            if negative_offset not in (0, 1):
                continue

            atom_name = atom_name.replace("@", "")
            if has_numbers(atom_name):
                continue

            pseudo_atom = PseudoAtom(
                sequence_code,
                shift.atom.residue_name,
                negative_offset,
                atom_name=atom_name,
            )

            if not is_float(shift.shift):
                continue

            # TODO: should be value
            value = float(shift.shift)

            pseudo_atom_shifts[pseudo_atom] = value

    headings = (
        ("H", ("H", 0)),
        ("N", ("N", 0)),
        ("Ca", ("CA", 0)),
        ("Ca-1", ("CA", 1)),
        ("Cb", ("CB", 0)),
        ("Cb-1", ("CB", 1)),
        ("CO", ("C", 0)),
        ("CO-1", ("C", 1)),
        ("HA", ("HA", 0)),
        ("HA-1", ("HA", 1)),
    )
    headers = (" ", "H", "N", "CA", "CA-1", "CB", "CB-1", "CO", "CO-1", "HA", "HA-1")

    pseudo_residues = {}
    for pseudo_atom in pseudo_atom_shifts:
        key = (pseudo_atom.atom_name, pseudo_atom.negative_offset)
        pseudo_residues.setdefault(pseudo_atom.sequence_code, {})[key] = pseudo_atom

    lines = []
    for pseudo_residue_num in sorted(pseudo_residues):
        pseudo_atoms = pseudo_residues[pseudo_residue_num]

        line = []
        lines.append(line)

        line.append("PR_" + str(pseudo_residue_num))

        for heading, heading_key in headings:

            if heading_key in pseudo_atoms:

                pseudo_atom = pseudo_atoms[heading_key]
                shift = pseudo_atom_shifts[pseudo_atom]

                line.append("%-7.3f " % shift)
            else:
                line.append("- ")

    print(tabulate(lines, headers=headers, tablefmt="plain"))
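The exporter above relies on the convention that a completely unassigned NEF shift carries a pseudo-residue sequence code such as @127 (or @127-1 for the preceding residue). A self-contained sketch mirroring that parsing (hypothetical inputs; the real command additionally filters on chain code and atom name):

    def parse_pseudo_sequence_code(sequence_code):
        # strip the leading @, split off an optional "-1" style offset and
        # reject anything more complicated than residue[-offset]
        fields = sequence_code.lstrip("@").split("-")
        if len(fields) > 2 or not all(field.isdigit() for field in fields):
            return None
        offset = int(fields[1]) if len(fields) == 2 else 0
        if offset not in (0, 1):
            return None
        return int(fields[0]), offset

    assert parse_pseudo_sequence_code("@127") == (127, 0)
    assert parse_pseudo_sequence_code("@127-1") == (127, 1)
    assert parse_pseudo_sequence_code("@127-1-2") is None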
diff --git a/src/nef_pipelines/transcoders/nmrpipe/__init__.py b/src/nef_pipelines/transcoders/nmrpipe/__init__.py
new file mode 100644
index 0000000..1a06db0
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrpipe/__init__.py
@@ -0,0 +1,21 @@
import typer

import nef_pipelines
from nef_pipelines import nef_app

app = typer.Typer()
import_app = typer.Typer()

if nef_app.app:
    nef_app.app.add_typer(
        app, name="nmrpipe", help="- read nmrpipe [peaks, shifts & sequences]"
    )

    app.add_typer(
        import_app, name="import", help="- import nmrpipe [peaks, shifts & sequences]"
    )

    # import of specific importers must be after app creation to avoid circular imports
    import nef_pipelines.transcoders.nmrpipe.importers.peaks  # noqa: F401
    import nef_pipelines.transcoders.nmrpipe.importers.sequence  # noqa: F401
    import nef_pipelines.transcoders.nmrpipe.importers.shifts  # noqa: F401
diff --git a/src/nef_pipelines/transcoders/nmrpipe/importers/__init__.py b/src/nef_pipelines/transcoders/nmrpipe/importers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/nef_pipelines/transcoders/nmrpipe/importers/peaks.py b/src/nef_pipelines/transcoders/nmrpipe/importers/peaks.py
new file mode 100644
index 0000000..3c82a1c
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrpipe/importers/peaks.py
@@ -0,0 +1,84 @@
from pathlib import Path
from textwrap import dedent
from typing import List

import typer

from nef_pipelines.lib.typer_utils import get_args
from nef_pipelines.lib.util import (
    cached_file_stream,
    exit_error,
    process_stream_and_add_frames,
)
from nef_pipelines.transcoders.nmrpipe import import_app

from ...nmrview.importers.peaks import create_spectrum_frame
from ..nmrpipe_lib import (
    NMRPIPE_PEAK_EXPECTED_FIELDS,
    check_is_peak_file,
    get_gdb_columns,
    read_db_file_records,
    read_peak_file,
)

app = typer.Typer()


# noinspection PyUnusedLocal
@import_app.command(no_args_is_help=True)
def peaks(
    entry_name: str = typer.Option("nmrpipe", help="a name for the entry"),
    chain_code: str = typer.Option(
        "A", "--chain", help="chain code", metavar=""
    ),
    axis_codes: str = typer.Option(
        "1H.15N", metavar="", help="a list of axis codes joined by dots"
    ),
    filter_noise: bool = typer.Option(True, help="remove peaks labelled as noise"),
    file_names: List[Path] = typer.Argument(
        ..., help="input peak files", metavar=""
    ),
):
    """convert nmrpipe peak files [.tab] to NEF"""

    args = get_args()

    peak_lists = _read_nmrpipe_peaks(args)

    frames = []
    for i, peak_list in enumerate(peak_lists, start=1):

        frame_name = entry_name if len(peak_lists) == 1 else f"{entry_name}_{i}"

        frames.append(create_spectrum_frame(args, frame_name, peak_list))

    entry = process_stream_and_add_frames(frames, args)

    print(entry)


def _read_nmrpipe_peaks(args):
    results = []
    for file_name in args.file_names:
        file_h = cached_file_stream(file_name)
        gdb_file = read_db_file_records(file_h, file_name=file_name)

        _check_is_peak_file_or_exit(gdb_file)

        results.append(read_peak_file(gdb_file, args))

    return results


def _check_is_peak_file_or_exit(gdb_file):
    columns = set(get_gdb_columns(gdb_file))

    if not check_is_peak_file(gdb_file):
        msg = f"""\
        this gdb file doesn't appear to contain all the columns expected for a peak file
        expected: {','.join(NMRPIPE_PEAK_EXPECTED_FIELDS)}
        got: {','.join(columns & set(NMRPIPE_PEAK_EXPECTED_FIELDS))}
        file: {gdb_file.file_name}
        """
        msg = dedent(msg)
        exit_error(msg)
diff --git a/src/nef_pipelines/transcoders/nmrpipe/importers/sequence.py b/src/nef_pipelines/transcoders/nmrpipe/importers/sequence.py
new file mode 100644
index 0000000..5449933
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrpipe/importers/sequence.py
@@ -0,0 +1,121 @@
# TODO add support for DATA FIRST_RESID
# noinspection PyUnusedLocal
from argparse import Namespace
from pathlib import Path
from typing import Iterable, List

import typer

from nef_pipelines.lib.sequence_lib import (
    chain_code_iter,
    offset_chain_residues,
    sequence_3let_to_sequence_residues,
    sequence_to_nef_frame,
)
from nef_pipelines.lib.structures import SequenceResidue
from nef_pipelines.lib.typer_utils import get_args
from nef_pipelines.lib.util import (
    cached_file_stream,
    exit_error,
    process_stream_and_add_frames,
)
from nef_pipelines.transcoders.nmrpipe import import_app
from nef_pipelines.transcoders.nmrpipe.nmrpipe_lib import (
    gdb_to_3let_sequence,
    read_db_file_records,
)

app = typer.Typer()


# noinspection PyUnusedLocal
@import_app.command()
def sequence(
    chain_codes: str = typer.Option(
        "A",
        "--chains",
        help="chain codes as a list of names separated by dots",
        metavar="",
    ),
    no_chain_start: bool = typer.Option(
        False,
        "--no-chain-start/",
        help="don't include a start of chain link type for the first residue",
    ),
    no_chain_end: bool = typer.Option(
        False,
        "--no-chain-end/",
        help="don't include an end of chain link type for the last residue",
    ),
    entry_name: str = typer.Option("nmrpipe", help="a name for the entry"),
    pipe: Path = typer.Option(
        None,
        metavar="|PIPE|",
        help="pipe to read NEF data from, for testing [overrides stdin !use stdin instead!]",
    ),
    start: int = typer.Option(
        1, metavar="offset", help="residue number to start the sequence at"
    ),
    file_names: List[Path] = typer.Argument(
        ..., help="input nmrpipe files containing DATA SEQUENCE records", metavar=""
    ),
):
    """convert nmrpipe sequence records [DATA SEQUENCE] to NEF"""
    try:
        args = get_args()

        process_sequence(args)
    except Exception as e:
        exit_error(f"reading sequence failed because {e}", e)


def process_sequence(args: Namespace):
    nmrpipe_frames = []

    for file_name, chain_code in zip(
        args.file_names, chain_code_iter(args.chain_codes)
    ):
        with cached_file_stream(file_name) as lines:

            nmrpipe_sequence = read_sequence(
                lines,
                chain_code=chain_code,
                sequence_file_name=file_name,
                start=args.start,
            )

            frame = sequence_to_nef_frame(nmrpipe_sequence, args.entry_name)

            nmrpipe_frames.append(frame)

    entry = process_stream_and_add_frames(nmrpipe_frames, args)

    print(entry)


def read_sequence(
    sequence_lines: Iterable[str],
    chain_code: str = "A",
    sequence_file_name: str = "unknown",
    start: int = 1,
) -> List[SequenceResidue]:

    gdb_file = read_db_file_records(sequence_lines, sequence_file_name)

    sequence_3let = gdb_to_3let_sequence(gdb_file)

    sequence_residues = sequence_3let_to_sequence_residues(
        sequence_3let, chain_code=chain_code
    )

    sequence_residues = offset_chain_residues(
        sequence_residues, {chain_code: start - 1}
    )

    return sequence_residues


if __name__ == "__main__":

    typer.run(sequence)
diff --git a/src/nef_pipelines/transcoders/nmrpipe/importers/shifts.py b/src/nef_pipelines/transcoders/nmrpipe/importers/shifts.py
new file mode 100644
index 0000000..10acdaa
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrpipe/importers/shifts.py
@@ -0,0 +1,81 @@
from argparse import Namespace
from pathlib import Path
from typing import Iterable, List

import typer

from nef_pipelines.lib.sequence_lib import chain_code_iter
from nef_pipelines.lib.shift_lib import shifts_to_nef_frame
from nef_pipelines.lib.structures import ShiftList
from nef_pipelines.lib.typer_utils import get_args
from nef_pipelines.lib.util import (
    cached_file_stream,
    exit_error,
    process_stream_and_add_frames,
)
from nef_pipelines.transcoders.nmrpipe import import_app
from nef_pipelines.transcoders.nmrpipe.nmrpipe_lib import (
    read_db_file_records,
    read_shift_file,
)

app = typer.Typer()


# noinspection PyUnusedLocal
@import_app.command()
def shifts(
    chain_codes: str = typer.Option(
        "A",
        "--chains",
        help="chain codes as a list of names separated by dots",
        metavar="",
    ),
    entry_name: str = typer.Option("nmrpipe", help="a name for the entry"),
    pipe: Path = typer.Option(
        None,
        metavar="|PIPE|",
        help="pipe to read NEF data from, for testing [overrides stdin !use stdin instead!]",
    ),
    file_names: List[Path] = typer.Argument(
        ..., help="input files of type nmrpipe.tab", metavar=""
    ),
):
    """convert nmrpipe shift file .tab files to NEF"""
    try:
        args = get_args()

        process_shifts(args)
    except Exception as e:
        exit_error(f"reading shifts failed because {e}", e)


def process_shifts(args: Namespace):
    nmrpipe_frames = []

    for file_name, chain_code in zip(
        args.file_names, chain_code_iter(args.chain_codes)
    ):
        with cached_file_stream(file_name) as lines:

            nmrpipe_shifts = read_shifts(lines, chain_code=chain_code)

            frame = shifts_to_nef_frame(nmrpipe_shifts, args.entry_name)
            nmrpipe_frames.append(frame)

    entry = process_stream_and_add_frames(nmrpipe_frames, args)

    print(entry)


def read_shifts(
    shift_lines: Iterable[str],
    chain_code: str = "A",
    sequence_file_name: str = "unknown",
) -> ShiftList:

    gdb_file = read_db_file_records(shift_lines, sequence_file_name)

    return read_shift_file(gdb_file, chain_code)
diff --git a/src/nef_pipelines/transcoders/nmrpipe/nmrpipe_lib.py b/src/nef_pipelines/transcoders/nmrpipe/nmrpipe_lib.py
new file mode 100644
index 0000000..fb4d3be
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrpipe/nmrpipe_lib.py
@@ -0,0 +1,688 @@
import functools
import operator
import re
from collections import Counter
from dataclasses import dataclass, field
from enum import IntEnum
from textwrap import dedent
from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union

from tabulate import tabulate

from nef_pipelines.lib.sequence_lib import (
    TRANSLATIONS_1_3,
    make_chunked_sequence_1let,
    translate_1_to_3,
)
from nef_pipelines.lib.structures import (
    AtomLabel,
    LineInfo,
    PeakAxis,
    PeakList,
    PeakListData,
    PeakValues,
    SequenceResidue,
    ShiftData,
    ShiftList,
)
from nef_pipelines.lib.util import is_int


class PEAK_TYPES(IntEnum):
    PEAK = 1
    NOISE = 2
    SINC_WIGGLE = 3


# known field types
DATA = "DATA"
REMARK = "REMARK"
VALUES = "__VALUES__"
SEQUENCE = "SEQUENCE"
FORMAT = "FORMAT"
VARS = "VARS"
COMMENT = "#"
NULLSTRING = "NULLSTRING"
NULLVALUE = "NULLVALUE"

NMRPIPE_PEAK_EXPECTED_FIELDS = "INDEX X_AXIS XW XW_HZ ASS CLUSTID MEMCNT".split()
NMRPIPE_SHIFTS_EXPECTED_FIELDS = "RESID RESNAME ATOMNAME SHIFT".split()


@dataclass(frozen=True)
class DbRecord:
    index: int
    type: str
    values: Tuple[Union[int, str, float], ...]
    line_info: LineInfo = None


@dataclass
class DbFile:
    name: str = "unknown"
    records: List[DbRecord] = field(default_factory=list)


def _raise_data_before_format(line_info):
    msg = f"""\
    bad nmrpipe db file, data seen before VARS and FORMAT
    file: {line_info.file_name}
    line no: {line_info.line_no}
    line: {line_info.line}
    """
    msg = dedent(msg)
    raise DataBeforeFormat(msg)


def read_db_file_records(file_h: TextIO, file_name: str = "unknown") -> DbFile:

    """
    Read from an NmrPipe (NIH) tab/gdb file
    Args:
        file_h (TextIO): a file like object
        file_name (str): the name of the file being read (for debugging)

    Returns DbFile:
        a list of all the records in the file
    """

    records: List[DbRecord] = []
    column_names = None
    column_formats = None
    record_count = Counter()
    in_header = True

    for line_index, line in enumerate(file_h):

        line_info = LineInfo(file_name, line_index + 1, line)
        line = line.strip()

        raw_fields = line.strip().split()

        if len(line) == 0:
            continue

        record_type = raw_fields[0]
        record_count[record_type] += 1

        fields = raw_fields

        handled = False

        if in_header:
            if record_type == "VARS":

                column_names = fields[1:]
                records.append(
                    DbRecord(
                        record_count[record_type], record_type, column_names, line_info
                    )
                )
                handled = True

                if record_count[record_type] != 1:
                    _raise_multiple(record_type, line_info)

            if record_type == "FORMAT":
                column_formats = _formats_to_constructors(fields, line_info)

                _check_var_and_format_count_raise_if_bad(
                    column_names, column_formats, line_info
                )

                records.append(
                    DbRecord(
                        record_count[record_type], record_type, fields[1:], line_info
                    )
                )
                in_header = False
                continue

            if record_type in ("REMARK", "#"):
                records.append(
                    DbRecord(record_count[record_type], record_type, line, line_info)
                )
                handled = True

            if record_type == DATA:
                records.append(
                    DbRecord(
                        record_count[record_type], record_type, fields[1:], line_info
                    )
                )
                handled = True

            if not handled:
                _raise_data_before_format(line_info)

        if not in_header:

            if record_type in (VARS, FORMAT):
                _raise_multiple(record_type, line_info)

            if column_names and column_formats:
                record_count["__VALUES__"] += 1

                del record_count[record_type]

                values = _build_values_or_raise(
                    column_formats, column_names, fields, line_info
                )

                record = DbRecord(
                    record_count["__VALUES__"], "__VALUES__", values, line_info
                )
                records.append(record)

                handled = True

            if not handled:
                records.append(
                    DbRecord(record_count[record_type], record_type, fields[1:], line_info)
                )

    return DbFile(file_name, records)
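For reference, a minimal tab fragment and how it parses with the reader above (a sketch, assuming the package is importable; the column set is illustrative rather than a complete peak table):

    import io
    from textwrap import dedent

    from nef_pipelines.transcoders.nmrpipe.nmrpipe_lib import read_db_file_records

    EXAMPLE_TAB = dedent("""\
        REMARK a minimal example

        VARS   INDEX X_PPM
        FORMAT %5d   %8.3f

        1 8.032
        2 7.561
    """)

    gdb_file = read_db_file_records(io.StringIO(EXAMPLE_TAB), file_name="example.tab")
    values = [record.values for record in gdb_file.records if record.type == "__VALUES__"]
    assert values == [[1, 8.032], [2, 7.561]]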
def _find_nth(haystack, needle, n):
    # https://stackoverflow.com/questions/1883980/find-the-nth-occurrence-of-substring-in-a-string/41626399#41626399
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + len(needle))
        n -= 1
    return start


def _build_values_or_raise(column_formats, column_names, fields, line_info):
    columns_formats = _check_column_count_raise_if_bad(
        column_formats, column_names, line_info
    )
    result = []
    field_count = Counter()
    for column_no, (raw_field, constructor) in enumerate(zip(fields, columns_formats)):
        try:
            field_count[raw_field] += 1
            value = constructor(raw_field)

        except Exception:
            absolute_column = _find_nth(
                line_info.line, raw_field, field_count[raw_field]
            )
            msg = f"""
            Couldn't convert {raw_field} to type {_constructor_to_name(constructor)}
            file: {line_info.file_name}
            line no: {line_info.line_no}
            column: {column_no + 1}
            line: {line_info.line.rstrip()}
                  {' ' * absolute_column + '^'}
            """
            msg = dedent(msg)
            raise BadFieldFormat(msg)
        result.append(value)
    return result


def _raise_multiple(format_str, line_info):
    msg = f"""\
    bad NMRPipe db file, multiple {format_str} statements found
    file: {line_info.file_name}
    line no: {line_info.line_no}
    line: {line_info.line}
    """
    msg = dedent(msg)

    if format_str == "VARS":
        raise MultipleVars(msg)
    elif format_str == "FORMAT":
        raise MultipleFormat(msg)


def _check_var_and_format_count_raise_if_bad(column_names, column_formats, line_info):
    if column_names is None:
        msg = f"""\
        no column names defined by a VARS line when FORMAT line read
        file: {line_info.file_name}
        line no: {line_info.line_no}
        line: {line_info.line}"""
        msg = dedent(msg)
        raise NoVarsLine(msg)

    num_formats = len(column_formats)
    num_column_names = len(column_names)
    if num_formats != num_column_names:
        msg = f"""\
        number of column names and formats must agree
        got {num_column_names} column names and {num_formats} formats
        file: {line_info.file_name}
        line no: {line_info.line_no}
        line: {line_info.line}
        """
        msg = dedent(msg)

        raise WrongColumnCount(msg)
def _check_column_count_raise_if_bad(column_formats, column_names, line_info):
    raw_fields = line_info.line.split()
    num_fields = len(raw_fields)
    num_columns = len(column_formats)

    missing_fields = ["*"] * abs(num_fields - num_columns)
    raw_fields = [*raw_fields, *missing_fields]

    if num_fields != num_columns:
        column_formats = _constructor_names(column_formats)
        tab = [
            column_names,
            column_formats,
            raw_fields,
        ]
        tabulated = tabulate(tab, tablefmt="plain")
        msg = f"""\
        number of fields ({num_fields}) doesn't match number of columns ({num_columns})

        expected
        %s

        missing fields marked with *

        file: {line_info.file_name}
        line no: {line_info.line_no}
        line: {line_info.line}

        """
        msg = dedent(msg)
        msg = msg % tabulated
        raise WrongColumnCount(msg)
    return column_formats


def _formats_to_constructors(formats, line_info):
    result = []

    field_counter = Counter()
    for column_index, field_format in enumerate(formats[1:]):
        field_counter[field_format] += 1
        field_format = field_format.strip()
        field_format = field_format[-1]

        if field_format == "d":
            result.append(int)
        elif field_format == "f":
            result.append(float)
        elif field_format == "s":
            result.append(str)
        elif field_format == "e":
            result.append(float)
        else:
            format_column = _find_nth(
                line_info.line, field_format, field_counter[field_format]
            )
            msg = f"""
            unexpected format {field_format} at index {column_index + 1}, expected formats are:
            s, d, e, f (string, integer, scientific(float), float)

            file: {line_info.file_name}
            line no: {line_info.line_no}
            line: {line_info.line}
            {' ' * format_column + '^'}

            """
            raise BadFieldFormat(msg)
    return result


OptionDbRecordPredicate = Optional[Callable[[DbRecord], bool]]


def select_records(
    gdb: DbFile, record_type: str, predicate: OptionDbRecordPredicate = None
) -> List[DbRecord]:
    """
    Select records from a gdb file by type and predicate
    Args:
        gdb (DbFile): gdb/tab file
        record_type (str): the type of the record: #, REMARK, __VALUES__ etc
        predicate (OptionDbRecordPredicate): an optional test to apply to the record

    Returns List[DbRecord]:
        the selected gdb/tab records
    """
    result = [record for record in gdb.records if record.type == record_type]
    if predicate:
        result = [record for record in result if predicate(record)]
    return result


def gdb_to_3let_sequence(
    gdb: DbFile, translations: Dict[str, str] = TRANSLATIONS_1_3
) -> List[str]:
    """
    read DATA SEQUENCE records from a gdb file and convert them to a list of 3 letter residue codes,
    it is assumed that residues start from 1 and are in chain A
    Args:
        gdb (DbFile): the source db/tab file records
        translations (Dict[str, str]): a translation table for 1 letter codes to 3 letter codes

    Returns List[str]:
        a list of 3 letter residue codes
    """
    sequence_records = select_records(
        gdb, "DATA", predicate=lambda rec: rec.values[0] == "SEQUENCE"
    )

    sequence = [record.values[1:] for record in sequence_records]

    flattened_sequence = functools.reduce(operator.iconcat, sequence, [])
    sequence_string = "".join(flattened_sequence)

    return translate_1_to_3(sequence_string, translations)
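A sketch of the two helpers above working together (assuming the package is importable; translate_1_to_3 is expected to expand one-letter codes, e.g. M -> MET):

    import io

    from nef_pipelines.transcoders.nmrpipe.nmrpipe_lib import (
        gdb_to_3let_sequence,
        read_db_file_records,
    )

    # DATA records are legal in the header, so a sequence-only fragment parses fine
    gdb_file = read_db_file_records(io.StringIO("DATA SEQUENCE MKVL\nDATA SEQUENCE GG\n"))

    print(gdb_to_3let_sequence(gdb_file))
    # expected, assuming upper-case three-letter codes:
    # ['MET', 'LYS', 'VAL', 'LEU', 'GLY', 'GLY']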
def _assignments_to_atom_labels(assignments, dimensions, chain_code="A"):
    result = []

    for assignment in assignments:

        residue_name = None
        len_assignment = len(assignment)
        if len_assignment > 0:
            residue_name = translate_1_to_3(assignment[0], unknown=".")[0]

        sequence_code = None
        if len_assignment > 1:
            raw_sequence_code = assignment[1]
            if is_int(raw_sequence_code):
                sequence_code = int(raw_sequence_code)

        atom_name = None
        if len_assignment > 2:
            atom_name = assignment[2]

        result.append(
            AtomLabel(
                SequenceResidue(chain_code, sequence_code, residue_name), atom_name
            )
        )

    len_result = len(result)
    if len_result < dimensions:
        for _ in range(dimensions - len_result):
            result.append(AtomLabel(SequenceResidue(None, None, None), None))
    return result


# NMRPIPE VARS https://spin.niddk.nih.gov/NMRPipe/doc2new/
#
# INDEX = 'INDEX' # REQUIRED - The unique peak ID number.
# X_AXIS = 'X_AXIS' # REQUIRED - Peak position: in points in 1st dimension, from left of spectrum limit
# Y_AXIS = 'Y_AXIS' # REQUIRED - Peak position: in points in 2nd dimension, from bottom of spectrum limit
# DX = 'DX' # NOT REQUIRED - Estimate of the error in peak position due to random noise, in points.
# DY = 'DY' # NOT REQUIRED - Estimate of the error in peak position due to random noise, in points.
# X_PPM = 'X_PPM' # NOT REQUIRED - Peak position: in ppm in 1st dimension
# Y_PPM = 'Y_PPM' # NOT REQUIRED - Peak position: in ppm in 2nd dimension
# X_HZ = 'X_HZ' # NOT REQUIRED - Peak position: in Hz in 1st dimension
# Y_HZ = 'Y_HZ' # NOT REQUIRED - Peak position: in Hz in 2nd dimension
# XW = 'XW' # REQUIRED - Peak width: in points in 1st dimension
# YW = 'YW' # REQUIRED - Peak width: in points in 2nd dimension
# XW_HZ = 'XW_HZ' # REQUIRED - Peak width: in Hz in 1st dimension
# YW_HZ = 'YW_HZ' # REQUIRED - Peak width: in Hz in 2nd dimension
# X1 = 'X1' # NOT REQUIRED - Left border of peak in 1st dim, in points
# X3 = 'X3' # NOT REQUIRED - Right border of peak in 1st dim, in points
# Y1 = 'Y1' # NOT REQUIRED - Left border of peak in 2nd dim, in points
# Y3 = 'Y3' # NOT REQUIRED - Right border of peak in 2nd dim, in points
# HEIGHT = 'HEIGHT' # NOT REQUIRED - Peak height
# DHEIGHT = 'DHEIGHT' # NOT REQUIRED - Peak height error
# VOL = 'VOL' # NOT REQUIRED - Peak volume
# PCHI2 = 'PCHI2' # NOT REQUIRED - the Chi-square probability for the peak (i.e. probability due to the noise)
# TYPE = 'TYPE' # NOT REQUIRED - the peak classification; 1=Peak, 2=Random Noise, 3=Truncation artifact.
# ASS = 'ASS' # REQUIRED - Peak assignment
# CLUSTID = 'CLUSTID' # REQUIRED - Peak cluster id. Peaks with the same CLUSTID value are overlapped.
# MEMCNT = 'MEMCNT' # REQUIRED - the total number of peaks which are in a given peak's cluster
# (i.e. peaks which have the same CLUSTID value)
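A short sketch of the label builder above (hypothetical values; note the function is module-private). Assignments arrive as [residue_letter, sequence_code, atom_name] triples and short peaks are padded out to the spectrum dimensionality with empty labels:

    from nef_pipelines.transcoders.nmrpipe.nmrpipe_lib import _assignments_to_atom_labels

    labels = _assignments_to_atom_labels([["G", "31", "N"]], dimensions=2)
    # roughly: [AtomLabel(SequenceResidue('A', 31, 'GLY'), 'N'),
    #           AtomLabel(SequenceResidue(None, None, None), None)]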
def get_column_indices(gdb_file):
    return {column: index for index, column in enumerate(get_gdb_columns(gdb_file))}


def _mean(values):
    return sum(values) / len(values)


def _get_peak_list_dimension(gdb_file):

    return len(_get_axis_labels(gdb_file))


def _get_axis_labels(gdb_file):
    columns = get_gdb_columns(gdb_file)

    result = []
    for var in columns:
        if var.endswith("_AXIS"):
            result.append(var.split("_")[0])
    return result


def get_gdb_columns(gdb_file):
    return select_records(gdb_file, VARS)[0].values


def check_is_peak_file(gdb_file):
    columns = set(get_gdb_columns(gdb_file))
    expected_fields = set(NMRPIPE_PEAK_EXPECTED_FIELDS)

    return expected_fields.issubset(columns)


def check_is_shifts_file(gdb_file):
    columns = set(get_gdb_columns(gdb_file))
    expected_fields = set(NMRPIPE_SHIFTS_EXPECTED_FIELDS)

    return expected_fields.issubset(columns)


def read_peak_file(gdb_file, args):
    data = select_records(gdb_file, VALUES)

    dimensions = _get_peak_list_dimension(gdb_file)

    column_indices = get_column_indices(gdb_file)

    axis_labels = _get_axis_labels(gdb_file)

    spectrometer_frequencies = [[] for _ in range(dimensions)]

    raw_peaks = []
    for index, line in enumerate(data, start=1):

        peak_type = line.values[column_indices["TYPE"]]
        if args.filter_noise and peak_type != PEAK_TYPES.PEAK:
            continue

        # only append the peak once we know it isn't filtered out as noise
        peak = {}
        raw_peaks.append(peak)

        assignment = line.values[column_indices["ASS"]]
        assignments = assignment.split("-")
        assignments = _propagate_assignments(assignments)
        assignments = _assignments_to_atom_labels(assignments, dimensions)

        height = line.values[column_indices["HEIGHT"]]
        # height_error = line.values[column_indices["DHEIGHT"]]
        # height_percentage_error = height_error / height
        volume = line.values[column_indices["VOL"]]
        # volume_error = volume * height_percentage_error

        peak_values = PeakValues(
            serial=index, volume=volume, height=height, deleted=False, comment=""
        )
        peak["values"] = peak_values

        for i, dimension in enumerate("X Y Z A".split()[:dimensions]):

            shift = line.values[column_indices["%s_PPM" % dimension]]

            # point_error = line.values[column_indices["D%s" % dimension]]

            # point = line.values[column_indices["%s_AXIS" % dimension]]
            # shift_error = point_error / point * shift

            pos_hz = line.values[column_indices["%s_HZ" % dimension]]

            axis = PeakAxis(atom_labels=assignments[i], ppm=shift, merit=1)

            peak[i] = axis

            sf = pos_hz / shift

            spectrometer_frequencies[i].append(sf)

    spectrometer_frequencies = [
        _mean(frequencies) for frequencies in spectrometer_frequencies
    ]
    header_data = PeakListData(
        num_axis=dimensions,
        axis_labels=axis_labels,
        data_set=None,
        sweep_widths=None,
        spectrometer_frequencies=spectrometer_frequencies,
    )

    peak_list = PeakList(header_data, raw_peaks)

    return peak_list


def read_shift_file(gdb_file, chain_code="A"):
    data = select_records(gdb_file, VALUES)

    column_indices = get_column_indices(gdb_file)

    raw_shifts = []
    for index, line in enumerate(data, start=1):

        atom_name = line.values[column_indices["ATOMNAME"]]
        residue_number = line.values[column_indices["RESID"]]
        residue_type = line.values[column_indices["RESNAME"]]
        shift = line.values[column_indices["SHIFT"]]

        atom = AtomLabel(
            SequenceResidue(chain_code, residue_number, residue_type), atom_name
        )

        shift = ShiftData(atom, shift)

        raw_shifts.append(shift)

    return ShiftList(raw_shifts)
def _propagate_assignments(assignments):

    current = []
    result = []
    for assignment in assignments:
        fields = re.split(r"(\d+)", assignment)
        if len(fields) == 3:
            current = fields[:2]
        elif len(fields) == 1:
            fields = [*current, *fields]

        result.append(fields)

    return result


def _constructor_to_name(constructor):
    constructors_to_type_name = {int: "int", float: "float", str: "str"}

    return constructors_to_type_name[constructor]


def _constructor_names(constructors):

    result = []
    for constructor in constructors:
        result.append(_constructor_to_name(constructor))

    return result


class BadNmrPipeFile(Exception):
    """
    Base exception for bad nmr pipe files, this is the one to catch!
    """

    pass


class BadFieldFormat(BadNmrPipeFile):
    """
    One of the fields in the file has a bad format
    """

    pass


class WrongColumnCount(BadNmrPipeFile):
    """
    The number of columns in the VARS, FORMAT or data lines disagree in their count
    """

    pass


class NoVarsLine(BadNmrPipeFile):
    """
    Tried to read data without a VARS line
    """

    pass


class NoFormatLine(BadNmrPipeFile):
    """
    Tried to read data without a FORMAT line
    """

    pass


class MultipleVars(BadNmrPipeFile):
    """
    Multiple VARS lines detected
    """

    pass


class MultipleFormat(BadNmrPipeFile):
    """
    Multiple FORMAT lines detected
    """

    pass


class DataBeforeFormat(BadNmrPipeFile):
    """
    Data seen before VARS and FORMAT lines detected
    """

    pass


def print_pipe_sequence(sequence_1_let: List[str]) -> None:

    """
    print a set of 1 letter amino acid codes as nmrpipe DATA SEQUENCE records
    :param sequence_1_let: 1 letter amino acid codes as a list of strings
    :return: None, the nicely formatted records are printed to stdout
    """

    row_strings = make_chunked_sequence_1let(sequence_1_let)

    for row_string in row_strings:
        print(f"DATA SEQUENCE {row_string}")
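_propagate_assignments deserves a note: NMRPipe assignment strings such as G31N-H only name the residue once, and the helper copies the residue fields onto the bare atom names that follow. A minimal check (assuming the package is importable):

    from nef_pipelines.transcoders.nmrpipe.nmrpipe_lib import _propagate_assignments

    assert _propagate_assignments(["G31N", "H"]) == [["G", "31", "N"], ["G", "31", "H"]]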
diff --git a/src/nef_pipelines/transcoders/nmrview/__init__.py b/src/nef_pipelines/transcoders/nmrview/__init__.py
new file mode 100644
index 0000000..cf347d2
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrview/__init__.py
@@ -0,0 +1,28 @@
import typer

from nef_pipelines import nef_app

app = typer.Typer()
import_app = typer.Typer()
export_app = typer.Typer()

if nef_app.app:

    nef_app.app.add_typer(
        app,
        name="nmrview",
        help="- read and write nmrview [peaks, sequences & shifts]",
    )

    app.add_typer(
        import_app, name="import", help="- import nmrview [peaks, sequences & shifts]"
    )
    app.add_typer(
        export_app, name="export", help="- export nmrview [peaks, sequences & shifts]"
    )

    # import of specific importers must be after app creation to avoid circular imports
    import nef_pipelines.transcoders.nmrview.exporters.peaks  # noqa: F401
    import nef_pipelines.transcoders.nmrview.importers.peaks  # noqa: F401
    import nef_pipelines.transcoders.nmrview.importers.sequence  # noqa: F401
    import nef_pipelines.transcoders.nmrview.importers.shifts  # noqa: F401
diff --git a/src/nef_pipelines/transcoders/nmrview/exporters/peaks.py b/src/nef_pipelines/transcoders/nmrview/exporters/peaks.py
new file mode 100644
index 0000000..0d64094
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrview/exporters/peaks.py
@@ -0,0 +1,441 @@
from argparse import Namespace
from collections import Counter
from enum import auto
from fnmatch import fnmatch
from pathlib import Path
from typing import Dict, List, Tuple, Union

import typer
from strenum import StrEnum
from tabulate import tabulate

from nef_pipelines.lib.constants import NEF_UNKNOWN
from nef_pipelines.lib.nef_lib import (
    create_entry_from_stdin_or_exit,
    loop_row_dict_iter,
    loop_row_namespace_iter,
)
from nef_pipelines.lib.structures import AtomLabel, Peak, PeakValues, SequenceResidue
from nef_pipelines.lib.util import exit_error, is_float, parse_comma_separated_options
from nef_pipelines.transcoders.nmrview import export_app

app = typer.Typer()


class HEADINGS(StrEnum):
    LABEL = auto()
    SHIFT = auto()
    WIDTH = auto()
    BOUND = auto()
    ERROR = auto()
    COUPLING = auto()
    COMMENT = auto()


HEADING_TRANSLATIONS = {
    HEADINGS.LABEL: "L",  # the assignment
    HEADINGS.SHIFT: "P",  # the shift
    HEADINGS.WIDTH: "W",  # width at half height in ppm
    HEADINGS.BOUND: "B",  # bound width at lowest contour level when picked in ppm, assume gaussian?
    HEADINGS.ERROR: "E",  # an error code, always '++' ?
    HEADINGS.COUPLING: "J",  # always 0.00 for us
    HEADINGS.COMMENT: "U",  # user comment, always currently {?}
}


def _peak_to_atom_labels(row: Namespace) -> List[AtomLabel]:
    result = []
    for i in range(1, 15):
        dim_chain_code = f"chain_code_{i}"
        dim_sequence_code = f"sequence_code_{i}"
        dim_residue_name = f"residue_name_{i}"
        dim_atom_name = f"atom_name_{i}"

        if (
            dim_chain_code in row
            and dim_sequence_code in row
            and dim_residue_name in row
            and dim_atom_name in row
        ):
            chain_code = getattr(row, dim_chain_code)
            sequence_code = getattr(row, dim_sequence_code)
            residue_name = getattr(row, dim_residue_name)
            atom_name = getattr(row, dim_atom_name)

            result.append(
                AtomLabel(
                    SequenceResidue(chain_code, sequence_code, residue_name), atom_name
                )
            )

    return result


def _atom_label_to_nmrview(label: AtomLabel) -> str:
    return f"{{{label.residue.sequence_code}.{label.atom_name}}}"


def _convert_to_floats_or_exit(
    putative_floats: List[str], thing_types: str
) -> List[float]:
    are_floats = [
        is_float(spectrometer_frequency) for spectrometer_frequency in putative_floats
    ]
    if not all(are_floats):
        bad_dims = [
            str(dim)
            for dim, is_float_value in enumerate(are_floats, start=1)
            if not is_float_value
        ]
        bad_values = [
            value
            for is_float_value, value in zip(are_floats, putative_floats)
            if not is_float_value
        ]
        message = f"""\
        the following {thing_types} cannot be converted to floats
        dims: {', '.join(bad_dims)}
        bad values: {', '.join(bad_values)}
        """

        exit_error(message)

    return [float(value) for value in putative_floats]


def _row_to_peak(
    axis_names: Tuple[str], row: Dict[str, Union[int, str, float]]
) -> Peak:
    peak_id = row["peak_id"]
    volume = row["volume"] if row["volume"] != NEF_UNKNOWN else None
    volume_uncertainty = (
        row["volume_uncertainty"] if row["volume_uncertainty"] != NEF_UNKNOWN else None
    )
    height = row["height"] if row["height"] != NEF_UNKNOWN else None
    height_uncertainty = (
        row["height_uncertainty"] if row["height_uncertainty"] != NEF_UNKNOWN else None
    )

    positions = {}
    position_uncertainties = {}
    for axis_index, axis_name in enumerate(axis_names, start=1):
        position_tag = f"position_{axis_index}"
        position_uncertainty_tag = f"position_uncertainty_{axis_index}"
        positions[axis_name] = row[position_tag]
        position_uncertainties[axis_name] = (
            row[position_uncertainty_tag]
            if row[position_uncertainty_tag] != NEF_UNKNOWN
            else None
        )

    assignments = {}
    for axis_index, axis_name 
in enumerate(axis_names, start=1): + assignment_chain_code_tag = f"chain_code_{axis_index}" + assignment_sequence_code_tag = f"sequence_code_{axis_index}" + assignment_residue_name_tag = f"residue_name_{axis_index}" + assignment_atom_name_tag = f"atom_name_{axis_index}" + + chain_code = row[assignment_chain_code_tag] + sequence_code = row[assignment_sequence_code_tag] + residue_name = row[assignment_residue_name_tag] + atom_name = row[assignment_atom_name_tag] + + is_assigned = ( + chain_code != NEF_UNKNOWN + and sequence_code != NEF_UNKNOWN + and atom_name != NEF_UNKNOWN + ) + atom_labels = ( + [ + AtomLabel( + SequenceResidue(chain_code, sequence_code, residue_name), atom_name + ), + ] + if is_assigned + else [] + ) + + assignments[axis_name] = atom_labels + + uncertainties_none = [ + uncertainty is None for uncertainty in position_uncertainties.values() + ] + if all(uncertainties_none): + position_uncertainties = None + + peak_values = PeakValues( + serial=peak_id + 1, + volume=volume, + height=height, + volume_uncertainty=volume_uncertainty, + height_uncertainty=height_uncertainty, + ) + return Peak( + id=peak_id, + values=peak_values, + positions=positions, + position_uncertainties=position_uncertainties, + assignments=assignments, + ) + + +def _peak_to_nmrview_label(atom_label: AtomLabel, default_chain=None): + result = [] + + nmrview_chain = ( + atom_label.residue.chain_code + if atom_label.residue.chain_code != NEF_UNKNOWN + else default_chain + ) + if nmrview_chain is not None: + result.append(nmrview_chain) + + nmrview_residue = ( + atom_label.residue.sequence_code + if atom_label.residue.sequence_code != NEF_UNKNOWN + else "{?}" + ) + + result.append(str(nmrview_residue)) + + nmrview_atom = atom_label.atom_name + result.append(nmrview_atom) + + return f'{{{".".join(result)}}}' + + +def _build_nmrview_row(peak, axis_names): + + # TODO: are we handling peak serials correctly... 
+ result = [peak.id] + nmrview_assignments = _peak_to_assignments(peak, axis_names) + for axis_name in axis_names: + for heading, letter in HEADING_TRANSLATIONS.items(): + if heading == HEADINGS.LABEL: + assignment = nmrview_assignments[axis_name] + if len(assignment.strip()) == 0: + assignment = "{?}" + result.append(assignment) + + if heading == HEADINGS.SHIFT: + result.append(f"{peak.positions[axis_name]:.3f}") + + # TODO need default values per isotope and some calculations + if heading == HEADINGS.WIDTH: + # TODO: place holder + result.append("0.024") + + # TODO need default values per isotope and some calculations or + if heading == HEADINGS.BOUND: + # TODO: place holder + result.append("0.051") + + if heading == HEADINGS.ERROR: + # TODO: is this the correct value for a correct peak + # most probably the best we can do is presume all peaks are good + result.append("++") + + if heading == HEADINGS.COUPLING: + result.append("0.000") + + if heading == HEADINGS.COMMENT: + # TODO: is there a peak axis comment in ccpn/nef + result.append("{?}") + + volume = peak.values.volume if peak.values.volume else 0.000 + result.append(f"{volume:.3f}") + + height = peak.values.height if peak.values.height else 0.000 + result.append(f"{height:.3f}") + + # status + result.append("0") + + if peak.values.comment != "": + result.append(f"{{{peak.values.comment}}}") + else: + result.append("{?}") + + # flag0 + result.append("0") + + return result + + +def _peak_to_assignments(peak: Peak, axis_names: Tuple[str]) -> List[str]: + # print(peak) + num_assignments = [len(peak.assignments[axis]) for axis in axis_names] + max_num_assignments = max(num_assignments) + + # print('assignments', peak.assignments) + + if max_num_assignments == 0: + result = {axis_name: "{?}" for axis_name in axis_names} + else: + result = {} + for axis_name in axis_names: + result[axis_name] = " ".join( + [ + _peak_to_nmrview_label(atom_label) + for atom_label in peak.assignments[axis_name] + ] + ) + + return result + + +def _make_names_unique(axis_names: List[str]) -> List[str]: + + seen = Counter() + for axis_name in axis_names: + seen[axis_name] += 1 + + result = [] + modified = Counter() + for axis_name in axis_names: + if seen[axis_name] > 1: + modified[axis_name] += 1 + axis_name = f"{axis_name}_{modified[axis_name]}" + result.append(axis_name) + + return result + + +# noinspection PyUnusedLocal +@export_app.command() +def peaks( + file_name_template: str = typer.Option( + "%s", + help="the template for the filename to export to %s will get replaced by the axis_name of the peak frame", + metavar="", + ), + frame_selectors: List[str] = typer.Argument( + None, help="the names of the frames to export", metavar="" + ), +): + frame_selectors = parse_comma_separated_options(frame_selectors) + + if len(frame_selectors) == 0: + frame_selectors = [ + "*", + ] + + entry = create_entry_from_stdin_or_exit() + + SPECTRUM_CATEGORY = "nef_nmr_spectrum" + peaks = entry.get_saveframes_by_category(SPECTRUM_CATEGORY) + + names_and_frames = { + frame.name[len(SPECTRUM_CATEGORY) :].lstrip("_"): frame for frame in peaks + } + + selected_frames = {} + for frame_selector in frame_selectors: + selection = { + name: frame + for name, frame in names_and_frames.items() + if fnmatch(name, frame_selector) + } + + selected_frames.update(selection) + + if len(selected_frames) == 0: + for frame_selector in frame_selectors: + selection = { + name: frame + for name, frame in names_and_frames.items() + if fnmatch(name, f"*{frame_selector}*") + } + + 
            selected_frames.update(selection)

    if "%s" not in file_name_template and len(selected_frames) > 1:
        exit_error(
            f"%s is not in the filename template and there is more than one file, template {file_name_template}"
        )

    # bad_characters = '''\
    # `:';~@$%^&().<>"
    # '''
    # replacements = '_' * len(bad_characters)
    # translation_table = str.maketrans(bad_characters, replacements)

    for frame_name, frame in selected_frames.items():
        # peak_list_name = f'''{frame_name.translate(translation_table).strip('_')}.xpk'''

        SPECTRUM_DIMENSIONS = "_nef_spectrum_dimension"
        PEAKS = "_nef_peak"
        spectrum_dimensions = list(loop_row_namespace_iter(frame[SPECTRUM_DIMENSIONS]))

        spectrum_name = (
            frame["ccpn_spectrum_file_path"][0]
            if "ccpn_spectrum_file_path" in frame
            else "unknown"
        )
        spectrum_name = Path(spectrum_name).parts[-1]

        result = ["label dataset sw sf"]

        axis_names = [
            dimension.axis_code
            if "axis_code" in dimension and dimension.axis_code != NEF_UNKNOWN
            else "unknown"
            for dimension in spectrum_dimensions
        ]

        axis_names = _make_names_unique(axis_names)

        result.append(" ".join(axis_names))

        # TODO: issue a warning if the spectrum name doesn't end in .nv!
        nmrview_spectrum_name = Path(spectrum_name)
        nmrview_spectrum_name = f"{nmrview_spectrum_name.stem}.nv"
        result.append(nmrview_spectrum_name)

        spectrometer_frequencies = [
            dimension.spectrometer_frequency for dimension in spectrum_dimensions
        ]
        spectrometer_frequencies = _convert_to_floats_or_exit(
            spectrometer_frequencies, "spectrometer_frequencies"
        )

        sweep_widths = [dimension.spectral_width for dimension in spectrum_dimensions]
        sweep_widths = _convert_to_floats_or_exit(sweep_widths, "sweep_widths")

        sweep_widths = [
            sweep_width * spectrometer_frequency
            for sweep_width, spectrometer_frequency in zip(
                sweep_widths, spectrometer_frequencies
            )
        ]

        sweep_widths = [f"{{{sweep_width:.3f}}}" for sweep_width in sweep_widths]
        result.append(" ".join(sweep_widths))

        spectrometer_frequencies = [
            f"{{{spectrometer_frequency:.3f}}}"
            for spectrometer_frequency in spectrometer_frequencies
        ]
        result.append(" ".join(spectrometer_frequencies))

        print("\n".join(result))

        headings = [
            f"{axis_name}.{item}" for axis_name in axis_names for item in "LPWBEJU"
        ]
        headings.extend("vol int stat comment flag0".split())
        headings.insert(0, "")

        table = [headings]

        pipeline_peaks = []
        for peak_row in list(loop_row_dict_iter(frame[PEAKS])):
            pipeline_peaks.append(_row_to_peak(axis_names, peak_row))

        for peak in pipeline_peaks:

            row = _build_nmrview_row(peak, axis_names)
            table.append(row)

        print(tabulate(table, tablefmt="plain"))
diff --git a/src/nef_pipelines/transcoders/nmrview/importers/__init__.py b/src/nef_pipelines/transcoders/nmrview/importers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/nef_pipelines/transcoders/nmrview/importers/peaks.py b/src/nef_pipelines/transcoders/nmrview/importers/peaks.py
new file mode 100644
index 0000000..a21743d
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrview/importers/peaks.py
@@ -0,0 +1,498 @@
# TODO: add reading of sequence from input stream
# TODO: xplor names -> iupac and deal with %% and ## properly
# TODO: add common experiment types
# TODO: guess axis codes
# TODO: add a chemical shift list reference
# TODO: _nef_nmr_spectrum: value_first_point, folding, absolute_peak_positions, is_acquisition
# TODO: cleanup
# TODO: add function
# TODO: remove ics
# TODO: multiple assignments per peak... 
howto in nef +# TODO: add libs pipeline +# TODO axis codes need to be guessed +# TODO doesn't check if peaks are deleted (should have an option to read all peaks?) + +import itertools +from collections import OrderedDict +from pathlib import Path +from typing import Dict, List, Tuple + +import typer +from ordered_set import OrderedSet +from pynmrstar import Entry, Loop, Saveframe + +from nef_pipelines.lib import constants +from nef_pipelines.lib.constants import NEF_UNKNOWN +from nef_pipelines.lib.structures import ( + AtomLabel, + PeakAxis, + PeakList, + PeakListData, + PeakValues, + SequenceResidue, +) +from nef_pipelines.lib.typer_utils import get_args +from nef_pipelines.lib.util import ( + exit_error, + get_pipe_file, + process_stream_and_add_frames, +) +from nef_pipelines.transcoders.nmrview import import_app + +from ..nmrview_lib import parse_float_list, parse_tcl +from .sequence import read_sequence + +app = typer.Typer() + + +# noinspection PyUnusedLocal +@import_app.command(no_args_is_help=True) +def peaks( + entry_name: str = typer.Option( + "nmrview", + "-n", + "--name", + help="a name for a shift frame, additional calls add more names", + ), + chain_code: str = typer.Option( + "A", "--chain", help="chain code", metavar="" + ), + sequence: str = typer.Option( + None, + "-s", + "--sequence", + metavar=".seq)", + help="seq file for the chain .seq", + ), + axis_codes: str = typer.Option( + "1H.15N", + "-a", + "--axis", + metavar="", + help="a list of axis codes joined by dots", + ), + file_names: List[Path] = typer.Argument( + ..., help="input peak files", metavar="" + ), +): + """convert nmrview peak file .xpk files to NEF""" + args = get_args() + + raw_sequence = _get_sequence_or_exit(args) + sequence = _sequence_to_residue_type_lookup(raw_sequence) + + frames = [ + read_xpk_file(args, sequence), + ] + + entry = process_stream_and_add_frames(frames, args) + print(entry) + + +def read_xpk_file(args, sequence, entry_name=None): + + with open(args.file_names[0], "r") as lines: + peaks_list = read_raw_peaks(lines, args.chain_code, sequence) + + if not entry_name: + entry_name = make_peak_list_entry_name(peaks_list) + + return create_spectrum_frame(args, entry_name, peaks_list) + + +def read_raw_peaks(lines, chain_code, sequence): + + header = get_header_or_exit(lines) + + header_data = read_header_data(lines, header) + + column_indices = read_peak_columns(lines, header_data) + + raw_peaks = read_peak_data(lines, header_data, column_indices, chain_code, sequence) + + return PeakList(header_data, raw_peaks) + + +def read_peak_data(lines, header_data, column_indices, chain_code, sequence): + raw_peaks = [] + field = None + axis_index = None + for line_no, raw_line in enumerate(lines): + if not len(raw_line.strip()): + continue + try: + peak = {} + line = parse_tcl(raw_line) + # TODO validate and report errors + peak_index = int(line[0]) + + for axis_index, axis in enumerate(header_data.axis_labels): + axis_values = read_axis_for_peak( + line, axis, column_indices, chain_code, sequence + ) + + peak[axis_index] = PeakAxis(*axis_values) + + raw_values = read_values_for_peak(line, column_indices) + peak["values"] = PeakValues(peak_index, **raw_values) + + raw_peaks.append(peak) + + except Exception as e: + field = str(field) if field else "unknown" + msg = ( + f"failed to parse file a line {line_no} with input: '{raw_line.strip()}' field: {field} axis: " + f" {axis_index + 1} exception: {e}" + ) + exit_error(msg) + return raw_peaks + + +def read_peak_columns(lines, header_data): + line = 
next(lines) + raw_headings = line.split() + heading_indices = OrderedDict({"index": 0}) + for axis_index, axis in enumerate(header_data.axis_labels): + for axis_field in list("LPWBEJU"): + header = f"{axis}.{axis_field}" + if header in raw_headings: + heading_indices[header] = raw_headings.index(header) + 1 + for peak_item in ["vol", "int", "stat", "comment", "flag0"]: + if peak_item in raw_headings: + heading_indices[peak_item] = raw_headings.index(peak_item) + 1 + return heading_indices + + +def read_header_data(lines, headers): + data_set = None + sweep_widths = [] + spectrometer_frequencies = [] + num_axis = None + axis_labels = None + for header_no, header_type in enumerate(headers): + line = next(lines) + if header_type == "label": + axis_labels = line.strip().split() + num_axis = len(axis_labels) + elif header_type == "dataset": + data_set = line.strip() + elif header_type == "sw": + line_no = header_no + 2 + sweep_widths = parse_float_list(line, line_no) + check_num_fields(sweep_widths, num_axis, "sweep widths", line, line_no) + elif header_type == "sf": + line_no = header_no + 2 + spectrometer_frequencies = parse_float_list(line, line_no) + check_num_fields( + spectrometer_frequencies, + num_axis, + "spectrometer frequencies", + line, + line_no, + ) + + # sweep widths are in ppm for nef! + sweep_widths = [float(sweep_width) for sweep_width in sweep_widths] + spectrometer_frequencies = [ + float(spectrometer_frequency) + for spectrometer_frequency in spectrometer_frequencies + ] + + for i, (sweep_width, spectrometer_frequency) in enumerate( + zip(sweep_widths, spectrometer_frequencies) + ): + sweep_widths[i] = f"{sweep_width / spectrometer_frequency:.4f}" + + # TODO: peaks shifts, spectrometer frequencies how many decimal points + spectrometer_frequencies = [ + f"{spectrometer_frequency:4}" + for spectrometer_frequency in spectrometer_frequencies + ] + + peak_list_data = PeakListData( + num_axis, axis_labels, data_set, sweep_widths, spectrometer_frequencies + ) + return peak_list_data + + +def get_header_or_exit(lines): + header_items = ["label", "dataset", "sw", "sf"] + + line = next(lines) + + headers = [] + if line: + headers = line.strip().split() + + if len(headers) != 4: + msg = f"""this doesn't look like an nmrview xpk file, + i expected a header containing 4 items on the first line: {','.join(header_items)} + i got {line} at line 1""" + exit_error(msg) + + for name in header_items: + if name not in headers: + msg = f"""this doesn't look like an nmrview xpk file, + i expected a header containing the values: {', '.join(header_items)} + i got '{line}' at line 1""" + exit_error(msg) + + return headers + + +def read_axis_for_peak(line, axis, heading_indices, chain_code, sequence): + axis_values = [] + for axis_field in list("LPWBEJU"): + header = f"{axis}.{axis_field}" + field_index = heading_indices[header] + value = line[field_index] + + if axis_field == "L": + label = value[0] if value else "?" 
+ if label == "?": + residue_number = None + atom_name = "" + else: + residue_number, atom_name = label.split(".") + residue_number = int(residue_number) + + if residue_number: + residue_type = sequence.setdefault((chain_code, residue_number), None) + else: + residue_type = "" + + if residue_number: + atom = AtomLabel( + SequenceResidue(chain_code, residue_number, residue_type), + atom_name.upper(), + ) + else: + atom = AtomLabel(SequenceResidue("", None, ""), atom_name.upper()) + axis_values.append(atom) + + elif axis_field == "P": + shift = float(value) + axis_values.append(shift) + elif axis_field in "WJU": + pass + elif axis_field == "E": + merit = value + axis_values.append(merit) + return axis_values + + +def read_values_for_peak(line, heading_indices): + peak_values = {} + for value_field in ["vol", "int", "stat", "comment", "flag0"]: + field_index = heading_indices[value_field] + value = line[field_index] + + if value_field == "vol": + peak_values["volume"] = float(value) + elif value_field == "int": + peak_values["height"] = float(value) + elif value_field == "stat": + peak_values["deleted"] = int(value) < 0 + elif value_field == "comment": + comment = value[0].strip("'") if value else "" + peak_values["comment"] = comment + elif value_field == "flag0": + pass + + return peak_values + + +def check_num_fields(fields, number, field_type, line, line_no): + if len(fields) != number: + msg = f"Expected {number} {field_type} got {len(fields)} for line: {line} at line {line_no}" + exit_error(msg) + + +def _sequence_to_residue_type_lookup( + sequence: List[SequenceResidue], +) -> Dict[Tuple[str, int], str]: + result: Dict[Tuple[str, int], str] = {} + for residue in sequence: + result[residue.chain_code, residue.sequence_code] = residue.residue_name + return result + + +def _get_isotope_code_or_exit(axis, axis_codes): + if axis >= len(axis_codes): + msg = f"can't find isotope code for axis {axis + 1} got axis codes {','.join(axis_codes)}" + exit_error(msg) + axis_code = axis_codes[axis] + return axis_code + + +def sequence_from_frames(frames: Saveframe): + + residues = OrderedSet() + for frame in frames: + for loop in frame: + chain_code_index = loop.tag_index("chain_code") + sequence_code_index = loop.tag_index("sequence_code") + residue_name_index = loop.tag_index("residue_name") + + for line in loop: + chain_code = line[chain_code_index] + sequence_code = line[sequence_code_index] + residue_name = line[residue_name_index] + residue = SequenceResidue(chain_code, sequence_code, residue_name) + residues.append(residue) + + return list(residues) + + +def _get_sequence_or_exit(args): + sequence_file = None + if "sequence" in args: + sequence_file = args.sequence + + if not sequence_file: + try: + stream = get_pipe_file(args) + entry = Entry.from_file(stream) + frames = entry.get_saveframes_by_category("nef_molecular_system") + sequence = sequence_from_frames(frames) + + except Exception as e: + exit_error(f"failed to read sequence from input stream because {e}", e) + + else: + with open(sequence_file, "r") as lines: + sequence = read_sequence(lines, chain_code=args.chain_code) + return sequence + + +def create_spectrum_frame(args, entry_name, peaks_list): + + category = "nef_nmr_spectrum" + frame_code = f"{category}_{entry_name}" + frame = Saveframe.from_scratch(frame_code, category) + + frame.add_tag("sf_category", category) + frame.add_tag("sf_framecode", frame_code) + frame.add_tag("num_dimensions", peaks_list.peak_list_data.num_axis) + frame.add_tag("chemical_shift_list", 
constants.NEF_UNKNOWN) + loop = Loop.from_scratch("nef_spectrum_dimension") + frame.add_loop(loop) + list_tags = ( + "dimension_id", + "axis_unit", + "axis_code", + "spectrometer_frequency", + "spectral_width", + "value_first_point", + "folding", + "absolute_peak_positions", + "is_acquisition", + ) + loop.add_tag(list_tags) + list_data = peaks_list.peak_list_data + for i in range(list_data.num_axis): + for tag in list_tags: + if tag == "dimension_id": + loop.add_data_by_tag(tag, i + 1) + elif tag == "axis_unit": + loop.add_data_by_tag(tag, "ppm") + elif tag == "axis_code": + axis_codes = args.axis_codes.split(".") + loop.add_data_by_tag(tag, _get_isotope_code_or_exit(i, axis_codes)) + elif tag == "spectrometer_frequency": + loop.add_data_by_tag(tag, list_data.spectrometer_frequencies[i]) + elif tag == "spectral_width": + if list_data.sweep_widths: + loop.add_data_by_tag(tag, list_data.sweep_widths[i]) + else: + loop.add_data_by_tag(tag, NEF_UNKNOWN) + elif tag == "folding": + loop.add_data_by_tag(tag, "circular") + elif tag == "absolute_peak_positions": + loop.add_data_by_tag(tag, "true") + else: + loop.add_data_by_tag(tag, NEF_UNKNOWN) + loop = Loop.from_scratch("nef_spectrum_dimension_transfer") + frame.add_loop(loop) + transfer_dim_tags = ("dimension_1", "dimension_2", "transfer_type") + loop.add_tag(transfer_dim_tags) + loop = Loop.from_scratch("nef_peak") + frame.add_loop(loop) + peak_tags = [ + "index", + "peak_id", + "volume", + "volume_uncertainty", + "height", + "height_uncertainty", + ] + position_tags = [ + (f"position_{i + 1}", f"position_uncertainty_{i + 1}") + for i in range(list_data.num_axis) + ] + position_tags = itertools.chain(*position_tags) + atom_name_tags = [ + ( + f"chain_code_{i + 1}", + f"sequence_code_{i + 1}", + f"residue_name_{i + 1}", + f"atom_name_{i + 1}", + ) + for i in range(list_data.num_axis) + ] + atom_name_tags = itertools.chain(*atom_name_tags) + tags = [*peak_tags, *position_tags, *atom_name_tags] + + loop.add_tag(tags) + for i, peak in enumerate(peaks_list.peaks): + peak_values = peak["values"] + if peak_values.deleted: + continue + + for tag in tags: + + if tag == "index": + loop.add_data_by_tag(tag, i + 1) + elif tag == "peak_id": + loop.add_data_by_tag(tag, peak_values.serial) + elif tag == "volume": + loop.add_data_by_tag(tag, peak_values.volume) + elif tag == "height": + loop.add_data_by_tag(tag, peak_values.height) + elif tag.split("_")[0] == "position" and len(tag.split("_")) == 2: + index = int(tag.split("_")[-1]) - 1 + loop.add_data_by_tag(tag, peak[index].ppm) + elif tag.split("_")[:2] == ["chain", "code"]: + index = int(tag.split("_")[-1]) - 1 + chain_code = peak[index].atom_labels.residue.chain_code + chain_code = chain_code if chain_code is not None else args.chain_code + chain_code = chain_code if chain_code else "." + loop.add_data_by_tag(tag, chain_code) + elif tag.split("_")[:2] == ["sequence", "code"]: + index = int(tag.split("_")[-1]) - 1 + sequence_code = peak[index].atom_labels.residue.sequence_code + sequence_code = sequence_code if sequence_code else "." + loop.add_data_by_tag(tag, sequence_code) + elif tag.split("_")[:2] == ["residue", "name"]: + index = int(tag.split("_")[-1]) - 1 + + # TODO: there could be more than 1 atom label here and this should be a list... + residue_name = peak[index].atom_labels.residue.residue_name + residue_name = residue_name if residue_name else "." 
+ loop.add_data_by_tag(tag, residue_name) + elif tag.split("_")[:2] == ["atom", "name"]: + index = int(tag.split("_")[-1]) - 1 + + atom_name = peak[index].atom_labels.atom_name + atom_name = atom_name if atom_name else "." + loop.add_data_by_tag(tag, atom_name) + else: + loop.add_data_by_tag(tag, constants.NEF_UNKNOWN) + return frame + + +def make_peak_list_entry_name(peaks_list): + entry_name = peaks_list.peak_list_data.data_set.replace(" ", "_") + entry_name = entry_name.removesuffix(".nv") + entry_name = entry_name.replace(".", "_") + return entry_name diff --git a/src/nef_pipelines/transcoders/nmrview/importers/sequence.py b/src/nef_pipelines/transcoders/nmrview/importers/sequence.py new file mode 100644 index 0000000..a38af0f --- /dev/null +++ b/src/nef_pipelines/transcoders/nmrview/importers/sequence.py @@ -0,0 +1,74 @@ +from argparse import Namespace +from pathlib import Path +from typing import List + +import typer + +from nef_pipelines.lib.sequence_lib import chain_code_iter, sequence_to_nef_frame +from nef_pipelines.lib.typer_utils import get_args +from nef_pipelines.lib.util import exit_error, process_stream_and_add_frames +from nef_pipelines.transcoders.nmrview import import_app +from nef_pipelines.transcoders.nmrview.nmrview_lib import read_sequence + +app = typer.Typer() + + +# noinspection PyUnusedLocal +@import_app.command(no_args_is_help=True) +def sequence( + chain_codes: str = typer.Option( + "A", + "--chains", + help="chain codes as a list of names separated by dots", + metavar="", + ), + no_chain_start: bool = typer.Option( + False, + "--no-chain-start/", + help="don't include a start of chain link type for the first residue", + ), + no_chain_end: bool = typer.Option( + False, + "--no-chain-end/", + help="don't include an end of chain link type for the last residue", + ), + entry_name: str = typer.Option("nmrview", help="a name for the entry"), + pipe: Path = typer.Option( + None, + metavar="|PIPE|", + help="pipe to read NEF data from, for testing [overrides stdin !use stdin instead!]", + ), + file_names: List[Path] = typer.Argument( + ..., help="input files of type nmrview.seq", metavar="" + ), +): + """convert nmrview sequence file .seq files to NEF""" + try: + args = get_args() + + process_sequence(args) + except Exception as e: + exit_error(f"reading sequence failed because {e}") + + +def process_sequence(args: Namespace): + nmrview_frames = [] + + for file_name, chain_code in zip( + args.file_names, chain_code_iter(args.chain_codes) + ): + with open(file_name, "r") as lines: + nmrview_sequence = read_sequence(lines, chain_code=chain_code) + + frame = sequence_to_nef_frame(nmrview_sequence, args.entry_name) + + nmrview_frames.append(frame) + + entry = process_stream_and_add_frames(nmrview_frames, args) + + print(entry) + + +if __name__ == "__main__": + + typer.run(sequence) diff --git a/src/nef_pipelines/transcoders/nmrview/importers/shifts.py b/src/nef_pipelines/transcoders/nmrview/importers/shifts.py new file mode 100644 index 0000000..85c179a --- /dev/null +++ b/src/nef_pipelines/transcoders/nmrview/importers/shifts.py @@ -0,0 +1,136 @@ +from argparse import Namespace +from pathlib import Path +from typing import Dict, List, Tuple + +import typer +from pynmrstar import Entry, Saveframe + +from nef_pipelines.lib.sequence_lib import chain_code_iter +from nef_pipelines.lib.shift_lib import shifts_to_nef_frame +from nef_pipelines.lib.structures import SequenceResidue +from nef_pipelines.lib.typer_utils import get_args +from nef_pipelines.lib.util import ( + 
diff --git a/src/nef_pipelines/transcoders/nmrview/importers/shifts.py b/src/nef_pipelines/transcoders/nmrview/importers/shifts.py
new file mode 100644
index 0000000..85c179a
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrview/importers/shifts.py
@@ -0,0 +1,136 @@
+from argparse import Namespace
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import typer
+from pynmrstar import Entry, Saveframe
+
+from nef_pipelines.lib.sequence_lib import chain_code_iter
+from nef_pipelines.lib.shift_lib import shifts_to_nef_frame
+from nef_pipelines.lib.structures import SequenceResidue
+from nef_pipelines.lib.typer_utils import get_args
+from nef_pipelines.lib.util import (
+    cached_file_stream,
+    exit_error,
+    get_pipe_file_or_exit,
+    process_stream_and_add_frames,
+)
+from nef_pipelines.transcoders.nmrview import import_app
+from nef_pipelines.transcoders.nmrview.nmrview_lib import parse_shifts, read_sequence
+
+app = typer.Typer()
+
+
+# noinspection PyUnusedLocal
+@import_app.command(no_args_is_help=True)
+def shifts(
+    chain_codes: str = typer.Option(
+        "A",
+        "--chains",
+        help="chain codes as a list of names separated by dots",
+        metavar="",
+    ),
+    entry_name: str = typer.Option("nmrview", help="a name for the entry"),
+    pipe: Path = typer.Option(
+        None,
+        metavar="|PIPE|",
+        help="pipe to read NEF data from, for testing [overrides stdin !use stdin instead!]",
+    ),
+    file_names: List[Path] = typer.Argument(
+        ..., help="input files of type nmrview.out", metavar=".out"
+    ),
+):
+    """convert nmrview .out shift files to NEF"""
+
+    try:
+        args = get_args()
+
+        process_shifts(args)
+    except Exception as e:
+        exit_error(f"reading shifts failed because {e}", e)
+
+
+#
+#
+def process_shifts(args: Namespace):
+    nmrview_frames = []
+
+    for file_name, chain_code in zip(
+        args.file_names, chain_code_iter(args.chain_codes)
+    ):
+
+        with cached_file_stream(file_name) as lines:
+
+            sequence = _get_sequence_or_exit(args)
+
+            chain_seqid_to_type = _sequence_to_residue_type_lookup(sequence)
+
+            nmrview_shifts = parse_shifts(
+                lines, chain_seqid_to_type, chain_code=chain_code, file_name=file_name
+            )
+
+            frame = shifts_to_nef_frame(nmrview_shifts, args.entry_name)
+
+            nmrview_frames.append(frame)
+
+    entry = process_stream_and_add_frames(nmrview_frames, args)
+
+    print(entry)
+
+
+def sequence_from_frames(frames: List[Saveframe]) -> List[SequenceResidue]:
+
+    residues = []
+    for frame in frames:
+        for loop in frame:
+            chain_code_index = loop.tag_index("chain_code")
+            sequence_code_index = loop.tag_index("sequence_code")
+            residue_name_index = loop.tag_index("residue_name")
+
+            for line in loop:
+                chain_code = line[chain_code_index]
+                sequence_code = int(line[sequence_code_index])
+                residue_name = line[residue_name_index]
+                residue = SequenceResidue(chain_code, sequence_code, residue_name)
+                residues.append(residue)
+
+    return residues
+
+
+# TODO this should be replaced by a library function from nef or sequence utils...
+# also need to report what file we are trying to read shifts from
+def _get_sequence_or_exit(args):
+    sequence_file = None
+    if "sequence" in args:
+        sequence_file = args.sequence
+
+    sequence = None
+    if not sequence_file:
+        try:
+            stream = get_pipe_file_or_exit(args)
+
+            entry = Entry.from_file(stream)
+            frames = entry.get_saveframes_by_category("nef_molecular_system")
+            sequence = sequence_from_frames(frames)
+
+        except Exception as e:
+            exit_error(f"failed to read sequence from input stream because {e}", e)
+
+    else:
+        with open(sequence_file, "r") as lines:
+            sequence = read_sequence(lines, chain_code=args.chain_code)
+    return sequence
+
+
+def _sequence_to_residue_type_lookup(
+    sequence: List[SequenceResidue],
+) -> Dict[Tuple[str, int], str]:
+    result: Dict[Tuple[str, int], str] = {}
+    for residue in sequence:
+        result[residue.chain_code, residue.sequence_code] = residue.residue_name
+    return result
+
+
+if __name__ == "__main__":
+
+    typer.run(shifts)
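A sketch of the lookup that _sequence_to_residue_type_lookup builds (illustrative only; the private function is imported here purely for demonstration), which parse_shifts below uses to attach residue names to shifts:

    from nef_pipelines.lib.structures import SequenceResidue
    from nef_pipelines.transcoders.nmrview.importers.shifts import (
        _sequence_to_residue_type_lookup,
    )

    sequence = [SequenceResidue("A", 1, "MET"), SequenceResidue("A", 2, "ALA")]

    # keyed by (chain_code, sequence_code): {("A", 1): "MET", ("A", 2): "ALA"}
    lookup = _sequence_to_residue_type_lookup(sequence)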
diff --git a/src/nef_pipelines/transcoders/nmrview/nmrview_lib.py b/src/nef_pipelines/transcoders/nmrview/nmrview_lib.py
new file mode 100644
index 0000000..c08c436
--- /dev/null
+++ b/src/nef_pipelines/transcoders/nmrview/nmrview_lib.py
@@ -0,0 +1,305 @@
+import functools
+from textwrap import dedent
+from typing import Dict, Iterable, List, Tuple
+
+from pyparsing import (
+    Forward,
+    Group,
+    ParseException,
+    ParserElement,
+    ParseResults,
+    Suppress,
+    Word,
+    ZeroOrMore,
+    printables,
+    restOfLine,
+)
+
+from nef_pipelines.lib.nef_lib import UNUSED
+from nef_pipelines.lib.structures import (
+    AtomLabel,
+    SequenceResidue,
+    ShiftData,
+    ShiftList,
+)
+from nef_pipelines.lib.util import exit_error
+
+
+def _process_emptys_and_singles(value: ParseResults) -> ParseResults:
+    """
+    a parse action for the tcl parser that replaces empty lists with empty strings and promotes
+    a result with a single item to the bare item
+
+    Args:
+        value (ParseResults): parse results to be modified
+
+    Returns:
+        ParseResults: corrected parse results
+    """
+
+    for i, item in enumerate(value):
+        if len(item) == 0:
+            value[i] = ""
+
+    if len(value) == 1:
+        value = value[0]
+
+    return value
+
+
+@functools.cache
+def get_tcl_parser() -> ParserElement:
+    """
+    build a simple tcl parser suitable for nmrview files and cache it
+
+    Returns:
+        ParserElement: the pyparsing parser
+    """
+
+    # TODO this should be printables excluding : " { }
+    simple_word = Word(initChars=printables, excludeChars='"{}')
+    simple_word.setName("simple_word")
+
+    expression = Forward()
+    expression.setName("expression")
+
+    DBL_QUOTE = Suppress('"')
+    LEFT_PAREN = Suppress("{")
+    RIGHT_PAREN = Suppress("}")
+
+    quoted_simple_word = DBL_QUOTE + simple_word + DBL_QUOTE
+    quoted_simple_word.setName("quoted_simple_word")
+
+    quoted_complex_word = Group(DBL_QUOTE + ZeroOrMore(expression) + DBL_QUOTE)
+    quoted_complex_word.setName("quoted complex word")
+
+    complex_list = Group(LEFT_PAREN + ZeroOrMore(expression) + RIGHT_PAREN)
+    complex_list.setName("complex list")
+
+    expression << (
+        simple_word | quoted_simple_word | quoted_complex_word | complex_list
+    )
+
+    remainder = restOfLine()
+    remainder.setName("remainder")
+
+    top_level = ZeroOrMore(expression)
+    top_level.setParseAction(_process_emptys_and_singles)
+    top_level.setName("phrase")
+
+    return top_level
+
+
+def parse_tcl(in_str, file_name="unknown", line_no=0) -> ParseResults:
+    """
+    parse a tcl data file or fragment using pyparsing
+
+    Args:
+        in_str (str): tcl source
+        file_name (str): file name for error reporting
+        line_no (int): base line number for error reporting, the line number reported by
+                       pyparsing will be added to this
+
+    Returns:
+        ParseResults: pyparsing parse result
+    """
+
+    parser = get_tcl_parser()
+
+    result = None
+    try:
+        result = parser.parseString(in_str, parseAll=True)
+    except ParseException as pe:
+        line_no += pe.lineno
+        msg = f"""\
+        Failed while parsing tcl at line: {line_no} in file: {file_name}
+
+        Explanation:
+        """
+
+        msg = dedent(msg)
+
+        msg += pe.explain_exception(pe)
+
+        exit_error(msg)
+
+    return result
+
+
+# TODO: add line info for better error handling
+def parse_float_list(line: str, line_no: int) -> List[float]:
+    """
+    parse a tcl list into a list of floats using the tcl parser
+
+    Args:
+        line (str): the input string
+        line_no (int): line number for error reporting
+
+    Returns:
+        List[float]: a list of floats
+    """
+
+    raw_fields = []
+    parsed_tcl = parse_tcl(line)
+    for field in parsed_tcl:
+        if isinstance(field, ParseResults):
+            raw_fields.extend(field)
+        elif isinstance(field, str):
+            raw_fields.append(field)
+        else:
+            exit_error(
+                f"unexpected internal error: tcl failed to parse '{line}' properly,"
+                f" got {parsed_tcl}, field type was {field.__class__}"
+            )
+
+    result = []
+    for field_index, field in enumerate(raw_fields):
+        try:
+            field = float(field)
+        except ValueError as e:
+            msg = (
+                f"Couldn't convert field {field_index} [{field}] to a float"
+                f" for line {line} at line number {line_no}"
+            )
+            exit_error(msg, e)
+        result.append(field)
+
+    return result
+
+
+def read_sequence(
+    sequence_lines: Iterable[str],
+    chain_code: str = "A",
+    sequence_file_name: str = "unknown",
+) -> List[SequenceResidue]:
+
+    """
+    read an nmrview sequence from a file
+
+    Args:
+        sequence_lines (Iterable[str]): the lines of the file
+        chain_code (str): the chain code to use
+        sequence_file_name (str): the file being read from, for reporting errors
+
+    Returns:
+        List[SequenceResidue]: the residues as structures
+    """
+
+    start_residue = 1
+    result = []
+    for i, line in enumerate(sequence_lines):
+        line = line.strip()
+        fields = line.split()
+
+        msg = f"""nmrview sequences have one residue name per line,
+                  except for the first line which can also contain a starting residue number,
+                  at line {i + 1} in file {sequence_file_name}
+                  line was: {line}"""
+
+        if len(fields) > 1 and i != 0:
+            exit_error(msg)
+
+        if i == 0 and len(fields) > 2:
+            exit_error(
+                f"""the first line should contain one 3 letter code and an optional residue number,
+                    in file {sequence_file_name} at line {i + 1} I got {len(fields)} fields
+                    line was: {line}"""
+            )
+
+        if i == 0 and len(fields) == 2:
+            try:
+                start_residue = int(fields[1])
+            except ValueError:
+                msg = f"""couldn't convert second field {fields[1]} to an integer
+                          at line {i + 1} in file {sequence_file_name}
+                          line was: {line}
+                """
+                exit_error(msg)
+
+        if len(fields) > 0:
+            result.append(SequenceResidue(chain_code, start_residue + i, fields[0]))
+
+    return result
+
+
+def parse_shifts(
+    lines: Iterable[str],
+    chain_seqid_to_type: Dict[Tuple[str, int], str],
+    chain_code: str = "A",
+    file_name="unknown",
+) -> ShiftList:
+
+    shifts = []
+    for i, line in enumerate(lines):
+
+        line = line.strip()
+
+        if len(line) == 0:
+            continue
+
+        fields = line.split()
+        num_fields = len(fields)
+        if num_fields != 3:
+            msg = f"""An nmrview ppm.out file should have 3 fields per line,
+                      I got {num_fields} at line {i + 1}
+                      with data: {line}"""
+            exit_error(msg)
+
+        shift, stereo_specificity_code = fields[1:]
+
+        atom_fields = fields[0].split(".")
+        num_atom_fields = len(atom_fields)
+        if num_atom_fields != 2:
+            msg = f"""An nmrview ppm.out file should have atom specifiers of the form '1.CA',
+                      I got {num_atom_fields} fields at line {i + 1}
+                      with data: {line}"""
+            exit_error(msg)
+        residue_number, atom = atom_fields
+
+        try:
+            residue_number = int(residue_number)
+        except ValueError:
+            msg = f"""An nmrview residue number should be an integer,
+                      I got {residue_number} at line {i + 1}"""
+            exit_error(msg)
+
+        try:
+            shift = float(shift)
+        except ValueError:
+            msg = f"""A chemical shift should be a float,
+                      I got {shift} at line {i + 1}"""
+            exit_error(msg)
+
+        try:
+            stereo_specificity_code = int(stereo_specificity_code)
+        except ValueError:
+            msg = f"""An nmrview stereo specificity code should be an integer,
+                      I got {stereo_specificity_code} at line {i + 1}"""
+            exit_error(msg)
+
+        if stereo_specificity_code not in [1, 2, 3]:
+            msg = f"""An nmrview stereo specificity code should be an integer between 1 and 3,
+                      I got {stereo_specificity_code} at line {i + 1}"""
+            exit_error(msg)
+
+        seq_key = chain_code, residue_number
+
+        residue_name = chain_seqid_to_type.setdefault(seq_key, UNUSED).upper()
+
+        if (
+            residue_number != UNUSED and chain_code != UNUSED
+        ) and residue_name == UNUSED:
+            msg = f"""\
+            [nmrview shifts] residue not defined for chain {chain_code} residue {residue_number}
+            line number {i + 1}
+            line was |{line}|
+            in file {file_name}
+            did you read a sequence?"""
+            msg = dedent(msg)
+            exit_error(msg)
+
+        atom = AtomLabel(
+            SequenceResidue(chain_code, residue_number, residue_name), atom
+        )
+        shift = ShiftData(atom, shift)
+        shifts.append(shift)
+
+    result = ShiftList(shifts)
+
+    return result
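To make the grammar above concrete, a small sketch of how the cached parser behaves (expected values shown as comments; braced groups come back as nested lists, bare words as strings, and a single top level group is promoted by _process_emptys_and_singles):

    from nef_pipelines.transcoders.nmrview.nmrview_lib import (
        parse_float_list,
        parse_tcl,
    )

    # a braced group parses to a nested list, bare words stay strings
    print(parse_tcl("{2.HN 8.53} 1").asList())  # expected: [['2.HN', '8.53'], '1']

    # parse_float_list unwraps one level of grouping and converts to floats
    print(parse_float_list("{1200.5 1800.2}", line_no=1))  # expected: [1200.5, 1800.2]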
diff --git a/src/nef_pipelines/transcoders/pales/__init__.py b/src/nef_pipelines/transcoders/pales/__init__.py
new file mode 100644
index 0000000..4c471bd
--- /dev/null
+++ b/src/nef_pipelines/transcoders/pales/__init__.py
@@ -0,0 +1,19 @@
+import typer
+
+import nef_pipelines
+from nef_pipelines import nef_app
+
+app = typer.Typer()
+import_app = typer.Typer()
+export_app = typer.Typer()
+
+if nef_app.app:
+
+    nef_app.app.add_typer(app, name="pales", help="- read and write pales/dc [rdcs]")
+
+    # app.add_typer(import_app, name='import', help='- import pales/dc [rdc restraints]')
+    app.add_typer(export_app, name="export", help="- export pales/dc [rdc fits]")
+
+    # import of specific importers must be after app creation to avoid circular imports
+    import nef_pipelines.transcoders.pales.exporters.rdcs  # noqa: F401
+    import nef_pipelines.transcoders.pales.exporters.template  # noqa: F401
diff --git a/src/nef_pipelines/transcoders/pales/exporters/rdcs.py b/src/nef_pipelines/transcoders/pales/exporters/rdcs.py
new file mode 100644
index 0000000..d744934
--- /dev/null
+++ b/src/nef_pipelines/transcoders/pales/exporters/rdcs.py
@@ -0,0 +1,210 @@
+from dataclasses import replace
+from typing import Dict, List, Tuple
+
+import tabulate
+import typer
+from pynmrstar import Saveframe
+
+from nef_pipelines.lib.nef_lib import (
+    create_entry_from_stdin_or_exit,
+    loop_row_dict_iter,
+    select_frames_by_name,
+)
+from nef_pipelines.lib.sequence_lib import (
+    get_sequence_or_exit,
+    sequence_residues_to_sequence_3let,
+    translate_3_to_1,
+)
+from nef_pipelines.lib.structures import AtomLabel, RdcRestraint, SequenceResidue
+from nef_pipelines.lib.util import exit_error, read_float_or_exit
+from nef_pipelines.transcoders.nmrpipe.nmrpipe_lib import print_pipe_sequence
+from nef_pipelines.transcoders.pales import export_app
+
+app = typer.Typer()
+
+# TODO: name translations
+# TODO: correct weights
+# TODO: move utilities to lib
+# TODO: support multiple chains
+
+
+# noinspection PyUnusedLocal
+@export_app.command()
+def rdcs(
+    chains: List[str] = typer.Option(
+        [],
+        "-c",
+        "--chain",
+        help="chains to export, to add multiple chains use repeated calls [default: 'A']",
+        metavar="",
+    ),
+    raw_weights: List[str] = typer.Option(
+        [],
+        "-w",
+        "--weight",
+        help="weights for rdcs as a comma separated triple of 2 atom names and a weight [no spaces e.g HN,N,1.0],"
+        " multiple weights may be added by repeated calls, default is HN,N,1.0",
+    ),
+    frame_selectors: List[str] = typer.Option(
+        "*",
+        "-f",
+        "--frame",
+        help="selector for the rdc restraint frame to use, can be called multiple times and include wild cards",
+    ),
+):
+    """- convert nef rdc restraints to pales"""
+
+    if not chains:
+        chains = ["A"]
+    weights = _parse_weights(raw_weights)
+    sequence = get_sequence_or_exit()
+
+    sequence_3_let = sequence_residues_to_sequence_3let(sequence)
+
+    sequence_1_let = translate_3_to_1(sequence_3_let)
+
+    NEF_RDC_CATEGORY = "nef_rdc_restraint_list"
+
+    entry = create_entry_from_stdin_or_exit()
+    rdc_frames = entry.get_saveframes_by_category(NEF_RDC_CATEGORY)
+
+    if frame_selectors is None:
+        frame_selectors = [""]
+    frames = select_frames_by_name(rdc_frames, frame_selectors)
+
+    restraints = _rdc_restraints_from_frames(frames, chains, weights)
+
+    restraints = _translate_atom_names(restraints, "xplor")
+
+    print(f'REMARK NEF CHAIN {", ".join(chains)}')
+    print(f"REMARK NEF START RESIDUE {sequence[0].sequence_code}")
+    print()
+    print_pipe_sequence(sequence_1_let)
+    print()
+    _print_restraints(restraints, weights)
+
+
+def _translate_atom_names(
+    restraints: List[RdcRestraint], naming_scheme="iupac"
+) -> List[RdcRestraint]:
+    result = []
+    for restraint in restraints:
+        atom_1 = restraint.atom_1
+        atom_2 = restraint.atom_2
+
+        if atom_1.atom_name == "H":
+            restraint.atom_1 = replace(atom_1, atom_name="HN")
+
+        if atom_2.atom_name == "H":
+            restraint.atom_2 = replace(atom_2, atom_name="HN")
+
+        result.append(restraint)
+
+    return result
+
+
+def _rdc_restraints_from_frames(
+    frames: List[Saveframe], chains: List[str], weights: Dict[Tuple[str, str], float]
+):
+    result = []
+    for frame in frames:
+        for row in loop_row_dict_iter(frame.loops[0]):
+
+            atom_1 = AtomLabel(
+                SequenceResidue(
+                    row["chain_code_1"],
+                    int(row["sequence_code_1"]),
+                    row["residue_name_1"],
+                ),
+                row["atom_name_1"],
+            )
+            atom_2 = AtomLabel(
+                SequenceResidue(
+                    row["chain_code_2"],
+                    int(row["sequence_code_2"]),
+                    row["residue_name_2"],
+                ),
+                row["atom_name_2"],
+            )
+
+            weight_key = _build_weights_key(row["atom_name_1"], row["atom_name_2"])
+            # fall back to a weight of 1.0 if no weight was supplied for this atom pair
+            weight = weights[weight_key] if weight_key in weights else 1.0
+            rdc = RdcRestraint(
+                atom_1,
+                atom_2,
+                row["target_value"],
+                row["target_value_uncertainty"],
+                weight,
+            )
+
+            result.append(rdc)
+
+    return sorted(result)
+
+
+def _parse_weights(raw_weights):
+
+    result = {}
+    for raw_weight in raw_weights:
+
+        raw_weight_fields = raw_weight.split(",")
+
+        if len(raw_weight_fields) != 3:
+            exit_error(
+                f"bad weight {raw_weight}, weights should have 3 fields separated by commas [no spaces]"
+                f" atom_1,atom_2,weight. e.g HN,N,1.0"
+            )
+
+        atom_1, atom_2, raw_rdc_weight = raw_weight_fields
+
+        rdc_weight = read_float_or_exit(
+            raw_rdc_weight,
+            message=f"weights should have 3 fields separated by commas [no spaces] atom_1,atom_2,weight, the weight"
+            f" should be a float, I got {raw_weight} where the field [{raw_rdc_weight}] is not a float",
+        )
+
+        key = _build_weights_key(atom_1, atom_2)
+        result[key] = rdc_weight
+
+    # is it not possible to add default weights?
+    HN_N_key = _build_weights_key("HN", "N")
+    if HN_N_key not in result:
+        result[HN_N_key] = 1.0
+
+    return result
+
+
+def _build_weights_key(atom_1, atom_2):
+    return tuple(sorted([atom_1, atom_2]))
+
+
+def _print_restraints(
+    restraints: List[RdcRestraint], weights: Dict[Tuple[str, str], float]
+):
+    VARS = "VARS RESID_I RESNAME_I ATOMNAME_I RESID_J RESNAME_J ATOMNAME_J D DD W".split()
+    FORMAT = "FORMAT %5d %6s %6s %5d %6s %6s %9.3f %9.3f %.2f".split()
+
+    table = [VARS, FORMAT]
+
+    for i, restraint in enumerate(restraints):
+        atom_1 = restraint.atom_1
+        atom_2 = restraint.atom_2
+
+        weights_key = _build_weights_key(atom_1.atom_name, atom_2.atom_name)
+        weight = weights[weights_key]
+        row = [
+            "",
+            atom_1.residue.sequence_code,
+            atom_1.residue.residue_name,
+            atom_1.atom_name,
+            atom_2.residue.sequence_code,
+            atom_2.residue.residue_name,
+            atom_2.atom_name,
+            restraint.rdc,
+            restraint.rdc_error,
+            weight,
+        ]
+        table.append(row)
+
+    print(tabulate.tabulate(table, tablefmt="plain"))
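For illustration, how the -w/--weight triples end up in the weights dictionary (a sketch; _parse_weights is private and imported here purely for demonstration). Keys are sorted atom name pairs, so lookups are order independent, and an HN,N default of 1.0 is injected only when no HN,N weight was given:

    from nef_pipelines.transcoders.pales.exporters.rdcs import _parse_weights

    weights = _parse_weights(["HN,N,0.5", "CA,HA,2.0"])
    # {("HN", "N"): 0.5, ("CA", "HA"): 2.0}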
diff --git a/src/nef_pipelines/transcoders/pales/exporters/template.py b/src/nef_pipelines/transcoders/pales/exporters/template.py
new file mode 100644
index 0000000..58c9158
--- /dev/null
+++ b/src/nef_pipelines/transcoders/pales/exporters/template.py
@@ -0,0 +1,179 @@
+from typing import Dict, List, Tuple
+
+import tabulate
+import typer
+
+from nef_pipelines.lib.sequence_lib import (
+    get_sequence_or_exit,
+    sequence_residues_to_sequence_3let,
+    translate_3_to_1,
+)
+from nef_pipelines.lib.structures import (
+    AtomLabel,
+    Linking,
+    RdcRestraint,
+    SequenceResidue,
+)
+from nef_pipelines.lib.util import chunks, exit_error, read_float_or_exit
+from nef_pipelines.transcoders.pales import export_app
+
+app = typer.Typer()
+
+
+# noinspection PyUnusedLocal
+@export_app.command()
+def template(
+    chain_code: str = typer.Option(
+        "A",
+        "-c",
+        "--chain",
+        help="chain code, a single chain code",
+        metavar="",
+    ),
+    template_atoms: Tuple[str, str] = typer.Option(
+        ("HN", "N"),
+        "-a",
+        "--atoms",
+        help="for templates, the atoms for the restraints to be between",
+    ),
+    # see _parse_weights for parsing and adding defaults
+    raw_weights: List[str] = typer.Option(
+        [],
+        "-w",
+        "--weight",
+        help="weights for rdcs as a comma separated triple of 2 atom names and a weight [no spaces e.g HN,N,1.0], "
+        "multiple weights may be added by repeated calls, default is HN,N,1.0",
+    ),
+):
+    """convert nef rdc restraints to a pales template file for prediction of rdcs"""
+
+    weights = _parse_weights(raw_weights)
+    sequence = get_sequence_or_exit()
+
+    sequence_3_let = sequence_residues_to_sequence_3let(sequence)
+
+    sequence_1_let = translate_3_to_1(sequence_3_let)
+
+    print(f"REMARK NEF CHAIN {chain_code}")
+    print(f"REMARK NEF START RESIDUE {sequence[0].sequence_code}")
+    print()
+    _print_pipe_sequence(sequence_1_let)
+
+    restraints = _build_dummy_restraints(sequence, template_atoms)
+
+    print()
+    _print_restraints(restraints, weights=weights)
+
+
+def _parse_weights(raw_weights):
+
+    result = {}
+    for raw_weight in raw_weights:
+
+        raw_weight_fields = raw_weight.split(",")
+
+        if len(raw_weight_fields) != 3:
+            exit_error(
+                f"bad weight {raw_weight}, weights should have 3 fields separated by commas [no spaces] "
+                f"atom_1,atom_2,weight. e.g HN,N,1.0"
+            )
+
+        atom_1, atom_2, raw_rdc_weight = raw_weight_fields
+
+        rdc_weight = read_float_or_exit(
+            raw_rdc_weight,
+            message=f"weights should have 3 fields separated by commas [no spaces] atom_1,atom_2,weight, the weight "
+            f"should be a float, I got {raw_weight} where the field [{raw_rdc_weight}] is not a float",
+        )
+
+        key = _build_weights_key(atom_1, atom_2)
+        result[key] = rdc_weight
+
+    # is it not possible to add default weights?
+    HN_N_key = _build_weights_key("HN", "N")
+    if HN_N_key not in result:
+        result[HN_N_key] = 1.0
+
+    return result
+
+
+def _build_weights_key(atom_1, atom_2):
+    return tuple(sorted([atom_1, atom_2]))
+
+
+def _build_dummy_restraints(
+    sequence: List[SequenceResidue], atom_names: Tuple[str, str]
+) -> List[RdcRestraint]:
+    """
+    note we use xplor names as pales is a NIH product
+    :param sequence: the sequence residues to build dummy restraints for
+    :param atom_names: the pair of atom names each restraint should be between
+    :return: a list of dummy restraints with rdc and error set to 0.0
+    """
+    restraints = []
+    for residue in sequence:
+
+        # special case prolines, but really we should check the chem comp for atoms...
+        if residue.residue_name == "PRO" and "HN" in atom_names:
+            continue
+
+        if residue.linking == Linking.START and "HN" in atom_names:
+            continue
+
+        atom_1 = AtomLabel(
+            SequenceResidue(
+                residue.chain_code, residue.sequence_code, residue.residue_name
+            ),
+            atom_names[0],
+        )
+        atom_2 = AtomLabel(
+            SequenceResidue(
+                residue.chain_code, residue.sequence_code, residue.residue_name
+            ),
+            atom_names[1],
+        )
+        restraint = RdcRestraint(atom_1, atom_2, 0.0, 0.0)
+        restraints.append(restraint)
+
+    return restraints
+
+
+def _print_restraints(
+    restraints: List[RdcRestraint], weights: Dict[Tuple[str, str], float]
+):
+    VARS = "VARS RESID_I RESNAME_I ATOMNAME_I RESID_J RESNAME_J ATOMNAME_J D DD W".split()
+    FORMAT = "FORMAT %5d %6s %6s %5d %6s %6s %9.3f %9.3f %.2f".split()
+
+    table = [VARS, FORMAT]
+
+    for i, restraint in enumerate(restraints):
+        atom_1 = restraint.atom_1
+        atom_2 = restraint.atom_2
+
+        weights_key = _build_weights_key(atom_1.atom_name, atom_2.atom_name)
+        weight = weights[weights_key]
+        row = [
+            "",
+            atom_1.residue.sequence_code,
+            atom_1.residue.residue_name,
+            atom_1.atom_name,
+            atom_2.residue.sequence_code,
+            atom_2.residue.residue_name,
+            atom_2.atom_name,
+            0.000,
+            0.000,
+            weight,
+        ]
+        table.append(row)
+
+    print(tabulate.tabulate(table, tablefmt="plain"))
+
+
+def _print_pipe_sequence(sequence_1_let: List[str]):
+
+    rows = chunks(sequence_1_let, 100)
+
+    for row in rows:
+        row_chunks = list(chunks(row, 10))
+        row_strings = ["".join(chunk) for chunk in row_chunks]
+        print(f'DATA SEQUENCE {" ".join(row_strings)}')
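For orientation, the shape of the output both exporters print (an illustrative sample with made-up residues; exact column spacing depends on tabulate). A template for an HN,N pair on a hypothetical chain MET-ALA-GLY would look roughly like this, with the start residue skipped because it carries no HN:

    REMARK NEF CHAIN A
    REMARK NEF START RESIDUE 1

    DATA SEQUENCE MAG

    VARS   RESID_I RESNAME_I ATOMNAME_I RESID_J RESNAME_J ATOMNAME_J D     DD    W
    FORMAT %5d     %6s       %6s        %5d     %6s       %6s        %9.3f %9.3f %.2f
           2       ALA       HN         2       ALA       N          0.0   0.0   1.0
           3       GLY       HN         3       GLY       N          0.0   0.0   1.0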
diff --git a/src/nef_pipelines/transcoders/pdb/__init__.py b/src/nef_pipelines/transcoders/pdb/__init__.py
new file mode 100644
index 0000000..b56f7a6
--- /dev/null
+++ b/src/nef_pipelines/transcoders/pdb/__init__.py
@@ -0,0 +1,15 @@
+import typer
+
+import nef_pipelines
+from nef_pipelines import nef_app
+
+app = typer.Typer()
+import_app = typer.Typer()
+
+if nef_app.app:
+    nef_app.app.add_typer(app, name="pdb", help="- read pdb [sequences]")
+
+    app.add_typer(import_app, name="import", help="- import pdb [sequences]")
+
+    # import of specific importers must be after app creation to avoid circular imports
+    import nef_pipelines.transcoders.pdb.importers.sequence  # noqa: F401
diff --git a/src/nef_pipelines/transcoders/pdb/importers/sequence.py b/src/nef_pipelines/transcoders/pdb/importers/sequence.py
new file mode 100644
index 0000000..8a2e731
--- /dev/null
+++ b/src/nef_pipelines/transcoders/pdb/importers/sequence.py
@@ -0,0 +1,135 @@
+import warnings
+from argparse import Namespace
+from pathlib import Path
+from typing import List
+
+import typer
+
+from nef_pipelines.lib.sequence_lib import sequence_to_nef_frame
+from nef_pipelines.lib.structures import SequenceResidue
+from nef_pipelines.lib.typer_utils import get_args
+from nef_pipelines.lib.util import (
+    exit_error,
+    parse_comma_separated_options,
+    process_stream_and_add_frames,
+)
+from nef_pipelines.transcoders.pdb import import_app
+
+app = typer.Typer()
+
+CHAINS_HELP = """chains [or segids] to read, multiple chains can be added as a comma joined list
+                 [e.g. A,B,C] or by calling this option multiple times"""
+NO_CHAIN_START_HELP = """don't include the start chain link type on a chain for the first residue [linkage will be
+                         middle] for the named chains. Either use a comma joined list of chains [e.g. A,B] or call this
+                         option multiple times to set chain starts for multiple chains"""
+NO_CHAIN_END_HELP = """don't include the end chain link type on a chain for the last residue [linkage will be
+                       middle] for the named chains. Either use a comma joined list of chains [e.g. A,B] or call this
+                       option multiple times to set chain ends for multiple chains"""
+
+
+# noinspection PyUnusedLocal
+@import_app.command(no_args_is_help=True)
+def sequence(
+    chain_codes: List[str] = typer.Option(None, "-c", "--chains", help=CHAINS_HELP),
+    use_segids: bool = typer.Option(
+        False, "-s", "--segid", help="use segid instead of chain"
+    ),
+    no_chain_starts: List[str] = typer.Option(
+        [], "--no-chain-start", help=NO_CHAIN_START_HELP
+    ),
+    no_chain_ends: List[str] = typer.Option(
+        [], "--no-chain-end", help=NO_CHAIN_END_HELP
+    ),
+    entry_name: str = typer.Option(
+        "pdb", help="a name for the nef entry if not updating an existing entry"
+    ),
+    file_name: Path = typer.Argument(..., help="input pdb file", metavar=""),
+):
+    """extracts a sequence from a pdb file"""
+
+    chain_codes = parse_comma_separated_options(chain_codes)
+    no_chain_starts = parse_comma_separated_options(no_chain_starts)
+    no_chain_ends = parse_comma_separated_options(no_chain_ends)
+
+    try:
+        args = get_args()
+
+        process_sequence(args)
+
+    except Exception as e:
+        exit_error(f"reading sequence failed because {e}", e)
+
+
+def process_sequence(args: Namespace):
+
+    pdb_sequences = read_sequences(
+        args.file_name, args.chain_codes, use_segids=args.use_segids
+    )
+
+    if len(pdb_sequences) == 0:
+        exit_error(f"no chains read from {args.file_name}")
+
+    pdb_frame = sequence_to_nef_frame(
+        pdb_sequences, set(args.no_chain_starts), set(args.no_chain_ends)
+    )
+
+    # TODO: need a warning if the sequence already exists in a molecular system and the ability to merge
+    entry = process_stream_and_add_frames(
+        [
+            pdb_frame,
+        ],
+        args,
+    )
+
+    print(entry)
+
+
+def read_sequences(path, target_chain_codes, use_segids=False):
+
+    from Bio.PDB.PDBExceptions import PDBConstructionWarning
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", PDBConstructionWarning)
+
+        from Bio.PDB.PDBParser import PDBParser
+
+        parser = PDBParser(PERMISSIVE=1)
+        structure = parser.get_structure("pdb", path)
+        model = structure[0]
+
+        sequences = []
+
+        if not use_segids:
+            for chain in model:
+                chain_id = chain.id.strip()
+                if len(chain_id) == 0:
+                    use_segids = True
+
+        all_chains = len(target_chain_codes) == 0
+
+        for chain in model:
+            for residue in chain:
+                chain_code = residue.segid if use_segids else chain.id
+                chain_code = chain_code.strip()
+
+                if not all_chains and chain_code not in target_chain_codes:
+                    continue
+
+                hetero_atom_flag, sequence_code, _ = residue.get_id()
+
+                if len(hetero_atom_flag.strip()) != 0:
+                    continue
+
+                if chain_code == "":
+                    exit_error(
+                        f"residue with no chain code found for file {path} sequence_code is {sequence_code}"
+                        f" residue_name is {residue.get_resname()}"
+                    )
+                residue = SequenceResidue(
+                    chain_code=chain_code,
+                    sequence_code=sequence_code,
+                    residue_name=residue.get_resname(),
+                )
+                sequences.append(residue)
+
+    return sequences
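Finally, a small usage sketch for the BioPython based reader above (illustrative; assumes BioPython is installed and that 2mzb.pdb is any local PDB file):

    from nef_pipelines.transcoders.pdb.importers.sequence import read_sequences

    # an empty chain list means read every chain; pass e.g. ["A"] to restrict
    residues = read_sequences("2mzb.pdb", [])
    for residue in residues[:3]:
        print(residue.chain_code, residue.sequence_code, residue.residue_name)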