diff --git a/.circleci/config.yml b/.circleci/config.yml index ea8a648..a5b9f0a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,15 +1,9 @@ -# Python 3 CircleCI 2.0 configuration file -# For Synthego ICE -# -# https://circleci.com/docs/2.0/language-python/ -# version: 2 jobs: - build: + test: working_directory: ~/ice docker: - - image: circleci/python:3.6.1 - + - image: circleci/python:3.6 steps: - checkout - restore_cache: @@ -35,3 +29,35 @@ jobs: destination: tr1 - store_test_results: path: test-reports/ + publish: + docker: + - image: circleci/python:3.6 + steps: + - checkout + - run: + name: init .pypirc + command: | + echo -e "[pypi]" >> ~/.pypirc + echo -e "username = __token__" >> ~/.pypirc + echo -e "password = $PYPI_TOKEN" >> ~/.pypirc + - run: + name: Build the package + command: python3 setup.py sdist bdist_wheel + - run: + name: Upload to pypi + command: + python3 -m venv venv + . venv/bin/activate + pip install twine + python3 -m twine upload dist/* +workflows: + version: 2 + test-and-publish: + jobs: + - test + - publish: + filters: + branches: + ignore: /.*/ + tags: + only: /^v.*/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 49f875a..d1e5532 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ __pycache__/ *.py[cod] *$py.class +.pytest_cache + # Installer logs pip-log.txt @@ -37,3 +39,8 @@ env # default results results/ + +# python build files +*.egg-info/ +*.egg +dist/ diff --git a/build/lib/ice/__init__.py b/build/lib/ice/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/ice/__version__.py b/build/lib/ice/__version__.py new file mode 100644 index 0000000..9d4bf15 --- /dev/null +++ b/build/lib/ice/__version__.py @@ -0,0 +1 @@ +__version__ = '1.1.1-alpha1' diff --git a/build/lib/ice/analysis.py b/build/lib/ice/analysis.py new file mode 100644 index 0000000..a900e9f --- /dev/null +++ b/build/lib/ice/analysis.py @@ -0,0 +1,317 @@ +""" +Copyright 2018 Synthego Corporation All 
Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. 
+""" + +import argparse +import os +import traceback + +import pandas as pd +import datetime +import json + +from ice.classes.ice_result import ICEResult +from ice.classes.sanger_analysis import SangerAnalysis +from ice.utility.misc import version +from ice.utility.sequence import is_nuc_acid + +from .__version__ import __version__ + + +def single_sanger_analysis(control_path, sample_path, base_outputname, guide, donor=None, verbose=False, + allprops=False): + """ + + :param control_path: path to control ab1 file + :param sample_path: path to sample ab1 file + :param base_outputname: path to output (eg, /path/to/out/basename) will be saved using basename + :param guide: RNA/DNA sequence of guide + :param verbose: verbosity True/False + :return: results dictionary + """ + + if control_path is None or not os.path.exists(control_path): + raise Exception('Control @ {} not found'.format(control_path)) + + if not os.path.exists(sample_path): + raise Exception('Experiment sample @ {} not found'.format(sample_path)) + + # create the base directory if this location doesn't exist + base_dir = os.path.join(*os.path.split(os.path.abspath(base_outputname))[:-1]) + if verbose: + print('Base dir: %s' % base_dir) + + if not os.path.exists(base_dir): + os.makedirs(base_dir, exist_ok=True) + + sa = SangerAnalysis(verbose=verbose) + sa.debug = False + sa.allprops = allprops + sa.initialize_with(control_path=control_path, + edited_path=sample_path, + gRNA_sequences=guide, + indel_max_size=20, + base_outputname=base_outputname, + donor=donor, + allprops=allprops) + try: + sa.analyze_sample() + return sa.results.to_json(sa.guide_targets, sa.warnings) + + except Exception as e: + results = ICEResult() + + print('Exception Caught!') + traceback.print_exc() + if isinstance(e,KeyError): + return results.to_json(sa.guide_targets, [';'.join(sa.warnings)]) + else: + return results.to_json(sa.guide_targets, [str(e)]) + + +def single_sanger_analysis_cli(): + """ provides command line arg parsing 
def single_sanger_analysis_cli():
    """Command-line entry point for single-sample ICE analysis."""

    parser = argparse.ArgumentParser(description='Analyze Sanger reads to Infer Crispr Edit outcomes')
    parser.add_argument('--control', dest='control', help='The wildtype / unedited ab1 file (REQUIRED)', required=True)
    parser.add_argument('--edited', dest='edited', help='The edited ab1 file (REQUIRED)', required=True, default=None)
    parser.add_argument('--target', dest='target',
                        help='Target sequence(s) (17-23 bases, RNA or DNA, comma separated), (REQUIRED)',
                        required=True)
    parser.add_argument('--out', dest='out', help='Output base path (Defaults to ./results/single)', required=False,
                        default=None)
    parser.add_argument('--donor', dest='donor', help='Donor DNA sequence for HDR (Optional)', required=False,
                        default=None)
    parser.add_argument('--verbose', dest='verbose', action='store_true')
    parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__))
    parser.add_argument('--allprops', dest='allprops', action='store_true', default=False,
                        help="Output all Edit Proposals, even if they have zero contribution")

    opts = parser.parse_args()

    # Both input files must exist before we do anything else.
    assert os.path.isfile(opts.control)
    assert os.path.isfile(opts.edited)

    # Fall back to ./results/single when no output base was given.
    out_dir = (os.path.join(os.path.abspath('.'), 'results', 'single')
               if opts.out is None else os.path.abspath(opts.out))

    parent_dir = os.path.join(*os.path.split(os.path.abspath(out_dir))[:-1])
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir, exist_ok=True)

    print('Synthego ICE (https://synthego.com)')
    print('Version: {}'.format(__version__))

    single_sanger_analysis(control_path=opts.control,
                           sample_path=opts.edited,
                           base_outputname=out_dir,
                           guide=opts.target,
                           donor=opts.donor,
                           verbose=opts.verbose,
                           allprops=opts.allprops)
def XLSDictReader(f, sheet_index=0):
    """Yield each row of an Excel sheet as a dict keyed by the header row.

    Mirrors csv.DictReader for .xls workbooks: row 0 supplies the keys and
    every later row becomes one dict.

    :param f: an open file object for the workbook (must have a real fileno)
    :param sheet_index: which sheet to read (default: first sheet)
    :return: generator of dicts, one per data row
    """
    # Imported locally: the module never imported xlrd/mmap at top level,
    # which made this function raise NameError the moment it was called.
    # xlrd is a third-party dependency needed only by this helper.
    import mmap
    import xlrd

    book = xlrd.open_workbook(file_contents=mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ))
    sheet = book.sheet_by_index(sheet_index)
    # Column index -> header text from row 0
    headers = {i: sheet.cell_value(0, i) for i in range(sheet.ncols)}

    return (dict((headers[j], sheet.cell_value(i, j)) for j in headers) for i in range(1, sheet.nrows))
def multiple_sanger_analysis(definition_file, output_dir,
                             data_dir=None,
                             verbose=False,
                             single_line=None,
                             allprops=False):
    """Run ICE on every sample listed in an Excel definition file.

    :param definition_file: input excel file that defines sample/control/data associations
    :param output_dir: output directory for per-sample results and the summary workbook
    :param data_dir: directory containing the .ab1 files referenced by the definition file
    :param verbose: verbosity True/False
    :param single_line: if given, only run this (0-based) row of the definition file
    :param allprops: if True, output all edit proposals even with zero contribution
    :return: list of result dicts, or False if no sample could be analyzed
    """

    input_df = pd.read_excel(definition_file)
    input_df = input_df.rename(
        columns={"Donor Sequence": "Donor", "Control": "Control File", "Experiment": "Experiment File"})

    results = []
    fails = []
    jobs = []

    # enumerate() keeps n in step with the row number even when a row is
    # skipped via `continue`. The previous hand-maintained counter was never
    # incremented after the single_line `continue`, so --line N with N > 0
    # could never match and no sample was ever analyzed.
    for n, (_, experiment) in enumerate(input_df.iterrows()):

        label = experiment['Label']
        base_outputname = os.path.join(output_dir, '%s-%s' % (n, label))
        control_sequence_file = experiment['Control File']
        edit_sequence_file = experiment['Experiment File']
        guide = experiment['Guide Sequence']

        if 'Donor' in experiment and is_nuc_acid(experiment['Donor']):
            donor = experiment['Donor']
        else:
            donor = None

        try:
            if pd.isnull(control_sequence_file):
                raise IOError("Control filepath not specified at line {} in definition file".format(n + 1))
            if pd.isnull(edit_sequence_file):
                raise IOError("Edit filepath not specified at line {} in definition file".format(n + 1))

            control_sequence_path = os.path.join(data_dir, control_sequence_file)
            edit_sequence_path = os.path.join(data_dir, edit_sequence_file)

            if single_line is not None and n != single_line:
                continue

            msg = "analyzing"
            print("-" * 50, msg, n, experiment['Label'], guide)

            job_args = (control_sequence_path, edit_sequence_path, base_outputname, guide)
            job_kwargs = {
                'verbose': verbose,
                'allprops': allprops,
                'donor': donor
            }
            result = single_sanger_analysis(*job_args, **job_kwargs)
            jobs.append((experiment, result))

        except Exception as e:
            # Record the failure and keep going with the remaining samples.
            fails.append(experiment)
            print("Single Sanger analysis failed", e)
            import sys
            traceback.print_exc(file=sys.stdout)

    header = ["sample_name", "ice", 'ice_d', "r_squared", "hdr_pct", "ko_score", "guides", "notes",
              "experiment_file", "control_file", "donor"]

    for experiment, r in jobs:
        if 'Donor' in experiment and is_nuc_acid(experiment['Donor']):
            donor = experiment['Donor']
        else:
            donor = None

        if r is not None:
            tmp = [experiment['Label'], r['ice'], r['ice_d'], r['rsq'], r['hdr_pct'], r['ko_score'], r['guides'],
                   r['notes'], experiment['Experiment File'], experiment['Control File'], donor]
        else:
            # One placeholder per results column. The previous placeholder row
            # was one entry short of the 11-column header, which broke the
            # DataFrame column assignment below.
            tmp = [experiment['Label'], 'Failed'] + [''] * (len(header) - 2)
        results.append(tmp)

    if results:

        out_df = pd.DataFrame(results, columns=header)
        timestamp = '{:%Y-%m-%d-%H%M%S}'.format(datetime.datetime.now())
        out_file = os.path.join(output_dir, "ice.results.{}.xlsx".format(timestamp))

        # JSON copy of the summary, one dict per sample row
        out_dict = [dict(zip(header, r)) for r in results]
        with open(out_file.replace('.xlsx', '.json'), 'w') as f:
            json.dump(out_dict, f, ensure_ascii=False)

        with pd.ExcelWriter(out_file) as writer:
            out_df.to_excel(writer, sheet_name="Results")

            md = {'version': __version__}
            metadata = pd.DataFrame.from_dict([md])
            metadata.to_excel(writer, sheet_name='Metadata')
            # No explicit writer.save(): the context manager saves on exit,
            # and ExcelWriter.save() was removed in modern pandas.

        return out_dict
    else:
        print("None of the samples were able to be analyzed")
        return False
def multiple_sanger_analysis_cli():
    """Command-line entry point for batch (multi-sample) ICE analysis."""

    parser = argparse.ArgumentParser(description='Analyze Sanger reads to infer crispr edit outcomes')
    parser.add_argument('--in', dest='input', help='Input definition file in Excel xlsx format (required)',
                        required=True)
    parser.add_argument('--out', dest='out', help='Output directory path (defaults to .)', required=False, default=None)
    parser.add_argument('--data', dest='data', help='Data path, where .ab1 files are located (required)', required=True,
                        default=None)
    parser.add_argument('--verbose', dest='verbose', action='store_true', help='Display verbose output')
    parser.add_argument('--line', dest='line', default=None, type=int,
                        help="Only run specified line in the Excel xlsx definition file")
    parser.add_argument('--allprops', dest='allprops', action='store_true', default=False,
                        help="Output all Edit Proposals, even if they have zero contribution")
    parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=__version__))

    opts = parser.parse_args()

    # store_true flags already yield booleans; bool() keeps the old
    # if/else normalization behavior in one line each.
    verbose = bool(opts.verbose)
    allprops = bool(opts.allprops)

    assert os.path.isfile(opts.input)

    if opts.out is None:
        out_dir = os.path.abspath(os.path.curdir)
    else:
        out_dir = os.path.abspath(opts.out)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    if opts.data is None:
        data_dir = os.path.abspath(os.path.curdir)
    else:
        data_dir = os.path.abspath(opts.data)

    print('Synthego ICE (https://synthego.com)')
    print('Version: {}'.format(__version__))

    multiple_sanger_analysis(opts.input,
                             output_dir=out_dir,
                             data_dir=data_dir,
                             verbose=verbose,
                             single_line=opts.line,
                             allprops=allprops)
class EditProposal:
    """
    Stores the edit information for a single proposal: the proposed sequence,
    the cutsite position(s), and bookkeeping fields filled in by the
    regression during analysis.
    """
    BP_MIN_DEFAULT_PADDING = 25  # min default padding around cutsite for human readable sequence

    def __init__(self, wildtype=False):
        # if proposal is wildtype
        self.wildtype = wildtype

        # list of ProposalBases
        self.sequence_data = None

        # the index in the sequence_data @ where the cutsite is
        self.cutsite = None

        # to store the second cutsite, if there is any
        self.cutsite2 = None
        self.trace_data = None

        # -1, +3, -10, etc for indels
        # 'HDR' for recombination, etc
        self.bases_changed = None

        # more comprehensive code for bases_changed
        self.summary = None

        # details of bases changed in json format
        self.summary_json = None

        # absolute and relative coefficients from the regression
        self.x_rel = None
        self.x_abs = None

    @property
    def multiplex(self):
        """
        :return: bool indicating if proposal is multiplex (more than one proposed cutsite)
        """
        # Explicit None checks: a cutsite at index 0 is valid but falsy, so the
        # previous truthiness test (`self.cutsite and self.cutsite2`) would
        # misreport it.
        return self.cutsite is not None and self.cutsite2 is not None

    @property
    def sequence(self):
        """Proposed sequence with deletion markers ('-') filtered out."""
        return ''.join(b.base for b in self.sequence_data if b.base in 'ATCGNatcgn')

    @property
    def min_cutsite(self):
        """Smaller of the one or two cutsite positions."""
        if self.cutsite2 is None:
            return self.cutsite
        return min(self.cutsite, self.cutsite2)

    @property
    def max_cutsite(self):
        """Larger of the one or two cutsite positions."""
        if self.cutsite2 is None:
            return self.cutsite
        return max(self.cutsite, self.cutsite2)

    def position_at_cutsite(self, pos):
        """
        :param pos: position to check if at cutsite
        :return: bool indicating if pos is at a cutsite
        """
        # `is not None` rather than truthiness so a cutsite at position 0 is
        # handled correctly.
        return (self.cutsite is not None and self.cutsite == pos) or \
               (self.cutsite2 is not None and self.cutsite2 == pos)

    def _calc_default_bp_after_cutsite(self):
        """
        Calculates default basepair after cutsite padding for human_readable_sequence
        :return: number of basepairs to pad
        """
        default_bp_after_cutsite = 50

        # trim some of extra bp after based on distance between cutsites
        if self.multiplex:
            dist_between_cutsites = self.cutsite2 - self.cutsite
            return max(default_bp_after_cutsite - dist_between_cutsites, self.BP_MIN_DEFAULT_PADDING)
        return default_bp_after_cutsite

    def _calc_default_bp_before_cutsite(self):
        """
        Calculates default basepair before cutsite padding for human_readable_sequence
        :return: number of basepairs to pad
        """
        return self.BP_MIN_DEFAULT_PADDING

    def human_readable_sequence(self, bp_before_cutsite=None, bp_after_cutsite=None):
        """
        Returns sequence around proposed edit site(s) showing indels.

        :param bp_before_cutsite: padding basepair count before min cutsite position
        :param bp_after_cutsite: padding basepair count after max cutsite position
        :return: sequence string around proposed cutsites with cutsites ('|') and edited bases (lowercase) shown
        """
        if bp_before_cutsite is None:
            bp_before_cutsite = self._calc_default_bp_before_cutsite()

        if bp_after_cutsite is None:
            bp_after_cutsite = self._calc_default_bp_after_cutsite()

        start_pos = max(self.min_cutsite - bp_before_cutsite + 1, 0)
        # NOTE(review): the `- 1` clamp drops the very last base from the
        # window; looks intentional but worth confirming against expected output.
        end_pos = min(self.max_cutsite + bp_after_cutsite + 1, len(self.sequence_data) - 1)
        seq = ''
        for seq_idx, base in enumerate(self.sequence_data[start_pos:end_pos], start_pos):
            if base.base_type is ProposalBase.WILD_TYPE:
                seq += base.base
            else:
                seq += base.base.lower()

            if self.position_at_cutsite(seq_idx):
                seq += '|'
        return seq
+ +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. +""" + +from ice.classes.edit_proposal import EditProposal +from ice.classes.pair_alignment import DonorAlignment, PairAlignment +from ice.classes.proposal_base import ProposalBase +from ice.utility.sequence import reverse_complement + + +class EditProposalCreator: + MIN_HOMOLOGY_ARM = 15 + + def __init__(self, wt_basecalls, use_ctrl_trace=True, sanger_object=None): + self.sanger_object = sanger_object + self.base_order = 'ATCG' + self.use_ctrl_trace = use_ctrl_trace + self.wt_basecalls = wt_basecalls + self.wt_trace = None + + if self.use_ctrl_trace: + if self.sanger_object is None: + raise Exception('If you wish to use a ctrl Trace, you must supply a SangerObject') + self.base_order = self.sanger_object.base_order + self.wt_trace = self.trace_values + else: + proposal_trace = {'A': [], 'T': [], 'C': [], 'G': []} + for idx, base in enumerate(self.wt_basecalls): + if base == 'N': + for base_color in self.base_order: + proposal_trace[base_color].append(0.25) + else: + for base_color in 
self.base_order: + if base_color == base: + proposal_trace[base_color].append(0.97) + else: + proposal_trace[base_color].append(0.01) + self.wt_trace = proposal_trace + + @property + def trace_values(self): + return self.sanger_object.get_peak_values() + + def single_cut_edit_proposal(self, cutsite, label, del_before=0, del_after=0, insertion=0): + cutsite = cutsite - 1 + proposal_bases = [] + proposal_trace = [] + + # deletion case + if del_before > 0 or del_after > 0: + deleted_bases = [cutsite - i for i in range(del_before)] + [cutsite + i + 1 for i in range(del_after)] + for idx, base in enumerate(self.wt_basecalls): + if idx in deleted_bases: + proposal_base = ProposalBase('-', ProposalBase.DELETION, idx) + else: + proposal_base = ProposalBase(base, ProposalBase.WILD_TYPE, idx) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + proposal_bases.append(proposal_base) + ep = EditProposal() + ep.sequence_data = proposal_bases + ep.cutsite = cutsite + ep.bases_changed = -(del_before + del_after) + ep.summary = '{}[{}]'.format(-(del_before + del_after), label) + ep.summary_json = {'total': ep.bases_changed, + 'details': [{'label': label, 'value': ep.bases_changed}]} + ep.trace_data = proposal_trace + # insertion case + elif insertion > 0: + for idx, base in enumerate(self.wt_basecalls): + if idx == cutsite: + + proposal_bases.append(ProposalBase(base, ProposalBase.WILD_TYPE, idx)) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + for i in range(insertion): + + proposal_bases.append(ProposalBase('n', ProposalBase.INSERTION, idx)) + for base_index, base in enumerate(self.base_order): + proposal_trace.append(0.25) + else: + proposal_bases.append(ProposalBase(base, ProposalBase.WILD_TYPE, idx)) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + ep = EditProposal() + 
ep.sequence_data = proposal_bases + ep.cutsite = cutsite + ep.bases_changed = insertion + ep.summary = '{}[{}]'.format(insertion, label) + ep.summary_json = {'total': ep.bases_changed, + 'details': [{'label': label, 'value': ep.bases_changed}]} + ep.trace_data = proposal_trace + # wildtype case + else: + for idx, base in enumerate(self.wt_basecalls): + proposal_base = ProposalBase(base, ProposalBase.WILD_TYPE, idx) + proposal_bases.append(proposal_base) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + ep = EditProposal(wildtype=True) + ep.sequence_data = proposal_bases + ep.cutsite = cutsite + ep.bases_changed = 0 + ep.summary = '0[{}]'.format(label) + ep.summary_json = {'total': ep.bases_changed, + 'details': []} + ep.trace_data = proposal_trace + return ep + + def multiplex_proposal(self, cutsite1, cutsite2, label1, label2, cut1_del=(0, 0), cut1_ins=0, cut2_del=(0, 0), + cut2_ins=0, dropout=False): + """ + Makes proposals for two guides cutting at the same time. This method handles both two separate indel proposals + or a dropout proposals. Both sites must be part of the edit and cutsite1 must come before cutsite2. 
+ :param cutsite1: position of first cutsite + :param cutsite2: position of second cutsite + :param label1: label of guide target that creates first cutsite + :param label2: label of guide target that creates second cutsite + :param cut1_del: deletion size before and after first cutsite + :param cut1_ins: insertion size at first cutsite + :param cut2_del: deletion size before and after second cutsite + :param cut2_ins: insertion size at first cutsite + :param dropout: if the proposal is a dropout (removal of all bases between cutsites) + :return: EditProposal instance + """ + if cutsite2 <= cutsite1: + raise Exception('cutsite1 must come before cutsite2 values are ({}, {})'.format(cutsite1, cutsite2)) + + cutsite1 -= 1 + cutsite2 -= 1 + proposal_bases = [] + proposal_trace = [] + summary_code = 'm' # multiplex + deleted_bases = [] + + if dropout: + summary_code = 'md' # multiplex dropout + for i in range(cutsite1 + 1, cutsite2 + 1): + deleted_bases.append(i) + + # both cutsites results in deletions + # TODO, there are no safeguards for if the deletions for one cutsite exceed the boundaries of the other cutsite + # that info matters a lot for the summary + # for cut1 allow all deletions before + # for cut2 allow all deletions after + # for cut1 deletions after, stop if cut2 reached + # for cut2 deletions before, stop if cut1 reached + if cut1_del != (0, 0) and cut2_del != (0, 0): + # TODO, this should also handle the straight dropout case w/ no extra deletions + if dropout: + # zero out deletions between cutsites because dropout logic will take these deleted bases into account + # without double counting deletions + cut1_del_after = 0 + cut2_del_before = 0 + else: + cut1_del_after = cut1_del[1] + cut2_del_before = cut2_del[0] + + cut1 = (cutsite1, cut1_del[0], cut1_del_after) + cut2 = (cutsite2, cut2_del_before, cut2_del[1]) + for cutsite, del_before, del_after in [cut1, cut2]: + deleted_bases += [cutsite - i for i in range(del_before)] + [cutsite + i + 1 for i in 
range(del_after)] + + for idx, base in enumerate(self.wt_basecalls): + if idx in deleted_bases: + proposal_base = ProposalBase('-', ProposalBase.DELETION, idx) + else: + proposal_base = ProposalBase(base, ProposalBase.WILD_TYPE, idx) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + proposal_bases.append(proposal_base) + + #If deletion, restore length by padding end with N + for pad in range(len(deleted_bases)): + proposal_base = (ProposalBase('n', ProposalBase.INSERTION, idx+pad)) + for base_index, base in enumerate(self.base_order): + proposal_trace.append(0.25) + proposal_bases.append(proposal_base) + + ep = EditProposal() + ep.sequence_data = proposal_bases + ep.cutsite = cutsite1 + ep.cutsite2 = cutsite2 + total_deleted = -cut1_del[0] - cut1_del_after - cut2_del_before - cut2_del[1] + if dropout: + total_deleted += -(cutsite2 - cutsite1) + ep.bases_changed = total_deleted + cut1_del_size = cut1_del[0] + cut1_del_after + cut2_del_size = cut2_del_before + cut2_del[1] + ep.summary = '{}:{}-{}[{}],-{}[{}]'.format(total_deleted, summary_code, + cut1_del_size, + label1, + cut2_del_size, + label2 + ) + summary_json = {} + summary_json['total'] = ep.bases_changed + summary_json['details'] = [] + if cut1_del_size > 0: + summary_json['details'].append({'label': label1, 'value': -cut1_del_size}) + if cut2_del_size > 0: + summary_json['details'].append({'label': label2, 'value': -cut2_del_size}) + if dropout: + summary_json['details'].append({'label': 'dropout', 'value': cutsite1 - cutsite2}) + ep.summary_json = summary_json + ep.trace_data = proposal_trace + + # insertion case + elif cut1_ins != 0 or cut2_ins != 0: + if dropout: + cut2_ins = 0 + + cut1 = (cutsite1, cut1_ins) + cut2 = (cutsite2, cut2_ins) + + for idx, base in enumerate(self.wt_basecalls): + if idx in deleted_bases: + proposal_bases.append(ProposalBase('-', ProposalBase.DELETION, idx)) + # if base is in range where we need to do insertion 
+ elif idx in [cutsite1, cutsite2]: + for cutsite, insertion_length in [cut1, cut2]: + if cutsite == idx: + proposal_bases.append(ProposalBase(base, ProposalBase.WILD_TYPE, idx)) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + for i in range(insertion_length): + proposal_bases.append(ProposalBase('n', ProposalBase.INSERTION, idx)) + for base_index, base in enumerate(self.base_order): + proposal_trace.append(0.25) + else: + proposal_bases.append(ProposalBase(base, ProposalBase.WILD_TYPE, idx)) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + + # If deletion, restore length by padding end with N + for pad in range(len(deleted_bases)): + proposal_base = (ProposalBase('n', ProposalBase.INSERTION, idx + pad)) + for base_index, base in enumerate(self.base_order): + proposal_trace.append(0.25) + proposal_bases.append(proposal_base) + + ep = EditProposal() + ep.sequence_data = proposal_bases + ep.cutsite = cutsite1 + cut1_ins + ep.cutsite2 = cutsite1 + cut1_ins + (cutsite2 - cutsite1) + cut2_ins + ep.bases_changed = cut1_ins + cut2_ins + if dropout: + ep.bases_changed -= (cutsite2 - cutsite1) + ep.summary = '{}:{}+{}[{}],+{}[{}]'.format( + ep.bases_changed, + summary_code, + cut1_ins, + label1, + cut2_ins, + label2 + ) + summary_json = {} + summary_json['total'] = ep.bases_changed + summary_json['details'] = [] + if cut1_ins > 0: + summary_json['details'].append({'label': label1, 'value': cut1_ins}) + if cut2_ins > 0: + summary_json['details'].append({'label': label2, 'value': cut2_ins}) + if dropout: + summary_json['details'].append({'label': 'dropout', 'value': cutsite1 - cutsite2}) + ep.summary_json = summary_json + ep.trace_data = proposal_trace + ep.trace_data = proposal_trace + # the intervening sequence is dropped out and no bases inserted or deleted + else: + if dropout: + for idx, base in enumerate(self.wt_basecalls): + if idx 
in deleted_bases: + proposal_base = ProposalBase('-', ProposalBase.DELETION, idx) + else: + proposal_base = ProposalBase(base, ProposalBase.WILD_TYPE, idx) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][idx]) + proposal_bases.append(proposal_base) + + # If deletion, restore length by padding end with N + for pad in range(len(deleted_bases)): + proposal_base = (ProposalBase('n', ProposalBase.INSERTION, idx + pad)) + for base_index, base in enumerate(self.base_order): + proposal_trace.append(0.25) + proposal_bases.append(proposal_base) + + ep = EditProposal() + ep.sequence_data = proposal_bases + ep.cutsite = cutsite1 + ep.cutsite2 = cutsite2 + + total_deleted = -(cutsite2 - cutsite1) + ep.bases_changed = total_deleted + ep.summary = '{}:{}-0[{}],-0[{}]'.format(total_deleted, summary_code, label1, label2) + ep.bases_changed = total_deleted + ep.summary_json = {'total': ep.bases_changed, + 'details': [{'label': 'dropout', 'value': total_deleted}]} + ep.trace_data = proposal_trace + else: + # wild type case, use the single edit to model this case + return None + return ep + + def recombined_outcome_sequence(self, alignment, donor_sequence): + proposal_bases = [] + proposal_trace = [] + wt_seq = list(alignment[0]) + aligned_odn = alignment[1] + changed_bases = [] + odn_start_pos = None + odn_idx = 0 + ctrl_seq_idx = 0 + for idx, base in enumerate(aligned_odn): + if base is not '-': + odn_idx += 1 + # within the region affected by the donor sequence + if odn_idx > 0 and odn_idx <= len(donor_sequence) - 2: + if base != wt_seq[idx]: + proposal_bases.append(ProposalBase(base, ProposalBase.HDR, ctrl_seq_idx)) + changed_bases.append(ctrl_seq_idx) + for base_index, base_color in enumerate(self.base_order): + if base_color == base: + proposal_trace.append(0.97) + else: + proposal_trace.append(0.01) + else: + # same base as wt + proposal_bases.append(ProposalBase(base, ProposalBase.WILD_TYPE, ctrl_seq_idx)) + for 
base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][ctrl_seq_idx]) + if odn_start_pos is None: + odn_start_pos = idx + # all other positions outside of the region affected by HDR + else: + proposal_bases.append(ProposalBase(wt_seq[idx], ProposalBase.WILD_TYPE, ctrl_seq_idx)) + for base_index, base_color in enumerate(self.base_order): + proposal_trace.append(self.wt_trace[base_color][ctrl_seq_idx]) + # HDR may also cause deletions or insertions, so we need to keep track of ctrl_seq_idx separately + if wt_seq[idx] != '-': + ctrl_seq_idx += 1 + return proposal_bases, proposal_trace, changed_bases, odn_start_pos + + def homologous_recombination_proposal(self, cutsite, donor_sequence): + cutsite = cutsite - 1 + # check orientation and verify at least 15nt matching on both ends of ODN + + if reverse_complement(donor_sequence[:self.__class__.MIN_HOMOLOGY_ARM]) in self.wt_basecalls: + donor_sequence = reverse_complement(donor_sequence) + + if donor_sequence[:self.__class__.MIN_HOMOLOGY_ARM] in self.wt_basecalls and \ + donor_sequence[-self.__class__.MIN_HOMOLOGY_ARM:] in self.wt_basecalls: + + + # todo confirm that cutsite is inside the HDR region + donor_alignment = DonorAlignment(self.wt_basecalls, donor_sequence) + donor_alignment.align_ssodn() + + proposal_bases, proposal_trace, changed_bases, odn_start_pos = self.recombined_outcome_sequence( + donor_alignment.all_aligned_seqs, donor_sequence) + + ep = EditProposal() + ep.sequence_data = proposal_bases + ep.trace_data = proposal_trace + ep.cutsite = cutsite + ep.bases_changed = ProposalBase.HDR + ep.summary = ProposalBase.HDR + ep.summary_json = {'total': ep.bases_changed, + 'details': [{'label': ProposalBase.HDR, 'value': ep.bases_changed}]} + + return ep, changed_bases, odn_start_pos, donor_alignment + else: + raise Exception( + "Homology arms of length {} not found in control sequence".format(self.__class__.MIN_HOMOLOGY_ARM)) diff --git 
a/build/lib/ice/classes/guide_target.py b/build/lib/ice/classes/guide_target.py new file mode 100644 index 0000000..21701c6 --- /dev/null +++ b/build/lib/ice/classes/guide_target.py @@ -0,0 +1,46 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. 
+""" + + +class GuideTarget: + + def __init__(self, + orientation=None, + cut_offset=None, + guide_start=None, + guide_end=None, + cutsite=None, + sequence=None, + label=None): + self.orientation = orientation + self.cut_offset = cut_offset + self.guide_start = guide_start + self.guide_end = guide_end + self.cutsite = cutsite + self.sequence = sequence + self.label = label diff --git a/build/lib/ice/classes/ice_result.py b/build/lib/ice/classes/ice_result.py new file mode 100644 index 0000000..c203718 --- /dev/null +++ b/build/lib/ice/classes/ice_result.py @@ -0,0 +1,76 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. 
The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. +""" + + +class ICEResult: + + def __init__(self): + self.control_discordances = [] + self.edited_discordances = [] + self.mean_discord_before = None + self.mean_discord_after = None + self.aggregate = None + self.contribs = None + self.edit_eff = None + self.ice_d = None + self.r_squared = None + self.hdr_percentage = None + self.ko_score= None + + def to_json(self, guide_targets, warnings): + + results = {} + if self.edit_eff is not None: + results['ice'] = int(round(self.edit_eff * 100, 0)) + else: + results['ice'] = None + if self.mean_discord_before is not None: + results['mean_discord_before'] = round(self.mean_discord_before, 3) + else: + results['mean_discord_before'] = None + if self.mean_discord_after is not None: + results['mean_discord_after'] = round(self.mean_discord_after, 3) + else: + results['mean_discord_after'] = None + if self.ice_d is not None: + results['ice_d'] = int(round(self.ice_d, 0)) + else: + results['ice_d'] = None + if self.r_squared is not None: + results['rsq'] = round(self.r_squared, 2) + else: + results['rsq'] = None + + results['hdr_pct'] = self.hdr_percentage + results['ko_score'] = self.ko_score + results['notes'] = "; ".join(warnings) + results['guides'] = [vars(g) for g in guide_targets] + + results['status'] = 'succeeded' + + return results diff --git a/build/lib/ice/classes/pair_alignment.py b/build/lib/ice/classes/pair_alignment.py new file mode 100644 index 0000000..0a81207 --- /dev/null +++ b/build/lib/ice/classes/pair_alignment.py @@ -0,0 +1,281 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. 
+ +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. 
+""" + +from io import StringIO +import json + +from Bio import pairwise2, AlignIO + + +class PairAlignment: + ''' + Uses a list of lists and two dictionaries to store data about a pairwise alignment + ''' + + def __init__(self, seq1, seq2): + self.seq1 = seq1 + self.seq2 = seq2 + self.name1 = "control" + self.name2 = "edited" + self.alignment_pairs = None + self.all_aligned_seqs = None + self.aln_seqs = None + + self.sample_to_control = {} + self.control_to_sample = {} + self.all_aligned_clustal = None + self.aligned_clustal = None + self.last_aligned_pair_idx = None + + self.aln_clustal = None + + # def convert to clustal format + # def write alignment to file + + def __str__(self): + return self.all_aligned_seqs[0] + "\n" + self.all_aligned_seqs[1] + + @staticmethod + def write_aln(aln_clustal, to_file=None): + if to_file is not None: + with open(to_file, 'w') as out_file: + print(aln_clustal, file=out_file) + + @staticmethod + def write_json(aln_seqs, to_file): + if aln_seqs: + out_dict = {'control': aln_seqs[0], 'edited': aln_seqs[1]} + with open(to_file, 'w') as f: + json.dump(out_dict, f) + + def align_list_to_clustal(self, aln, name1, name2): + my_aln = ">{}\n".format(name1) + aln[0] + "\n" + ">{}\n".format(name2) + aln[1] + f = StringIO(my_aln) + aln_objs = list(AlignIO.parse(f, "fasta")) + alignment_txt = aln_objs[0].format("clustal").split('\n', 2)[2] + return alignment_txt + + def align_all(self): + seq1 = self.seq1 + seq2 = self.seq2 + + match_bonus = 2 + #(aseq, bseq, match, mismatch, gap_open, gap_extend) + alignments = pairwise2.align.localms(seq1, + seq2, + match_bonus, -1, -3, -1) + aln = alignments[0] + self.all_aligned_seqs = (aln[0], aln[1]) + self.all_aligned_clustal = self.align_list_to_clustal(aln, "control", "edited") + + def ctrl2sample_coords(self, coord, closest=False): + if self.alignment_pairs is None: + raise Exception("alignment has not been performed and alignment_pairs is None") + elif coord is None: + raise Exception("ctrl 
coordinate to convert cannot be None") + + if self.control_to_sample[coord] is None and closest is True: + # tries to find a sample coordinate to the left in the alignment + pos = coord + sample_coord = None + while pos > -1 and sample_coord == None: + sample_coord = self.control_to_sample[pos] + pos -= 1 + return sample_coord + else: + return self.control_to_sample[coord] + + @property + def has_alignment(self): + return not not self.alignment_pairs + + def align_with_window(self, alignment_window): + """ + This function uses a subsequence in seq1 to align to seq2. + The remaining bases are then padded onto the aln + + Example: + seq1 = AATGTAATGATAG + seq2 = AATGTATGATAG + window is 0, 3, so the sequence used for alignment is ATGT + After initial alignment: + AATG + AATGTATGATAG + After alignment padding: + AATGTAATGATAG + AATGTATGATAG + + Whereas an alignment of the full sequences would result in + AATGTAATGATAG + AATGTA-TGATAG + + :param alignment_window: specifies subsequence of seq1 to align to seq2 + :param seq1: control sequence + :param seq2: edited sequence + :return: True if completed + """ + seq1 = self.seq1 + seq2 = self.seq2 + aw = alignment_window + window_size = aw[1] - aw[0] + + match_bonus = 2 + # align windowed seq1 to seq2 + alignments = pairwise2.align.localms(seq1[aw[0]:aw[1]], + seq2, + match_bonus, -1, -2, -1) + # a, b, c, d scoring parameters for alignments + # a = points added for match + # b = points added for mismatch + # c = gap open penalty + # d = gap extension penalty + + # Initial Alignment + try: + # get the top alignment + aln = alignments[0] + except Exception as e: + self.aln_clustal = "No alignment found\nCtrl {} to {}:{}\nEdited:{}\n".format( + aw[0], aw[1], seq1, seq2) + debug_txt = " window: %s to %s " % (aw[0], aw[1]) + return False, ('No alignment found ' + str(e) + debug_txt) + + aln_score = aln[2] + aln_score_normalized = aln_score / (window_size * match_bonus) * 100 + self.aln_clustal = self.align_list_to_clustal(aln, 
"control_aln_window", "edited") + + # we'd like to have at least around 40% of bases matching) + if 50 > aln_score_normalized: + return False, "Poor alignment upstream of cutsite: %0.2f percent of full score" % (aln_score_normalized) + + self.aln_seqs = (aln[0], aln[1]) + + src_alignment = aln[0] + dst_alignment = aln[1] + + last_aligned_ctrl_base = None + index = len(src_alignment) - 1 + + # find the last aligned base (closest to right end), then we will fill in the alignment downstream + while last_aligned_ctrl_base is None and index > -1: + if src_alignment[index] == '-': + index -= 1 + else: + last_aligned_ctrl_base = index + + # use the alignment to generate a list of tuples that matches ref bases with sample bases + sample_index = 0 + reference_index = aw[0] + + alignment_pairs = [] + first_alned_ctrl_base = None + + for aln_idx, sample_base in enumerate(dst_alignment): + r = None + s = None + ref_base = src_alignment[aln_idx] + if sample_base != "-": + s = sample_index + sample_index += 1 + + if ref_base != "-": + if first_alned_ctrl_base is None: + first_alned_ctrl_base = aln_idx + r = reference_index + reference_index += 1 + + # alignment padding + # use the aligned segment to get the downstream ctrl subsequence that matches up with the inference window + elif aw[1] <= reference_index < len(seq1): + r = reference_index + reference_index += 1 + + alignment_pairs.append([r, s]) + self.sample_to_control[s] = r + self.control_to_sample[r] = s + + aln_idx = first_alned_ctrl_base + reference_index = alignment_pairs[aln_idx][0] + aln_idx -= 1 + reference_index -= 1 + while aln_idx > -1 and reference_index > -1: + alignment_pairs[aln_idx][0] = reference_index + aln_idx -= 1 + reference_index -= 1 + + self.alignment_pairs = alignment_pairs + + # now we traverse backwards and find the last aligned bases + for aln_idx, aln_pair in reversed(list(enumerate(self.alignment_pairs))): + ctrl_idx = aln_pair[0] + edit_idx = aln_pair[1] + if ctrl_idx is not None and edit_idx 
is not None: + self.last_aligned_pair_idx = aln_idx + break + + return True, "Aln succeeded" + + +class DonorAlignment(PairAlignment): + """ + Adds functionality that is useful in the specific case of pair alignment between control sequence and donor sequence + """ + MATCH_BONUS = 2 + + def __init__(self, control_seq, donor_seq): + super(DonorAlignment, self).__init__(control_seq, donor_seq) + self.align_ssodn() + self.hdr_indel_size = self._calc_hdr_indel_size() # how much the donor HDR would change control seq length + + @property + def aligned_control_seq(self): + return self.all_aligned_seqs[0] + + @property + def aligned_donor_seq(self): + return self.all_aligned_seqs[1] + + def _calc_hdr_indel_size(self): + insert_total_len = self.aligned_control_seq.strip('-').count('-') + deletion_total_len = self.aligned_donor_seq.strip('-').count('-') + return insert_total_len - deletion_total_len + + @property + def control_seq(self): + return self.seq1 + + @property + def donor_seq(self): + return self.seq2 + + def align_ssodn(self): + print('starting to align ssodn') + alignments = pairwise2.align.localms(self.control_seq, self.donor_seq, self.MATCH_BONUS, -1, -6, -0.5) + alignment = alignments[0] + self.all_aligned_seqs = (alignment[0], alignment[1]) + self.all_aligned_clustal = self.align_list_to_clustal(alignment, 'control', 'donor') diff --git a/build/lib/ice/classes/proposal_base.py b/build/lib/ice/classes/proposal_base.py new file mode 100644 index 0000000..4af5b04 --- /dev/null +++ b/build/lib/ice/classes/proposal_base.py @@ -0,0 +1,42 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. 
+ +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. +""" + + +class ProposalBase: + WILD_TYPE = 'wt' + INSERTION = 'insertion' + DELETION = 'deletion' + BASE_EDIT = 'base_edit' + HDR = 'hdr' + BASE_TYPE = [WILD_TYPE, INSERTION, DELETION, BASE_EDIT, HDR] + + def __init__(self, base, base_type, wt_coord): + self.base = base + assert base_type in ProposalBase.BASE_TYPE + self.base_type = base_type + self.wt_coord = wt_coord diff --git a/build/lib/ice/classes/sanger_analysis.py b/build/lib/ice/classes/sanger_analysis.py new file mode 100644 index 0000000..a8a5cb3 --- /dev/null +++ b/build/lib/ice/classes/sanger_analysis.py @@ -0,0 +1,847 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. 
+ +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. 
+""" + +import os +from itertools import combinations +from collections import defaultdict + +import numpy as np +from scipy.optimize import nnls +from scipy.stats.stats import pearsonr +from sklearn import linear_model +import re +from ice.classes.edit_proposal_creator import EditProposalCreator +from ice.classes.guide_target import GuideTarget +from ice.classes.ice_result import ICEResult +from ice.classes.pair_alignment import PairAlignment +from ice.classes.proposal_base import ProposalBase +from ice.classes.sanger_object import SangerObject +from ice.outputs.create_discordance_indel_files import generate_discordance_indel_files +from ice.outputs.create_json import write_individual_contribs, write_contribs_json, write_all_proposals_json +from ice.outputs.create_trace_files import generate_trace_files +from ice.utility.sequence import RNA2DNA, reverse_complement +def round_percent(orig_array,r_squared): + # Scale the array by 100 in order to work with np.floor + scaled_array=np.array([x*100 for x in orig_array]) + + + #floored array + fl_dist = np.floor(scaled_array) + # Calculate the total difference between the floored array and the r2, this difference must be added back + total_lost=round(r_squared*100)-np.sum(fl_dist) + + # Rank each value by how much it lost when it was floored + order=sorted([ (x,ix) for ix,x in enumerate(scaled_array-np.floor(scaled_array))],reverse=True) + + + # Add back what was lost in total, accoridng to how much each value lost. + for l in range(int(total_lost)): + fl_dist[order[l][1]]+=1 + + return fl_dist/100 #set it back to a percentage + +class SangerAnalysis: + """ + 1. Align sgRNA guide sequence to control sequence (or to the base calls from the control chromatogram) + 2. Align the upstream portion of the control sequence to the upstream portion of the experimental sample + 3. For each base call in experimental sample, extract relative abundance of each nucleotide + 4. 
Sum relative abundances of incorrect bases, normalize to 75 -> 100% + + """ + + iced_correction_factor = 1.41 # ICE-D regression coefficient + HDR_OVERLAP_FILTER_CUTOFF = 3 # abs val cutoff to not skip proposals even if they overlap in size w/ HDR proposal + + def __init__(self, verbose=False): + + self.control_chromatogram_path = None # the file path + self.control_sample = None # the Sanger Object + + self.edited_chromatogram_path = None # the file path + self.edited_sample = None # the Sanger Object + + # self.gRNA_sequence = None # the guide sequence + self.gRNA_sequences = [] + self.guide_targets = [] + + self.valid_configuration = False + self.donor_odn = None + self.recombination_changed_bases = [] + + # list of EditProposals + self.proposals = None + + # alignment of donor with ctrl + self.donor_alignment = None + + self.alignment_window = None # these coordinates refer to the ctrl + self.inference_window = None # these coordinates refer to the ctrl, not the sample + self.alignment_sequence = None + self.indel_max_size = None + + self.base_outputname = None + + self.use_ctrl_trace = True # if true, this will use the peak values from the ctrl trace for inference + # if false, the inference matrix will be filled with 0s and 1s based on the sequence only + self.inference_sequence = None + self.indel_list = None + self.output_vec = None + self.results = ICEResult() + + self.r_squared_correction = True + + self.verbose = verbose + self.use_partial_for_insertions = True + self.debug = False + self.warnings = [] + self.MIN_ALN_WINDOW_SIZE = 50 # minimum length of high quality sequence in sample required for alignment + + self.plot_guide_line = True # False if we want to use css for denoting where the guide lines are + self.allprops = False # print out all proposals in the json, even if they have 0 contribution + + def initialize_with(self, control_path, edited_path, gRNA_sequences, + alignment_window=None, + inference_window=None, + indel_max_size=5, + 
base_outputname=None, + donor=None, + allprops=False): + """ + indel_size_range=(1,10) + + :param control_path: + :param edited_path: + :param gRNA_sequences: + :param args: + :param kwargs: + :return: + """ + + if os.path.exists(control_path): + control_sample = SangerObject() + control_sample.initialize_from_path(control_path) + self.control_sample = control_sample + else: + raise Exception('Control path does not exist') + + if os.path.exists(edited_path): + edited_sample = SangerObject() + edited_sample.initialize_from_path(edited_path) + self.edited_sample = edited_sample + else: + raise Exception('Edited path does not exist') + + if gRNA_sequences is not None and (isinstance(gRNA_sequences, str)): + gRNA_sequences = gRNA_sequences.split(",") + gRNA_sequences = list(map(str.strip, gRNA_sequences)) + gRNA_sequences = list(map(RNA2DNA, gRNA_sequences)) + gRNA_sequences = list(map(lambda x:x.upper(), gRNA_sequences)) + self.gRNA_sequences = gRNA_sequences + + else: + raise Exception('{} contains invalid gRNA sequences'.format(gRNA_sequences)) + + if base_outputname is not '': + self.base_outputname = base_outputname + "." 
+ self.inference_window = inference_window + self.indel_max_size = indel_max_size + self.donor_odn = donor + self.allprops = allprops + return True + + @property + def multiplex(self): + """ + :return: bool indicating if sanger analysis is multiplex + """ + return len(self.guide_targets) > 1 + + def quality_check(self): + aw = self.edited_sample.find_alignable_window() + if aw['max_window'] is None: + raise Exception("Sample ab1 %s quality scores too low" % self.edited_sample.basename) + + def sample_cutsite(self, guide_target, alt=False): + ctrl_cutsite = guide_target.cutsite + sample_cutsite = self.alignment.ctrl2sample_coords(ctrl_cutsite) + + return sample_cutsite + + def find_guide_in_ctrl(self, guide_label, guide_seq, ctrl_seq): + revcomp_guide = reverse_complement(guide_seq) + found_seq = None + if guide_seq in ctrl_seq: + cut_offset = len(guide_seq) - 3 + orientation = "fwd" + found_seq = guide_seq + elif revcomp_guide in ctrl_seq: + cut_offset = 3 + orientation = "rev" + found_seq = revcomp_guide + else: + raise Exception("guide %s not found in control sequence" % guide_seq) + + if found_seq: + guide_start = ctrl_seq.index(found_seq) + guide_end = guide_start + len(found_seq) + cutsite = guide_start + cut_offset + + + ### checking for PAMs + if orientation=="rev" and ctrl_seq[guide_start - 3:guide_start - 1]!='CC': + self.warnings.append("No PAM upstream of guide {}".format(guide_label)) + + elif orientation=="fwd" and ctrl_seq[guide_end + 1:guide_end + 3]!='GG': + self.warnings.append("No PAM downstream of guide {}".format(guide_label)) + + return GuideTarget( + orientation=orientation, + cut_offset=cut_offset, + cutsite=cutsite, + guide_start=guide_start, + guide_end=guide_end, + sequence=guide_seq, + label=guide_label + ) + + def find_targets(self): + ''' + This function verifies that the sgRNA sequence is in the control sample or reference, + it also checks if the reverse complement is in the sequence + :return: + ''' + if self.control_sample and 
self.edited_sample and self.gRNA_sequences: + for guide_idx, guide in enumerate(self.gRNA_sequences): + label = "g{}".format(guide_idx + 1) + g_t = self.find_guide_in_ctrl(label, guide, self.control_sample.primary_base_calls) + self.guide_targets.append(g_t) + self.guide_targets = sorted(self.guide_targets, key=lambda x: x.cutsite) + + def check_recombination(self): + epc = EditProposalCreator(self.control_sample.primary_base_calls, + use_ctrl_trace=True, + sanger_object=self.control_sample) + + if len(self.donor_odn)> len(self.control_sample.primary_base_calls)*0.75: + self.warnings.append("Large Donor of {} bp compared to control sequence of {} bp".format(len(self.donor_odn),len(self.control_sample.primary_base_calls))) + + + #int((len(re.split("(-+)", alignment[0])) - 3) / 2) + + try: + cutsite = self.guide_targets[0].cutsite + hr_proposal, changed_bases, odn_start_pos, aln = epc.homologous_recombination_proposal( + cutsite=cutsite, donor_sequence=self.donor_odn) + + self.recombined_seq = hr_proposal.sequence + self.recombination_changed_bases = changed_bases + self.odn_start_pos = odn_start_pos + self.donor_alignment = aln + + n_splits=int((len(re.split("(-+)", aln.all_aligned_seqs[0])) - 3) / 2) + if n_splits>0: + self.warnings.append("{} donor integration sites".format(n_splits+1)) + + + except Exception as e: + self.warnings.append("Could not analyze homologous recombination case: {}".format(e)) + self.donor_odn = None + + def find_alignment_window(self): + """ + Determines the alignment window + + """ + ctrl_aw = self.control_sample.find_alignable_window(QUAL_CUTOFF=30) + ctrl_aw['max_window'] + if ctrl_aw['max_window'] is None: + # no quality windows found + raise Exception("Control ab1 trace quality scores too low") + + start_of_ctrl_aln_window = ctrl_aw['max_window'][0] + # alignment window should stop upstream of any potential indels + end_of_ctrl_aln_window = self.guide_targets[0].cutsite - self.indel_max_size - 10 + + if end_of_ctrl_aln_window > 
ctrl_aw['max_window'][1]: + raise Exception("Control ab1 trace quality too upstream of guide too low") + if ctrl_aw['max_window'][0] > end_of_ctrl_aln_window: + raise Exception("Control ab1 %s quality before cutsite is too low" % self.control_sample.basename) + if end_of_ctrl_aln_window - ctrl_aw['max_window'][0] < 40: + self.warnings.append("Padding alignment window to be at least 40bp in length or to start of seq") + start_of_ctrl_aln_window = max(1, end_of_ctrl_aln_window - 40) + if self.donor_odn is not None: + if self.odn_start_pos < end_of_ctrl_aln_window: + end_of_ctrl_aln_window = self.odn_start_pos + self.alignment_window = (start_of_ctrl_aln_window, end_of_ctrl_aln_window) + + def calculate_discordance(self): + + # get the peak values at every base + control_peak_values = self.control_sample.get_peak_values() + edited_peak_values = self.edited_sample.get_peak_values() + + # ... get the primary sequence; note that the aligned_bases here refers to correctly called bases in + # the chromatogram + ctrl_sequence = self.control_sample.aligned_bases + shortest_sequence = min(len(ctrl_sequence), len(edited_peak_values['T'])) + + control_discord_abundances = [] + edited_discord_abundances = [] + + for aligned_bases in self.alignment.alignment_pairs: + ctrl_idx = aligned_bases[0] + sample_idx = aligned_bases[1] + if ctrl_idx is None or sample_idx is None or ctrl_idx > shortest_sequence: + # gap in alignment; gap is in ctrl sample + continue + + bases = ['A', 'C', 'G', 'T'] + ctrl_base = self.control_sample.primary_base_calls[ctrl_idx] + # sample_base = self.edited_sample.primary_base_calls[sample_idx] + + if ctrl_base not in bases: + + control_discord_abundances.append(1.0) + edited_discord_abundances.append(1.0) + + else: + # calculate the total intensity for this base call. 
+ total_edited_signal = 0 + total_control_signal = 0 + + for ab in bases: + total_edited_signal += edited_peak_values[ab][sample_idx] + total_control_signal += control_peak_values[ab][ctrl_idx] + + # get the correct signal + edited_correct_signal = edited_peak_values[ctrl_base][sample_idx] + control_correct_signal = control_peak_values[ctrl_base][ctrl_idx] + + # now figure out the incorrect signal + edited_discord_signal = (total_edited_signal - edited_correct_signal) + control_discord_signal = (total_control_signal - control_correct_signal) + + # and get the relative amount + edited_discord_signal_relative = (edited_discord_signal / (1.0 * total_edited_signal)) + control_correct_signal = (control_discord_signal / (1.0 * total_control_signal)) + + # ta da + edited_discord_abundances.append(edited_discord_signal_relative) + control_discord_abundances.append(control_correct_signal) + + final_sequence = ctrl_sequence[0:shortest_sequence] + + guide_location = self.guide_targets[0].guide_start + + cutsite = self.sample_cutsite(self.guide_targets[0], alt="downstream") + + start = self.alignment_window[0] + end = self.alignment_window[1] + sample_start = self.alignment.ctrl2sample_coords(start) + sample_end = self.alignment.ctrl2sample_coords(end) + before_dsb = edited_discord_abundances[sample_start:sample_end] + after_dsb = edited_discord_abundances[cutsite:] + + if self.debug: + print("Cutsite for discord calcs: {}".format(cutsite)) + + if not before_dsb or not after_dsb: + #TODO find more precise description of this error + self.warnings.append("Unable to compute discordance plots") + + + mean_before = np.mean(before_dsb) + mean_after = np.mean(after_dsb) + + validation_msg = 'discord (aln window): %0.2f after cutsite: %0.2f' % (mean_before, mean_after) + + print(validation_msg) + + self.results.control_discordances = control_discord_abundances + self.results.edited_discordances = edited_discord_abundances + self.results.mean_discord_after = mean_after + 
self.results.mean_discord_before = mean_before + + def _calculate_inference_window(self): + left_offset = self.alignment_window[1] + last_guide = max(self.guide_targets, key=lambda x: x.cutsite) + cutsite = last_guide.cutsite + min_indel_sequence_length = 10000 + + MAX_BASES_AFTER_CUTSITE = 100 + + # find minimum length of all generated sequences + for ind in self.proposals: + if len(ind.sequence) < min_indel_sequence_length: + min_indel_sequence_length = len(ind.sequence) + #min_indel = ind.sequence + #[print(str(ind.bases_changed) + '__' + str(len(ind.sequence))) for ind in self.proposals] + #import pdb;pdb.set_trace() + ctrl_quality_windows = self.control_sample.find_alignable_window(window_size=10, QUAL_CUTOFF=35) + + # we should not be doing any calculations with data from low quality regions + # we can cutoff the inference window on the right based on quality + iw_right_boundary = None + if 'max_window' in ctrl_quality_windows: + ctrl_aw = ctrl_quality_windows['max_window'] + if ctrl_aw[1] > cutsite: + iw_right_boundary = min(cutsite + MAX_BASES_AFTER_CUTSITE, ctrl_aw[1], + min_indel_sequence_length) + else: + iw_right_boundary = min(cutsite + MAX_BASES_AFTER_CUTSITE, + min_indel_sequence_length) + self.warnings.append( + "Low quality control trace; using cutsite + {} for inference. Results may be poor.".format( + MAX_BASES_AFTER_CUTSITE)) + if iw_right_boundary is None: + self.warnings.append( + "Low quality control trace; using cutsite + {} for inference. Results may be poor.".format( + MAX_BASES_AFTER_CUTSITE)) + iw_right_boundary = cutsite + MAX_BASES_AFTER_CUTSITE + + # as an extra precaution, we want to exclude the last aligned 10bp + last_aligned_ctrl_base = self.alignment.alignment_pairs[self.alignment.last_aligned_pair_idx][0] + + iw_right_boundary = min(iw_right_boundary, last_aligned_ctrl_base - 10) + + inf_len_after_cutsite = iw_right_boundary - cutsite + if inf_len_after_cutsite < self.indel_max_size * 3: + self.warnings.append( + "Inf. 
window after cutsite is only {} in length and is less than 3x indel_max_size of {}".format( + inf_len_after_cutsite, self.indel_max_size)) + # set inference_window + self.inference_window = (left_offset, iw_right_boundary) + + def _should_skip_proposal(self, indel_size): + """ + Indels larger than min cutoff that overlap in size with donor are not considered as an edit proposal + """ + # Only consider skipping proposals if there is a donor + if not self.donor_odn: + return False + + # Don't filter out small indels because they are still likely to occur with HDR + if abs(indel_size) <= self.HDR_OVERLAP_FILTER_CUTOFF: + return False + + # Skip proposal if same size as donor + return indel_size == self.donor_alignment.hdr_indel_size + + def _generate_edit_proposals(self): + epc = EditProposalCreator(self.control_sample.primary_base_calls, + use_ctrl_trace=True, + sanger_object=self.control_sample) + proposals = [] + + if self.donor_odn is not None: + cutsite = self.guide_targets[0].cutsite + + hr_proposal, changed_bases, odn_start_pos, aln = epc.homologous_recombination_proposal( + cutsite=cutsite, donor_sequence=self.donor_odn) + proposals.append(hr_proposal) + + # single cut cases + for guide_target in self.guide_targets: + cutsite = guide_target.cutsite + + # deletion case + deletion_befores = list(range(self.indel_max_size + 1)) + deletion_afters = list(range(self.indel_max_size + 1)) + + for deletion_before in deletion_befores: + for deletion_after in deletion_afters: + indel_size = -(deletion_before + deletion_after) + if self._should_skip_proposal(indel_size): + continue + ep = epc.single_cut_edit_proposal(cutsite, guide_target.label, + del_before=deletion_before, del_after=deletion_after) + proposals.append(ep) + + insertions = list(range(self.indel_max_size + 1)) + + for insertion in insertions: + if self._should_skip_proposal(insertion): + continue + ep = epc.single_cut_edit_proposal(cutsite, guide_target.label, insertion=insertion) + proposals.append(ep) + 
+ # multiplex case + # we limit the deletion sizes here + deletion_befores = list(range(5)) + deletion_afters = list(range(5)) + + insertions = list(range(3)) + + for combo in combinations(self.guide_targets, 2): + guide1 = min(combo, key=lambda x: x.cutsite) + guide2 = max(combo, key=lambda x: x.cutsite) + cutsite1 = guide1.cutsite + cutsite2 = guide2.cutsite + if cutsite1 == cutsite2: + print("Warning: cutsite1 == cutsite2") + continue + label1 = guide1.label + label2 = guide2.label + + for cut1_before in deletion_befores: + for cut1_after in deletion_afters: + for cut2_before in deletion_befores: + for cut2_after in deletion_afters: + independent_cut = epc.multiplex_proposal( + cutsite1, + cutsite2, + label1, + label2, + cut1_del=(cut1_before, cut1_after), cut2_del=(cut2_before, cut2_after)) + if independent_cut: + proposals.append(independent_cut) + + # dropout case + for cut1_before in deletion_befores: + for cut2_after in deletion_afters: + dropout = epc.multiplex_proposal( + cutsite1, + cutsite2, + label1, + label2, + cut1_del=(cut1_before, 0), cut2_del=(0, cut2_after), + dropout=True + ) + if dropout: + proposals.append(dropout) + for insertion1 in insertions: + for insertion2 in insertions: + cut_and_insert = epc.multiplex_proposal(cutsite1, cutsite2, label1, label2, + cut1_ins=insertion1, cut2_ins=insertion2) + if cut_and_insert: + proposals.append(cut_and_insert) + # dropout insertion case + for insertion in insertions: + dropout_and_insert = epc.multiplex_proposal(cutsite1, cutsite2, label1, label2, + cut1_ins=insertion, dropout=True) + if dropout_and_insert: + proposals.append(dropout_and_insert) + + + #removing degenerate proposals + seen=[] + self.proposals = list(filter(lambda x: seen.append(x.sequence) is None if x.sequence not in seen else False, proposals)) + + + def _generate_coefficient_matrix(self): + num_proposals = len(self.proposals) + iw_length = self.inference_window[1] - self.inference_window[0] + output_matrix = np.zeros((num_proposals, 
4 * iw_length)) + #import pdb; pdb.set_trace() + for edit_proposal_idx, ep in enumerate(self.proposals): + for base_index in range(self.inference_window[0], self.inference_window[1]): + seq_index = base_index - self.inference_window[0] + for color_index in range(4): + base_color_index = base_index * 4 + color_index + output_matrix[edit_proposal_idx][seq_index * 4 + color_index] = ep.trace_data[base_color_index] + + # normalize values + sum = np.sum(output_matrix[edit_proposal_idx][seq_index * 4:seq_index * 4 + 4]) + for color_index in range(4): + normalized_amt = output_matrix[edit_proposal_idx][seq_index * 4 + color_index] / sum + if np.isnan(normalized_amt * 100): + output_matrix[edit_proposal_idx][seq_index * 4 + color_index] = 0 + else: + output_matrix[edit_proposal_idx][seq_index * 4 + color_index] = normalized_amt * 100 + if self.verbose: + print("Shape of coefficient matrix:", output_matrix.shape) + + self.coefficient_matrix = output_matrix.T + + def _generate_outcomes_vector(self): + # generate_b (outcomes) vector + + output_list = [] + edited_peak_values = self.edited_sample.get_peak_values() + inference_length = self.inference_window[1] - self.inference_window[0] + self.inference_sequence = self.control_sample.primary_base_calls[ + self.inference_window[1]:self.inference_window[0]] + + last_good_ref_idx = -1 + for aln_tuple in self.alignment.alignment_pairs: + ref_idx = aln_tuple[0] + sample_idx = aln_tuple[1] + + if ref_idx is None: + ref_idx = last_good_ref_idx + else: + last_good_ref_idx = ref_idx + + if self.inference_window[0] <= ref_idx and len(output_list) < inference_length * 4: + for base in self.edited_sample.base_order: + val = edited_peak_values[base][sample_idx] + output_list.append(val) + + if self.verbose: + print('------------------------------------------------------') + print('Inference Sequence length: %d' % inference_length) + print('Inference sequence', self.inference_sequence) + print('Output vector : 4x%d (%d)' % (len(output_list) / 
4, len(output_list))) + + self.output_vec = np.array(output_list) + + def normalize_observed_trace(self): + """ + This function normalizes the observed Sanger sequencing trace for the sample. + For each base position, it normalizes the signal for A, T, C, G to the total signal + :return: vector with normalized numbers for the observed trace, b + """ + b = self.output_vec + + b_prime = b.reshape(int(len(b) / 4), 4) + + row_sums = b_prime.sum(axis=1) + normed = b_prime / row_sums[:, np.newaxis] + + b_normed = normed.reshape(int(len(b))) + + # if there any nan's due to dividing by zero, convert to zeros + np.nan_to_num(b_normed, copy=False) + return b_normed + + def simple_discordance_algorithm(self): + """ + This is a simple algorithm that measures the discordance of the observed signal + with the control signal (after the cut site). + The proportion of edited sequences is then taken to be + 1-(signal conforming to control) + take the minimum signal across the entire seq that corresponds to the control. 
+ the estimate of the edited proportion, could be an underestimate actually due to coincident bases + :param norm_b: + :return: + + """ + # discordances = self.discordances + edited_discordances = self.results.edited_discordances + control_discordances = self.results.control_discordances + unexplained_discord_signal = [] + # print(ra['edited_discord']) + + for np_idx, edited_discord_signal in np.ndenumerate(edited_discordances): + idx = np_idx[0] + if self.guide_targets[0].cutsite < idx < self.inference_window[1]: + control_discord_signal = control_discordances[idx] + + unexplained = edited_discord_signal - control_discord_signal + + if unexplained < 0: + unexplained = 0 + + unexplained_discord_signal.append(unexplained) + + self.results.ice_d = min(100., + SangerAnalysis.iced_correction_factor * sum(unexplained_discord_signal) / float( + len(unexplained_discord_signal)) * 100) + + self.results.max_unexp_discord = max(unexplained_discord_signal) * 100 + + def infer_abundances(self, norm_b=False): + """ + This uses nnls to solve for abundances of each edit proposal + :param norm_b: + :return: + """ + A = self.coefficient_matrix + + b = self.output_vec + #import pdb; pdb.set_trace() + if self.verbose: + print("") + print('NNLS input shapes') + print("--------------------------------------------------------") + print('A', A.shape) + print('b', b.shape) + + if norm_b: + b_normed = self.normalize_observed_trace() + b = b_normed + + # Solves argmin_x || Ax - b ||_2 for x>=0 + # A: columns (number of different indel possibilities) + # : rows (base calls (4*inference_length) + 1) + # x: abundances of possibilities, has shape (A.cols, 1) + # b: actual observed base calls, has shape (A.rows, 1) + # nnls solves for x, or the abundances of each possible indel + # xvals contains the inferred sequence abundances + # rnorm is the residual || Ax-b ||_2 + + try: + + #xvals, rnorm = nnls(A, b) + + # compute the predicted signal + #predicted = np.dot(A, xvals) + + #optional L1 + 
lasso_model = linear_model.Lasso(alpha=0.8, positive=True) + lasso_model.fit(A, b) + + xvals = lasso_model.coef_ + predicted = np.dot(A, xvals) + + + + except Exception as e: + + raise type(e)(str(e) + ' A: ' + str(A.shape) + ' B: ' + str(b.shape)) + + # calculate pearson's R + (fit_r, p_val_2_tailed) = pearsonr(predicted, b) + self.results.r_squared = fit_r ** 2 + print("R_SQUARED {}".format(self.results.r_squared)) + + xtotal = xvals.sum() + # here we normalize the relative abundance of each possible indel + for n, x_val in enumerate(xvals): + self.proposals[n].x_abs = x_val + self.proposals[n].x_rel = x_val / (1.0 * xtotal) + + if self.r_squared_correction: + ##addition of (1-r_squared, or missing variance) to the no edit case + if n == 0: + self.proposals[0].x_rel = x_val / (1.0 * xtotal) * self.results.r_squared + ##edited cases + else: + self.proposals[n].x_rel = x_val / (1.0 * xtotal) * self.results.r_squared + + new_array = round_percent([x.x_rel for x in self.proposals], self.results.r_squared) + # new_array=[np.floor(x.x_rel*100)/100 for x in self.proposals] + + for n, val in enumerate(new_array): + self.proposals[n].x_rel = val + + self.results.r_squared = np.round(self.results.r_squared,2) + + + + def analyze_and_rank(self): + + self.infer_abundances(norm_b=True) + + sorted_by_contribution = sorted(self.proposals, key=lambda x: x.x_rel, reverse=True) + + # create totals: + + aggregated_indel = defaultdict(float) + hdr_percentage = 0 + ko_score=0 + + for entry in sorted_by_contribution: + if entry.bases_changed not in aggregated_indel: + aggregated_indel[entry.bases_changed] = 0 + aggregated_indel[entry.bases_changed] += entry.x_rel + if entry.bases_changed == ProposalBase.HDR: + hdr_percentage += 100 * entry.x_rel + if entry.bases_changed != ProposalBase.HDR and (entry.bases_changed%3!=0 or abs(entry.bases_changed)>20): + ko_score += 100 * entry.x_rel + + unedited_percent = aggregated_indel[0] + (1 - self.results.r_squared) + + editing_efficiency = (1.0 
- unedited_percent) + + + + + if self.debug: + print(aggregated_indel) + print(sorted_by_contribution) + self.results.ko_score =ko_score + self.results.aggregate = aggregated_indel + self.results.contribs = sorted_by_contribution + self.results.edit_eff = editing_efficiency + self.results.hdr_percentage = hdr_percentage + + ############################# + + def analyze_sample(self): + self.quality_check() + self.find_targets() + if self.donor_odn: + self.check_recombination() + + self.find_alignment_window() + + alignment = PairAlignment(self.control_sample.primary_base_calls, + self.edited_sample.primary_base_calls + ) + + self.alignment = alignment + alignment.align_all() + alignment.align_with_window(self.alignment_window) + + if alignment.has_alignment: + pass + else: + raise Exception("No alignment found between control and edited sample") + + # aln_flag, msg = self.align_edited_to_control_sequence() + aln_file = self.base_outputname + "all.txt" + alignment.write_aln(alignment.all_aligned_clustal, to_file=aln_file) + aln_file = self.base_outputname + "windowed.txt" + alignment.write_aln(alignment.aln_clustal, to_file=aln_file) + + # write to jsons + aln_json_file = self.base_outputname + "all.json" + alignment.write_json(alignment.all_aligned_seqs, aln_json_file) + aln_json_file = self.base_outputname + "windowed.json" + alignment.write_json(alignment.aln_seqs, aln_json_file) + + self._generate_edit_proposals() + + if self.donor_odn: + aln_file = self.base_outputname + "donor.txt" + self.donor_alignment.write_aln(self.donor_alignment.all_aligned_clustal, to_file=aln_file) + aln_json_file = self.base_outputname + "donor.json" + self.donor_alignment.write_json(self.donor_alignment.all_aligned_seqs, aln_json_file) + + print("analyzing {} number of edit proposals".format(len(self.proposals))) + self._calculate_inference_window() + self._generate_coefficient_matrix() + self._generate_outcomes_vector() + + self.analyze_and_rank() + self.calculate_discordance() + 
self.simple_discordance_algorithm() + # output + indel_file = self.base_outputname + "indel.json" + trace_file = self.base_outputname + "trace.json" + + generate_discordance_indel_files(self, self.results, to_file=indel_file) + generate_trace_files(self, to_file=trace_file) + + write_individual_contribs(self, to_file=self.base_outputname + "contribs.txt") + write_contribs_json(self, self.base_outputname + "contribs.json") + + + if self.allprops: + write_all_proposals_json(self, self.base_outputname + "allproposals.json") diff --git a/build/lib/ice/classes/sanger_object.py b/build/lib/ice/classes/sanger_object.py new file mode 100644 index 0000000..246de31 --- /dev/null +++ b/build/lib/ice/classes/sanger_object.py @@ -0,0 +1,216 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. 
The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. +""" + +import os +import numpy as np +from Bio import SeqIO + +from ice.utility.misc import running_mean +from ice.utility.misc import find_regions_greater_than + + +class SangerObject: + """ + + """ + + PRIMARY_BASE_CALL = 'PBAS1' + BASE_ORDER = 'FWO_1' + PEAK_LOCATIONS = 'PLOC1' + CHANNELS = ['DATA9', 'DATA10', 'DATA11', 'DATA12'] + + def __init__(self, verbose=False): + self.data = None + # self.left_trim_bases = 5 + # self.right_trim_bases = 5 + # self.alignment_offset = 0 + self.external_base_calls = None + self.path = None + self.basename = None + + def estimate_quality_scores(self): + ''' + a good enough way to estimate quality scores + As ICE uses the quality scores for determining windows for alignment, we just need a rough estimate + of base quality. If the primary color is 90% of the signal, the phred score is 50% of the maximum. + If the primary color is 80% of the total signal at that position, the phred score is 0% of the maximum. 
+ :return: list of phred scores + ''' + + MAX_PHRED = 60 + peak_values = self.get_peak_values() + phred_scores = [] + for idx, base in enumerate(peak_values['C']): + sum_values = 0 + max_value = 0 + for color in 'ACGT': + sum_values += peak_values[color][idx] + if peak_values[color][idx] > max_value: + max_value = peak_values[color][idx] + + if sum_values < 100 or sum_values > 5000: + phred_score = 0 + else: + max_value_percent = max_value / sum_values * 100 + + # linear relationship of PHRED score 0 to 60 and max_value_percent 80 to 100 + phred_score = max(3 * max_value_percent - 240, 0) + #check if absolute intensities are reasonable + + phred_scores.append(phred_score) + return phred_scores + + def initialize_from_path(self, ab1_file_path, *args, **kwargs): + ''' + + + :param ab1_file_path: + :param args: + :param kwargs: + :return: + + 'FWO_1': 'GATC' contains the base order + DATA9: G, DATA10: A, DATA11: T, DATA12: C + good documentation of what all the data channels are here: + https://github.com/biopython/biopython/blob/master/Bio/SeqIO/AbiIO.py + ''' + self.path = ab1_file_path + self.basename = os.path.basename(ab1_file_path) + record = SeqIO.read(ab1_file_path, 'abi') + + # In Biopython 1.74, some strings are converted to bytes in py3 + # Convert strings with bytes-object back to regular string + traces_ = record.annotations['abif_raw'] + traces = {} + for k, v in traces_.items(): + if isinstance(v, bytes): + v = v.decode() + else: + v = v + traces[k] = v + self.data = traces + + phred_scores = [] + for c in traces['PCON2']: + phred_scores.append(ord(c)) + self.phred_scores = phred_scores + + override_quality_score = kwargs.get('override_quality_score', False) + if np.count_nonzero(phred_scores) == 0 or override_quality_score: + self.phred_scores = self.estimate_quality_scores() + + def find_alignable_window(self, window_size=30, QUAL_CUTOFF=40): + ''' + This code finds the optimal window for aligning the edited trace to the reference sequence + 1) It 
looks at the phred scores and calculates a moving average + 2) It then selects the largest window where the moving average is greater than QUAL_CUTOFF + 3) This segment and other data is then returned in a dictionary + all coordinates are absolute with respect to the sanger object + ''' + + # TODO: are there any edge cases where an edit is so good that the alignable window would extend into + # regions where there is mismatch between sample and control? + + phreds = np.asarray(self.phred_scores) + # find a running mean + window_size = window_size + windowed_phred = running_mean(phreds, window_size) + + # find regions with windowed mean > QUAL_CUTOFF + y = find_regions_greater_than(windowed_phred, QUAL_CUTOFF) + + + + max_window_size = 0 + max_window = None + for region in y: + w_size = region[1] - region[0] + if w_size > max_window_size: + max_window_size = w_size + max_window = (region[0], region[1]) + + return { + 'regions': y, + 'max_window': max_window, + 'windowed_phred': windowed_phred + } + + @property + def primary_base_calls(self): + return self.data[SangerObject.PRIMARY_BASE_CALL] + + @property + def aligned_bases(self): + return self.primary_base_calls + + @property + def base_order(self): + return self.data[SangerObject.BASE_ORDER] + + @property + def peak_locations(self): + """ Returns the peak locations for the final sequence + + :return: + """ + return list(self.data[SangerObject.PEAK_LOCATIONS]) + + def get_peak_values(self): + output_dict = {} + for n, channel in enumerate(SangerObject.CHANNELS): + peak_data = np.array(self.data[channel]) + # peak_values = peak_data[self.peak_locations] + peak_values = peak_data[list(self.data[SangerObject.PEAK_LOCATIONS])] + output_dict[self.base_order[n]] = peak_values + + return output_dict + + def get_trough_position(self, loc1, side="left"): + if side == "left": + loc2 = loc1 - 1 + else: + loc2 = loc1 + 1 + return (self.peak_locations[loc1] + + self.peak_locations[loc2]) / 2 + + @classmethod + def 
trace_to_base_calls(cls, trace, base_order): + BASE_CALL_CUTOFF = 0.75 + assert len(trace) % 4 == 0 + seq_len = int(len(trace) / 4) + seq = "" + for base_idx in range(seq_len): + slice_values = trace[base_idx * 4: base_idx * 4 + 4] + slice_max = max(slice_values) + if slice_max > BASE_CALL_CUTOFF: + max_idx = slice_values.index(slice_max) + base = base_order[max_idx] + seq += base + else: + seq += 'N' + return seq diff --git a/build/lib/ice/outputs/__init__.py b/build/lib/ice/outputs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/ice/outputs/create_discordance_indel_files.py b/build/lib/ice/outputs/create_discordance_indel_files.py new file mode 100644 index 0000000..92813e7 --- /dev/null +++ b/build/lib/ice/outputs/create_discordance_indel_files.py @@ -0,0 +1,58 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. 
The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. +""" + +import json + +from ice.utility.misc import round_list + + +def generate_discordance_indel_files(sanger_analysis, ice_results, to_file=None): + # generate dict/json + data = {} + discord_plot = {} + discord_plot['bp'] = list(range(len(ice_results.control_discordances))) + discord_plot['control_discord'] = round_list(ice_results.control_discordances, 3) + discord_plot['edited_discord'] = round_list(ice_results.edited_discordances, 3) + discord_plot['aln_start'] = sanger_analysis.alignment.ctrl2sample_coords(sanger_analysis.alignment_window[0]) + discord_plot['aln_end'] = sanger_analysis.alignment.ctrl2sample_coords(sanger_analysis.alignment_window[1]) + discord_plot['cut_site'] = sanger_analysis.guide_targets[0].cutsite + discord_plot['inf_start'] = sanger_analysis.alignment.ctrl2sample_coords(sanger_analysis.inference_window[0]) + discord_plot['inf_end'] = sanger_analysis.alignment.ctrl2sample_coords(sanger_analysis.inference_window[1]) + data['discord_plot'] = discord_plot + + lowerbound = -2 * sanger_analysis.indel_max_size + upperbound = 1 * sanger_analysis.indel_max_size + editing_outcomes = {} + for i in range(lowerbound, upperbound, 1): + editing_outcomes[i] = round(sanger_analysis.results.aggregate[i] * 100, 2) + data['editing_outcomes'] = editing_outcomes + data['editing_eff'] = round(100 * sanger_analysis.results.edit_eff, 2) + data['r_sq'] = round(sanger_analysis.results.r_squared, 3) + + with open(to_file, 'w') as f: + json.dump(data, f, ensure_ascii=False) diff --git a/build/lib/ice/outputs/create_json.py b/build/lib/ice/outputs/create_json.py new file mode 100644 index 0000000..5ad0db9 --- /dev/null +++ 
b/build/lib/ice/outputs/create_json.py @@ -0,0 +1,75 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. 
+""" + +import json + + +def write_individual_contribs(sanger_analysis, to_file=None): + # human readable + with open(to_file, 'w') as out_file: + print('Relative contribution of each sequence (normalized)', file=out_file) + print("--------------------------------------------------------", file=out_file) + for entry in sanger_analysis.results.contribs: + if entry.x_rel > 0.0: + print('%02.04f \t %s \t %s' % (entry.x_rel, entry.summary, + entry.human_readable_sequence(bp_after_cutsite=150)), + file=out_file) + + +def write_contribs_json(sanger_analysis, to_file): + out_list = [] + for entry in sanger_analysis.results.contribs: + if entry.x_rel > 0.0: + abundance = round(entry.x_rel, 3) + out_list.append({'rel_abundance': abundance, + 'human_readable': entry.human_readable_sequence(), + 'indel': entry.summary_json, 'wt': entry.wildtype}) + + editing_outcomes = {} + for i in sanger_analysis.results.aggregate: + editing_outcomes[i] = round(sanger_analysis.results.aggregate[i] * 100, 2) + + output = {'contribs_list': out_list, 'editing_outcomes': editing_outcomes, 'multiplex': sanger_analysis.multiplex} + + with open(to_file, 'w') as out_file: + json.dump(output, out_file) + + +def write_all_proposals_json(sanger_analysis, to_file): + out_list = [] + for entry in sanger_analysis.results.contribs: + # if entry.x_rel > 0.0: + abundance = round(entry.x_rel, 3) + wt = False + if entry.summary == 0: + wt = True + out_list.append({'rel_abundance': abundance, + 'human_readable': entry.human_readable_sequence(), + 'indel': entry.summary, 'wt': wt}) + with open(to_file, 'w') as out_file: + json.dump(out_list, out_file) diff --git a/build/lib/ice/outputs/create_trace_files.py b/build/lib/ice/outputs/create_trace_files.py new file mode 100644 index 0000000..518d196 --- /dev/null +++ b/build/lib/ice/outputs/create_trace_files.py @@ -0,0 +1,156 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. 
+ +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. 
+""" + +import json +import xml.etree.ElementTree as ET + +ET.register_namespace("", "http://www.w3.org/2000/svg") + + +def get_edited_base_px_positions(sanger_analysis, sample, changed_bases, start_px): + changed_bases_px = [] + if sanger_analysis.donor_odn is not None: + for changed_base in changed_bases: + changed_base_left = sample.get_trough_position(changed_base, "left") - start_px + changed_base_right = sample.get_trough_position(changed_base, "right") - start_px + + if changed_base_right > 0 and changed_base_right > 0: + changed_bases_px.append((changed_base_left, changed_base_right)) + return changed_bases_px + + +def generate_chromatogram_data(data, start, end, cutsite, peaks, basecalls, basecoords): + channels = [('DATA9', 'G'), ('DATA10', 'A'), ('DATA11', 'T'), ('DATA12', 'C')] + output = {} + output['channels'] = {} + for (channel, base) in channels: + # axis.plot(data[c[0]][start:end], color=colors[c[1]]) + output['channels'][base] = data[channel][start:end] + output['cutsite'] = cutsite - start + output['basecalls'] = [] + for idx, base in enumerate(basecalls): + output['basecalls'].append([peaks[idx] - start, base, basecoords[0] + idx]) + return output + + +def reformat_to_json(data): + list_of_dicts = [] + basecall_ptr = 0 + for idx, val in enumerate(data['channels']['G']): + coord_vals = {} + for base in 'ATCG': + coord_vals[base] = data['channels'][base][idx] + if basecall_ptr < len(data['basecalls']) and idx == data['basecalls'][basecall_ptr][0]: + coord_vals['basecall'] = data['basecalls'][basecall_ptr][1] + coord_vals['basecall_bp_coord'] = data['basecalls'][basecall_ptr][2] + basecall_ptr += 1 + list_of_dicts.append(coord_vals) + output = {} + output['trace_data'] = list_of_dicts + for key in ['cutsite', 'guide_start', 'guide_end', 'pam_start', 'pam_end']: + if key in data: + output[key] = data[key] + return output + + +def generate_trace_files(sanger_analysis, to_file=None): + ctrl_sample = sanger_analysis.control_sample + edited_sample = 
sanger_analysis.edited_sample + + # convert base coords to trace coords + upstream_in_bp = sanger_analysis.guide_targets[0].cutsite - 35 + downstream_in_bp = sanger_analysis.guide_targets[0].cutsite + 30 + ctrl_start = ctrl_sample.peak_locations[upstream_in_bp] + ctrl_end = ctrl_sample.peak_locations[downstream_in_bp] + + ctrl_cutsite = ctrl_sample.get_trough_position(sanger_analysis.guide_targets[0].cutsite, "left") + + ctrl_peaks = ctrl_sample.peak_locations[upstream_in_bp:downstream_in_bp] + ctrl_basecalls = ctrl_sample.primary_base_calls[upstream_in_bp:downstream_in_bp] + ctrl_basecoords = (upstream_in_bp, downstream_in_bp) + + ctrl_data = generate_chromatogram_data(ctrl_sample.data, ctrl_start, + ctrl_end, ctrl_cutsite, ctrl_peaks, ctrl_basecalls, ctrl_basecoords) + guide_end_offset = len(sanger_analysis.gRNA_sequences[0]) - 1 + ctrl_data["guide_start"] = ctrl_sample.get_trough_position( + sanger_analysis.guide_targets[0].cutsite - sanger_analysis.guide_targets[0].cut_offset, "left") - ctrl_start + ctrl_data["guide_end"] = ctrl_sample.get_trough_position(sanger_analysis.guide_targets[0].cutsite + \ + (guide_end_offset - sanger_analysis.guide_targets[ + 0].cut_offset), "right") - ctrl_start + + if sanger_analysis.guide_targets[0].orientation == "fwd": + pam_end = ctrl_sample.get_trough_position(sanger_analysis.guide_targets[0].cutsite + ( + guide_end_offset - sanger_analysis.guide_targets[0].cut_offset) + 3, "right") + ctrl_data["pam_start"] = min(ctrl_data["guide_end"], pam_end - ctrl_start) + ctrl_data["pam_end"] = max(ctrl_data["guide_end"], pam_end - ctrl_start) + else: + pam_start = ctrl_sample.get_trough_position(sanger_analysis.guide_targets[0].cutsite - 6, "left") + ctrl_data["pam_start"] = min(pam_start - ctrl_start, ctrl_data["guide_start"]) + ctrl_data["pam_end"] = max(pam_start - ctrl_start, ctrl_data["guide_start"]) + + edited_start_bp = sanger_analysis.alignment.ctrl2sample_coords(upstream_in_bp, closest=True) + edited_end_bp = 
sanger_analysis.alignment.ctrl2sample_coords(downstream_in_bp, closest=True) + edited_cutsite_bp = sanger_analysis.alignment.ctrl2sample_coords(sanger_analysis.guide_targets[0].cutsite, + closest=True) + edited_start = edited_sample.peak_locations[edited_start_bp] + edited_end = edited_sample.peak_locations[edited_end_bp] + + edited_peaks = edited_sample.peak_locations[edited_start_bp:edited_end_bp] + edited_basecalls = edited_sample.primary_base_calls[edited_start_bp:edited_end_bp] + edited_basecoords = (edited_start_bp, edited_end_bp) + + if sanger_analysis.guide_targets[0].orientation == 'fwd': + edited_cutsite = (edited_sample.peak_locations[edited_cutsite_bp] + + edited_sample.peak_locations[edited_cutsite_bp - 1]) / 2 + else: + edited_cutsite = (edited_sample.peak_locations[edited_cutsite_bp] + + edited_sample.peak_locations[edited_cutsite_bp - 1]) / 2 + + edited_data = generate_chromatogram_data(edited_sample.data, edited_start, + edited_end, edited_cutsite, edited_peaks, edited_basecalls, + edited_basecoords) + + ctrl_data['donor_odn_bases'] = get_edited_base_px_positions(sanger_analysis, ctrl_sample, + sanger_analysis.recombination_changed_bases, + ctrl_start) + + edited_bases = [] + for base in sanger_analysis.recombination_changed_bases: + edited_bases.append(sanger_analysis.alignment.ctrl2sample_coords(base, closest=True)) + + edited_data['donor_odn_bases'] = get_edited_base_px_positions(sanger_analysis, edited_sample, + edited_bases, + edited_start) + + out_json = {} + + out_json['ctrl_sample'] = reformat_to_json(ctrl_data) + out_json['edited_sample'] = reformat_to_json(edited_data) + + with open(to_file, 'w') as f: + json.dump(out_json, f, ensure_ascii=False) diff --git a/build/lib/ice/tests/__init__.py b/build/lib/ice/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/ice/tests/fixtures.py b/build/lib/ice/tests/fixtures.py new file mode 100644 index 0000000..b4fbe61 --- /dev/null +++ b/build/lib/ice/tests/fixtures.py 
@@ -0,0 +1,108 @@ +import os +import shutil +import tempfile + +import pytest + +from ice.classes.pair_alignment import DonorAlignment, PairAlignment + + +@pytest.fixture +def data_dir(): + """ returns the test data directory """ + + path = os.path.dirname(os.path.realpath(__file__)) + test_data = os.path.join(path, "test_data/") + return test_data + + +@pytest.fixture +def example_alignment(): + """ returns an example alignment obj """ + seq1 = 'AATGTAATGATAG' + seq2 = 'AATGTATGATAG' + pa = PairAlignment(seq1, seq2) + return pa + + +@pytest.fixture +def donor_alignment_contiguous_insert(): + """ + Returns example of DonorAlignment with size 5 contiguous insert + Expected alignments: + aligned control seq 'CCCCTGAAATGTA-----ATGATAGCC' + aligned donor seq '----TGAAATGTATTTTTATGATAGCC' + """ + control_seq = 'CCCCTGAAATGTAATGATAGCC' + donor_seq = 'TGAAATGTATTTTTATGATAGCC' + return DonorAlignment(control_seq, donor_seq) + + +@pytest.fixture +def donor_alignment_noncontiguous_insert(): + """ + Returns example of DonorAlignment with size 9 noncontiguous insert + Expected alignments: + aligned control seq 'CCCCTGAAATGTA-----ATGATAGCC--ATGACT' + aligned donor seq '----TGAAATGTATTTTTATGATAGCCAATTGACT' + """ + control_seq = 'CCCCTGAAATGTAATGATAGCCATGACT' + donor_seq = 'TGAAATGTATTTTTATGATAGCCAATTGACT' + return DonorAlignment(control_seq, donor_seq) + + +@pytest.fixture +def donor_alignment_insert_and_deletion(): + """ + Returns example of DonorAlignment with size 5 insert and size 2 deletion + Expected alignments: + aligned control seq 'CCCCTGAAATGTA-----ATGATAGCCAATTGACT' + aligned donor seq '----TGAAATGTATTTTTATGATAGCC--TTGACT' + """ + control_seq = 'CCCCTGAAATGTAATGATAGCCAATTGACT' + donor_seq = 'TGAAATGTATTTTTATGATAGCCTTGACT' + return DonorAlignment(control_seq, donor_seq) + + +@pytest.fixture +def donor_alignment_deletion(): + """ + Returns example of DonorAlignment with size 4 deletion + Expected alignments: + aligned control seq 'CCCCTGAAATGTAATGATAGCCAATTGACT' + 
aligned donor seq '----TGAAATGTAATGATAG----TTGACT' + """ + control_seq = 'CCCCTGAAATGTAATGATAGCCAATTGACT' + donor_seq = 'TGAAATGTAATGATAGTTGACT' + return DonorAlignment(control_seq, donor_seq) + + +@pytest.fixture +def shorter_alignment(): + """ returns an example alignment obj """ + seq1 = 'AATGTAATGATAG' + seq3 = 'AATGTATGATAGTGGG' + pa = PairAlignment(seq1, seq3) + return pa + + +@pytest.fixture +def no_alignment(): + """ returns an example alignment obj """ + seq1 = 'AATGTAATGATAG' + seq4 = 'GGACAGACTTAAA' + pa = PairAlignment(seq1, seq4) + return pa + + +@pytest.yield_fixture(scope='session', autouse=True) +def temp_dir(): + """ path to a temporary directory that is deleted after testing""" + + # Will be executed before the first test + dirpath = tempfile.mkdtemp() + + yield dirpath + + # Will be executed after the last test + shutil.rmtree(dirpath) diff --git a/build/lib/ice/tests/test_alignment.py b/build/lib/ice/tests/test_alignment.py new file mode 100644 index 0000000..4f4d236 --- /dev/null +++ b/build/lib/ice/tests/test_alignment.py @@ -0,0 +1,120 @@ +import pytest + +from ice.classes.pair_alignment import PairAlignment +from ice.tests.fixtures import example_alignment, shorter_alignment, no_alignment, donor_alignment_contiguous_insert,\ + donor_alignment_noncontiguous_insert, donor_alignment_insert_and_deletion, donor_alignment_deletion + + +def test_windowed_alignment(example_alignment): + pa = example_alignment + pa.align_with_window([0, 3]) + assert pa.aln_seqs[0] == "AAT---------" + assert pa.aln_seqs[1] == "AATGTATGATAG" + + +def test_coord_conversion(): + ''' + GGGAATGTCCTGATAG + ---AATGT--TGATAG + :return: + ''' + seq1 = 'GGGAATGTCCTGATAG' + seq2 = 'AATGTTGATAG' + pa = PairAlignment(seq1, seq2) + pa.align_with_window([0, 14]) + assert 2 == pa.ctrl2sample_coords(5) + assert 0 == pa.ctrl2sample_coords(3) + assert 9 == pa.ctrl2sample_coords(14) + assert None == pa.ctrl2sample_coords(9) + assert 4 == pa.ctrl2sample_coords(9, closest=True) + + with 
pytest.raises(Exception): + pa.ctrl2sample_coords(None) + + +def test_no_aln_possible(): + seq1 = 'TTTTTTTTTT' + seq2 = 'AAAAAAAAAAGGG' + pa = PairAlignment(seq1, seq2) + flag, msg = pa.align_with_window([0, 14]) + assert not flag + assert 'No alignment found' in msg + + +def test_no_aln_done(example_alignment): + pa = example_alignment + with pytest.raises(Exception): + pa.ctrl2sample_coords(5) + + +def test_all_alignment(example_alignment): + pa = example_alignment + pa.align_all() + assert "AATGT-ATGATAG" in pa.all_aligned_clustal + + with pytest.raises(Exception): + pa.ctrl2sample_coords(None) + + # only windowed alignment results in alingment_pairs + with pytest.raises(Exception): + assert 7 == pa.ctrl2sample_coords(8) + + +def test_aln_str(example_alignment): + pa = example_alignment + pa.align_all() + assert (str(pa) == 'AATGTAATGATAG\nAATGT-ATGATAG') or \ + (str(pa) == 'AATGTAATGATAG\nAATGTA-TGATAG') + + +def test_aln_data(example_alignment): + ''' + The alignment after windowing should ignore the last base of the ref seq + + control AATGTAATGATA + edited AATGTATGATAG + + ''' + pa = example_alignment + pa.align_with_window([0, 3]) + for idx, pair in enumerate(pa.alignment_pairs): + assert pair[0] == pair[1] + + +def test_aln_shorter_sample(shorter_alignment): + ''' + + The alignment after windowing should have gaps + AATGTAATGATAG--- + AATGTATGATAGTGGG + + ''' + pa = shorter_alignment + + pa.align_with_window([0, 3]) + + for idx, pair in enumerate(pa.alignment_pairs): + if idx < 13: + assert pair[0] == pair[1] + else: + assert pair[0] == None + + +def test_no_aln(no_alignment): + ''' + This alignment should be None + ''' + + pa = no_alignment + pa.align_all() + assert pa.alignment_pairs is None + + +def test_donor_alignment_hdr_indel_size(donor_alignment_contiguous_insert, + donor_alignment_noncontiguous_insert, + donor_alignment_insert_and_deletion, + donor_alignment_deletion): + assert donor_alignment_contiguous_insert.hdr_indel_size == 5 + assert 
donor_alignment_noncontiguous_insert.hdr_indel_size == 7 + assert donor_alignment_insert_and_deletion.hdr_indel_size == 3 + assert donor_alignment_deletion.hdr_indel_size == -4 diff --git a/build/lib/ice/tests/test_analysis.py b/build/lib/ice/tests/test_analysis.py new file mode 100644 index 0000000..e0be493 --- /dev/null +++ b/build/lib/ice/tests/test_analysis.py @@ -0,0 +1,411 @@ +import os +from pprint import pprint as pp + +import pytest +from unittest.mock import MagicMock + +from ice.analysis import single_sanger_analysis +from ice.classes.sanger_analysis import SangerAnalysis +from ice.tests.fixtures import data_dir, temp_dir + + +def test_high_quality_sample(temp_dir, data_dir): + """ Running a good sample""" + + control = os.path.join(data_dir, "good_example_control.ab1") + sample = os.path.join(data_dir, "good_example_edited.ab1") + + output_path = os.path.join(temp_dir, 'high_quality') + guide = "AACCAGTTGCAGGCGCCCCA" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True, 'allprops': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice'] == 77 + allproposals = os.path.join(temp_dir, 'high_quality.allproposals.json') + assert os.path.exists(allproposals) + + +def test_lowercase_guide(temp_dir, data_dir): + """ Running a good sample""" + + control = os.path.join(data_dir, "good_example_control.ab1") + sample = os.path.join(data_dir, "good_example_edited.ab1") + + output_path = os.path.join(temp_dir, 'high_quality_lowercase') + guide = "aaccagttgcaggcgcccca" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True, 'allprops': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice'] == 77 + allproposals = os.path.join(temp_dir, 'high_quality_lowercase.allproposals.json') + assert os.path.exists(allproposals) + 
+ +def test_low_quality_control(temp_dir, data_dir): + # Low quality control but analysis still possible + control = os.path.join(data_dir, "low_quality_control2.ab1") + sample = os.path.join(data_dir, "low_quality_edited2.ab1") + guide = 'CUGGUCGCGCACGAUCAGGG' + output_path = os.path.join(temp_dir, 'low_quality') + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + assert 'Low quality control trace' in results['notes'] + + +def test_low_quality_control_fail(temp_dir, data_dir): + # Low quality control trace --> no analysis possible + bad_quality = os.path.join(data_dir, "low_quality_control.ab1") + sample = os.path.join(data_dir, "good_example_edited.ab1") + guide = 'CCCGCCTCGGCCTCCCTAA' + output_path = os.path.join(temp_dir, 'low_quality') + + job_args = (bad_quality, sample, output_path, guide) + job_kwargs = {'verbose': True} + results = single_sanger_analysis(*job_args, **job_kwargs) + + assert results['status'] == 'succeeded' + assert 'Control ab1 trace quality scores too low' in results['notes'] + + +def test_sanger_analysis_bad_path(temp_dir, data_dir): + sa = SangerAnalysis() + bad_file = os.path.join(data_dir, "no_file_here.xyz") + control = os.path.join(data_dir, "good_example_control.ab1") + sample = os.path.join(data_dir, "good_example_edited.ab1") + + guide = "CGGCUCAAAAGGACCAGACU" + with pytest.raises(Exception): + sa.initialize_with(bad_file, sample, guide) + with pytest.raises(Exception): + sa.initialize_with(control, bad_file, guide) + with pytest.raises(Exception): + sa.initialize_with(control, sample, 123) + with pytest.raises(Exception): + sa.initialize_with(control, sample, "1234") + + +def test_low_quality_sample(temp_dir, data_dir): + """ Running a low quality sample""" + + control = os.path.join(data_dir, "low_quality_control.ab1") + sample = os.path.join(data_dir, "low_quality_edited.ab1") + + output_path = os.path.join(temp_dir, 'low_quality') 
+ guide = "CGGCUCAAAAGGACCAGACU" + + # guide = 'AAAGUCAUCCAAGCCAAGUC' + # guide = 'CAAAAGGACCAGACUUGGCU' + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + # low quality sample + results = single_sanger_analysis(*job_args, **job_kwargs) + assert results['status'] == 'succeeded' + assert 'Sample ab1 low_quality_edited.ab1 quality scores too low' in results['notes'] + + +def test_bad_paths(temp_dir, data_dir): + control = os.path.join(data_dir, "does_not_exist.abcxyz") + sample = os.path.join(data_dir, "alignment_fail_edited.ab1") + output_path = os.path.join(temp_dir, 'bad_paths') + + guide = "AACCAGTTGCAGGCGCCCCA" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + with pytest.raises(Exception): + results = single_sanger_analysis(*job_args, **job_kwargs) + + good_control = os.path.join(data_dir, "low_quality_control.ab1") + bad_sample = os.path.join(data_dir, "does_not_exist.abcxyz") + job_args = (good_control, bad_sample, output_path, guide) + job_kwargs = {'verbose': True} + with pytest.raises(Exception): + results = single_sanger_analysis(*job_args, **job_kwargs) + + +def test_alignment_fail(temp_dir, data_dir): + + control = os.path.join(data_dir, "alignment_fail_control.ab1") + sample = os.path.join(data_dir, "alignment_fail_edited.ab1") + + output_path = os.path.join(temp_dir, 'alignment_fail') + guide = "AACCAGTTGCAGGCGCCCCA" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + try: + results = single_sanger_analysis(*job_args, **job_kwargs) + assert False + + except Exception as e: + pp(results) + assert results['status'] == 'succeeded' + assert "No alignment found between control and edited sample" in results['notes'] + + +def test_sequence_not_found(temp_dir, data_dir): + + control = os.path.join(data_dir, "bad_guide_sequence_control.ab1") + sample = os.path.join(data_dir, "bad_guide_sequence_edited.ab1") + + output_path = os.path.join(temp_dir, 
'bad_sequence') + guide = "AACCAGTTGCAGGCGCCCCA" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + try: + results = single_sanger_analysis(*job_args, **job_kwargs) + assert False + + except Exception as e: + pp(results) + assert results['status'] == 'succeeded' + assert 'guide AACCAGTTGCAGGCGCCCCA not found in control sequence' in results['notes'] + + +def test_donor_example(temp_dir, data_dir): + """ Running a good sample with HDR """ + + control = os.path.join(data_dir, 'donor_example_control.ab1') + sample = os.path.join(data_dir, 'donor_example_knockin.ab1') + + output_path = os.path.join(temp_dir, 'donor_example') + + # this should result in a +14 insertion + guide = 'AAGTGCAGCTCGTCCGGCGT' + donor = 'ATCCTCCCGGGAACGTCTCCACCAGCTTCCCTTCCAGCCGACGAGATTGATCTCCGACCCGACGAGCTGCACTTCCTGTCCAAGCACTTCCGCAGCTCAGAGAA' + + job_args = (control, sample, output_path, guide, donor) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert pytest.approx(results['hdr_pct']) == 33 + + +def test_donor_substitution_example(temp_dir, data_dir): + """ Running a good sample with donor containing only a 2bp substitution """ + control = os.path.join(data_dir, 'donor_sub_example_control.ab1') + sample = os.path.join(data_dir, 'donor_sub_example_edited.ab1') + + output_path = os.path.join(temp_dir, 'donor_example') + + # this should result in a 2bp substitution + guide = 'GCTGCTTCCTGGGGGCGCCT' + donor = 'AGCCTCAGGGCCTCACACCAGCCCATGTGGATGACCTGAGGGTCCTGTTTCCCATCCCACttCAGGCGCCCCCAGGAAGCAGCGGCGGGAGCGCACCACCTTCACCCGGAGCCAACTGGAGG' + + job_args = (control, sample, output_path, guide, donor) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert pytest.approx(results['hdr_pct']) == 44 + + +def test_multiplex_0_0(temp_dir, data_dir): + """ Running 
a good multiplex sample""" + + control = os.path.join(data_dir, "multiplex0_control.ab1") + sample = os.path.join(data_dir, "multiplex0_edited.ab1") + + output_path = os.path.join(temp_dir, 'multiplex0') + guide = "UGCCAGGAUCACCUCCGAGA" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice_d'] == 66 + + +def test_multiplex_0_1(temp_dir, data_dir): + """ Running a good multiplex sample""" + + control = os.path.join(data_dir, "multiplex0_control.ab1") + sample = os.path.join(data_dir, "multiplex0_edited.ab1") + + output_path = os.path.join(temp_dir, 'multiplex0') + guide = "CGAUAGGGGGGCCUUCUCGG" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice_d'] == 66 + + +def test_multiplex_0_2(temp_dir, data_dir): + """ Running a good multiplex sample""" + + control = os.path.join(data_dir, "multiplex0_control.ab1") + sample = os.path.join(data_dir, "multiplex0_edited.ab1") + + output_path = os.path.join(temp_dir, 'multiplex0') + guide = "GCGUCCUCUUAUCUUCUGCC" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice_d'] == 66 + + +def test_multiplex_1_0(temp_dir, data_dir): + """ Running a good multiplex sample""" + + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_edited.ab1") + + output_path = os.path.join(temp_dir, 'multiplex1') + guide = "UUCCCCUUAUUUAGAUAACU" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, 
**job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice_d'] == 67 + + +def test_multiplex_1_1(temp_dir, data_dir): + """ Running a good multiplex sample""" + + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_edited.ab1") + + output_path = os.path.join(temp_dir, 'multiplex1') + guide = "UCCCCUUAUUUAGAUAACUC" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice_d'] == 67 + + +def test_multiplex_three_guides(temp_dir, data_dir): + """ Running a good multiplex sample""" + + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_edited.ab1") + + output_path = os.path.join(temp_dir, 'multiplex1') + guide = "UUCCCCUUAUUUAGAUAACU,UCCCCUUAUUUAGAUAACUC,UGUUAACCAAAUUAUGAUGA" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice'] == 63 + assert len(results['guides']) == 3 + + +def test_multiplex_three_guides_no_editing(temp_dir, data_dir): + """ + Running a zero editing multiplex sample, with guides that are far apart + """ + + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_control.ab1") + + output_path = os.path.join(temp_dir, 'multiplex_no_editing_far_guides') + + guide = "UUCCCCUUAUUUAGAUAACU,UCCCCUUAUUUAGAUAACUC,TGAGTTTTTTTGTAAGTAGC" + + job_args = (control, sample, output_path, guide) + job_kwargs = {'verbose': True} + + results = single_sanger_analysis(*job_args, **job_kwargs) + + pp(results) + + assert results['status'] == 'succeeded' + assert results['ice'] == 0 + assert len(results['guides']) == 3 + + +def 
test_should_skip_proposal(): + fake_donor = 'ATCG' + size_5_insertion = MagicMock(hdr_indel_size=5) + mocked_sanger_analysis = MagicMock(_should_skip_proposal=SangerAnalysis._should_skip_proposal, donor_odn=fake_donor, + HDR_OVERLAP_FILTER_CUTOFF=3, donor_alignment=size_5_insertion) + + # does pass insert above cutoff that is same size as donor + assert mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, 5) + + # does not pass if there is not a donor_odn + mocked_sanger_analysis.donor_odn = None + assert not mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, 5) + mocked_sanger_analysis.donor_odn = fake_donor # add back fake donor for rest of tests + + # does not pass insert above cutoff that isn't same size as donor + assert not mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, 6) + + # does pass deletion above cutoff + size_7_deletion = MagicMock(hdr_indel_size=-7) + mocked_sanger_analysis.donor_alignment = size_7_deletion + assert mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, -7) + + # does not pass deletion above cutoff that isn't same size as donor + assert not mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, -5) + + # does not pass insert same size as deletion + assert not mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, 7) + + # does not pass insert equal to cutoff + size_3_insertion = MagicMock(hdr_indel_size=3) + mocked_sanger_analysis.donor_alignment = size_3_insertion + assert not mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, 3) + + # does not pass deletion below cutoff + size_2_deletion = MagicMock(hdr_indel_size=-2) + mocked_sanger_analysis.donor_alignment = size_2_deletion + assert not mocked_sanger_analysis._should_skip_proposal(mocked_sanger_analysis, -2) diff --git a/build/lib/ice/tests/test_batch.py b/build/lib/ice/tests/test_batch.py new file mode 100644 index 0000000..d2f722f --- /dev/null +++ 
b/build/lib/ice/tests/test_batch.py @@ -0,0 +1,57 @@ +import os + +from pprint import pprint as pp +import pytest + +from ice.analysis import multiple_sanger_analysis +from ice.tests.fixtures import data_dir, temp_dir +from ice.classes.edit_proposal_creator import EditProposalCreator + + +def test_batch_analysis(temp_dir, data_dir): + """ Execute a batch analysis""" + + definition_file = os.path.join(data_dir, 'batch_example.xlsx') + data_directory = data_dir + output_dir = os.path.join(temp_dir, 'batch_analysis') + + job_args = (definition_file, output_dir) + job_kwargs = { + 'verbose': True, + 'data_dir': data_directory + } + results = multiple_sanger_analysis(*job_args, **job_kwargs) + pp(results) + + for idx, result in enumerate(results): + if idx == 1: + assert result['sample_name'] == 'multiplex_example1' + guides = [] + expected_guides = ['UGCCAGGAUCACCUCCGAGA'.replace('U', 'T'), + 'CGAUAGGGGGGCCUUCUCGG'.replace('U', 'T'), + 'GCGUCCUCUUAUCUUCUGCC'.replace('U', 'T')] + for g in result['guides']: + guides.append(g['sequence']) + assert set(expected_guides) == set(guides) + if idx == 6: + assert result['sample_name'] == 'test_revcomp' + assert result['guides'][0]['sequence'] == 'CCAGAGGCTGATGCTCACCA' + if idx == 8: + msg = "Could not analyze homologous recombination case: " + msg += "Homology arms of length {} not found in control sequence".format( + EditProposalCreator.MIN_HOMOLOGY_ARM) + assert result['notes'] == msg + + +def test_bad_batch(temp_dir, data_dir): + definition_file = os.path.join(data_dir, 'bad_batch_definition.xlsx') + data_directory = data_dir + output_dir = os.path.join(temp_dir, 'bad_batch_analysis') + + job_args = (definition_file, output_dir) + job_kwargs = { + 'verbose': True, + 'data_dir': data_directory + } + results = multiple_sanger_analysis(*job_args, **job_kwargs) + assert not results diff --git a/build/lib/ice/tests/test_edit_proposal_creator.py b/build/lib/ice/tests/test_edit_proposal_creator.py new file mode 100644 index 
0000000..08ff764 --- /dev/null +++ b/build/lib/ice/tests/test_edit_proposal_creator.py @@ -0,0 +1,124 @@ +from ice.classes.edit_proposal_creator import EditProposalCreator + +import pytest + + +def test_single_cut(): + wt_basecalls = "GCACCTGTCCCCATAGAAAT" + epc = EditProposalCreator(wt_basecalls, use_ctrl_trace=False) + + proposal = epc.single_cut_edit_proposal(10, "test", del_before=4) + assert proposal.sequence == "GCACCTCCATAGAAAT" + + del_after_proposal = epc.single_cut_edit_proposal(10, "test", del_after=3) + assert del_after_proposal.sequence == "GCACCTGTCCTAGAAAT" + + ins_proposal = epc.single_cut_edit_proposal(6, "test", insertion=2) + assert ins_proposal.sequence.upper() == "GCACCTNNGTCCCCATAGAAAT" + + bad_config = epc.single_cut_edit_proposal(2, "bad", del_before=5) + assert bad_config.sequence == "ACCTGTCCCCATAGAAAT" + + +def test_human_readable_sequence_single_edit_proposal(): + wt_basecalls = 'ACTGCCTGCACCTGTCCCCATAGAAATCCGCTTTACGGAGAGTCACTAGACTAGGACCCCCATAAATTTCACGACAGGGACTAGACCTTGACTGGATA' + epc = EditProposalCreator(wt_basecalls, use_ctrl_trace=False) + proposal = epc.single_cut_edit_proposal(27, 'g1', del_before=4) + + # test default padding + seq = proposal.human_readable_sequence() + assert seq == 'TGCCTGCACCTGTCCCCATAG----|CCGCTTTACGGAGAGTCACTAGACTAGGACCCCCATAAATTTCACGACAG' + + # test custom padding + seq = proposal.human_readable_sequence(bp_before_cutsite=10, bp_after_cutsite=15) + assert seq == 'CCATAG----|CCGCTTTACGGAGAG' + + # test passing in out of bounds padding values (should default to returning the entire proposed sequence) + seq = proposal.human_readable_sequence(bp_before_cutsite=500, bp_after_cutsite=500) + assert seq == 'ACTGCCTGCACCTGTCCCCATAG----|CCGCTTTACGGAGAGTCACTAGACTAGGACCCCCATAAATTTCACGACAGGGACTAGACCTTGACTGGAT' + + # test passing in inf values (should return the the entire proposed sequence) + seq = proposal.human_readable_sequence(bp_before_cutsite=float('inf'), bp_after_cutsite=float('inf')) + assert seq == 
'ACTGCCTGCACCTGTCCCCATAG----|CCGCTTTACGGAGAGTCACTAGACTAGGACCCCCATAAATTTCACGACAGGGACTAGACCTTGACTGGAT' + + +def test_human_readable_sequence_multiplex_proposal(): + wt_basecalls = ('CCCATAAATCCGCTTTACAGTCACTAGACTAGGACCAATTTCACGACAGGGACTAGACCTTGCTGGATAGCACCTGTCCCCATAGAAATGGCTATGGA' + 'AAGCCTTTGGGTTATTTGCGGCACCTGTCCCCATAGAAATGGCTATGGAAAGCCTTTGGGTTATTTGCG') + epc = EditProposalCreator(wt_basecalls, use_ctrl_trace=False) + + # test close together dropout default padding + close_together_dropout_proposal = epc.multiplex_proposal(30, 50, 'g1', 'g2', dropout=True) + seq = close_together_dropout_proposal.human_readable_sequence() + assert seq == 'AAATCCGCTTTACAGTCACTAGACT|--------------------|GACTAGACCTTGCTGGATAGCACCTGTCCC' + + # test far apart dropout default padding + far_apart_dropout_proposal = epc.multiplex_proposal(30, 90, 'g1', 'g2', dropout=True) + seq = far_apart_dropout_proposal.human_readable_sequence() + assert seq == ('AAATCCGCTTTACAGTCACTAGACT|------------------------------------------------------------|GCTATGGAAAGC' + 'CTTTGGGTTATTT') + + # test far apart insertion default padding + far_apart_dropout_proposal = epc.multiplex_proposal(30, 90, 'g1', 'g2', dropout=False, cut1_ins=2, cut2_ins=4) + seq = far_apart_dropout_proposal.human_readable_sequence() + assert seq == ('ATCCGCTTTACAGTCACTAGACTnn|AGGACCAATTTCACGACAGGGACTAGACCTTGCTGGATAGCACCTGTCCCCATAGAAATGnnnn|GCTATGGA' + 'AAGCCTTTGGGTTATTT') + + +def test_multiplex_cut(): + wt_basecalls = "GCACCTGTCCCCATAGAAATGGCTATGGAAAGCCTTTGGGTTATTTGCG" + "GCACCTGTCC-CCATAGAAATGGCTATGGAAAGCCT-TTGGGTTATTTGCG" + epc = EditProposalCreator(wt_basecalls, use_ctrl_trace=False) + + dropout_proposal = epc.multiplex_proposal(10, 35, "test1", "test2", dropout=True) + assert dropout_proposal.sequence == "GCACCTGTCCTTGGGTTATTTGCGnnnnnnnnnnnnnnnnnnnnnnnnn" + assert dropout_proposal.summary.startswith('-25:md') + + dropout_ins_proposal = epc.multiplex_proposal(10, 35, "test1", "test2", dropout=True, cut1_ins=2, cut2_ins=3) + assert 
dropout_ins_proposal.sequence == "GCACCTGTCCnnTTGGGTTATTTGCGnnnnnnnnnnnnnnnnnnnnnnnnn" + + indep_cut = epc.multiplex_proposal(10, 35, "test1", "test2", cut1_del=(4, 0), cut2_del=(0, 3)) + assert indep_cut.sequence == "GCACCTCCATAGAAATGGCTATGGAAAGCCTGGTTATTTGCGnnnnnnn" + assert indep_cut.summary.startswith('-7:m-4[test1]') + + indep_cut2 = epc.multiplex_proposal(10, 35, "test1", "test2", cut1_del=(-4, 0), cut2_del=(3, 0)) + assert indep_cut2.sequence == "GCACCTGTCCCCATAGAAATGGCTATGGAAAGTTGGGTTATTTGCGnnn" + + +def test_bad_multiplex(): + wt_basecalls = "GCACCTGTCCCCATAGAAATGGCTATGGAAAGCCTTTGGGTTATTTGCG" + epc = EditProposalCreator(wt_basecalls, use_ctrl_trace=False) + with pytest.raises(Exception): + epc.multiplex_proposal(20, 10, "test1", "test2", dropout=True) + + +def test_wt_proposal(): + wt_basecalls = "GCACCTGTCCCCGGGGAAAT" + epc = EditProposalCreator(wt_basecalls, use_ctrl_trace=False) + + proposal = epc.single_cut_edit_proposal(10, "test") + assert proposal.sequence == "GCACCTGTCCCCGGGGAAAT" + + +def test_req_ctrl_trace(): + with pytest.raises(Exception): + epc = EditProposalCreator("ATCG", use_ctrl_trace=True) + + +def test_generated_trace(): + seq = "ATNCG" + + epc = EditProposalCreator(seq, use_ctrl_trace=False) + + expected_trace = {'A': [], 'T': [], 'C': [], 'G': []} + for base in seq: + if base == 'N': + for color in expected_trace.keys(): + expected_trace[color].append(0.25) + else: + for color in epc.base_order: + if color == base: + expected_trace[color].append(0.97) + else: + expected_trace[color].append(0.01) + assert epc.wt_trace == expected_trace diff --git a/build/lib/ice/tests/test_sanger_object.py b/build/lib/ice/tests/test_sanger_object.py new file mode 100644 index 0000000..31a6cd8 --- /dev/null +++ b/build/lib/ice/tests/test_sanger_object.py @@ -0,0 +1,9 @@ +from ice.classes.sanger_object import SangerObject + +def test_trace_to_seq(): + trace = [0,0,1,0] + trace += [0,0.75, 0.0, 0.25] + trace += [0.87, 0, 0, 0] + trace += [0, 0.751, 0.0, 
0.249] + base_order = "ATCG" + assert "CNAT" == SangerObject.trace_to_base_calls(trace, base_order) \ No newline at end of file diff --git a/build/lib/ice/tests/test_sequence.py b/build/lib/ice/tests/test_sequence.py new file mode 100644 index 0000000..4ba883c --- /dev/null +++ b/build/lib/ice/tests/test_sequence.py @@ -0,0 +1,45 @@ +from ice.utility import sequence +import pytest + +def test_rev_comp(): + seq = "ATTCCG" + + assert sequence.reverse_complement(seq) == "CGGAAT" + +def test_complement(): + seq = "ATTCCG" + + assert sequence.complement(seq) == "TAAGGC" + +def test_rev_transcribe(): + seq = "GCCUuAAA" + assert sequence.reverse_transcribe(seq) == "CGGAaTTT" + + +def test_transcribe(): + seq = "GCCAAATTtt" + assert sequence.transcribe(seq) == "CGGUUUAAaa" + + +def test_is_nuc_acid(): + good_seq = "GCCAat" + assert sequence.is_nuc_acid(good_seq) + + bad_seq = "AAAAXGG" + assert not sequence.is_nuc_acid(bad_seq) + + not_str = 1234 + assert not sequence.is_nuc_acid(not_str) + + +def test_rna2dna(): + rna = "UGUUAACG" + + assert sequence.RNA2DNA(rna) == "TGTTAACG" + + with pytest.raises(Exception): + assert sequence.RNA2DNA(123) + +def test_dna2rna(): + dna = "TGTTAAGC" + assert sequence.DNA2RNA(dna) == "UGUUAAGC" \ No newline at end of file diff --git a/build/lib/ice/tests/test_utility.py b/build/lib/ice/tests/test_utility.py new file mode 100644 index 0000000..39096a1 --- /dev/null +++ b/build/lib/ice/tests/test_utility.py @@ -0,0 +1,44 @@ +import pytest + +from ice.utility.misc import version +from ice.utility.sequence import reverse_transcribe, transcribe, is_nuc_acid, reverse_complement, DNA2RNA, RNA2DNA + + +def test_sequence_util(): + + sequence = 'ACTG' + rna_sequence = 'ACUG' + + transcribed = transcribe(sequence) + print(transcribed) + + reverse_comp = reverse_complement(sequence) + print(reverse_comp) + + reverse_transcribed_seq = reverse_transcribe(rna_sequence) + print(reverse_transcribed_seq) + + assert is_nuc_acid('A') + assert not 
is_nuc_acid('L') + + assert not is_nuc_acid(None) + + assert RNA2DNA('U') == 'T' + assert DNA2RNA('T') == 'U' + + try: + RNA2DNA('L') + except Exception as e: + assert 'Input sequence L is not valid nucleic acid sequence' in str(e) + + +def test_dna_bad_input(): + with pytest.raises(Exception): + DNA2RNA('L') + + +def test_version(): + + v = version() + + assert v is not None diff --git a/build/lib/ice/utility/__init__.py b/build/lib/ice/utility/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/ice/utility/misc.py b/build/lib/ice/utility/misc.py new file mode 100644 index 0000000..c813b4d --- /dev/null +++ b/build/lib/ice/utility/misc.py @@ -0,0 +1,85 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. 
The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. +""" + +import numpy as np +import os +import subprocess +from ice.__version__ import __version__ + + +def round_list(in_list, num_digits): + return [round(i, num_digits) for i in in_list] + + +def running_mean(x, n): + """ + https://stackoverflow.com/questions/13728392/moving-average-or-running-mean + """ + + cumsum = np.cumsum(np.insert(x, 0, 0)) + return (cumsum[n:] - cumsum[:-n]) / n + + +def version(): + return __version__ + + +def contiguous_regions(condition): + """Finds contiguous True regions of the boolean array "condition". Returns + a 2D array where the first column is the start index of the region and the + second column is the end index. + + # From + # https://stackoverflow.com/questions/4494404/find-large-number-of-consecutive-values-fulfilling-condition-in-a-numpy-array + + """ + + # Find the indicies of changes in "condition" + d = np.diff(condition) + idx, = d.nonzero() + + # We need to start things after the change in "condition". Therefore, + # we'll shift the index by 1 to the right. 
+ idx += 1 + + if condition[0]: + # If the start of condition is True prepend a 0 + idx = np.r_[0, idx] + + if condition[-1]: + # If the end of condition is True, append the length of the array + idx = np.r_[idx, condition.size] # Edit + + # Reshape the result into two columns + idx.shape = (-1, 2) + return idx + + +def find_regions_greater_than(array, comparison): + condition = array > comparison + return contiguous_regions(condition) diff --git a/build/lib/ice/utility/sequence.py b/build/lib/ice/utility/sequence.py new file mode 100644 index 0000000..2edfc94 --- /dev/null +++ b/build/lib/ice/utility/sequence.py @@ -0,0 +1,92 @@ +""" +Copyright 2018 Synthego Corporation All Rights Reserved + +The Synthego ICE software was developed at Synthego Corporation. + +Permission to use, copy, modify and distribute any part of Synthego ICE for +educational, research and non-profit purposes, without fee, and without a +written agreement is hereby granted, provided that the above copyright notice, +this paragraph and the following paragraphs appear in all copies. + +Those desiring to incorporate this Synthego ICE software into commercial +products or use for commercial purposes should contact Synthego support at Ph: +(888) 611-6883 ext:1, E-MAIL: support@synthego.com. + +In no event shall Synthego Corporation be liable to any party for direct, +indirect, special, incidental, or consequential damages, including lost +profits, arising out of the use of Synthego ICE, even if Synthego Corporation +has been advised of the possibility of such damage. + +The Synthego ICE tool provided herein is on an "as is" basis, and the Synthego +Corporation has no obligation to provide maintenance, support, updates, +enhancements, or modifications. 
The Synthego Corporation makes no +representations and extends no warranties of any kind, either implied or +express, including, but not limited to, the implied warranties of +merchantability or fitness for a particular purpose, or that the use of +Synthego ICE will not infringe any patent, trademark or other rights. +""" + +DNA_COMPLEMENT = { + 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', + 'a': 't', 't': 'a', 'c': 'g', 'g': 'c' +} + +RT_MAP = { + 'U': 'A', 'A': 'T', 'C': 'G', 'G': 'C', + 'u': 'a', 'a': 't', 'c': 'g', 'g': 'c', +} + +VALID_RNA = ['A', 'C', 'G', 'U', 'a', 'c', 'g', 'u'] + +VALID_NUC_ACID = ['A', 'C', 'G', 'T', 'U'] + +TRANS_MAP = {v: k for k, v in RT_MAP.items()} + + +def reverse_transcribe(input_seq): + output = '' + for c in input_seq: + output += RT_MAP.get(c, 'x') + return output + + +def transcribe(input_seq): + output = '' + for c in input_seq: + output += TRANS_MAP.get(c, 'x') + return output + + +def reverse_complement(input_seq): + output = '' + for c in reversed(input_seq): + output += DNA_COMPLEMENT.get(c, 'x') + return output + + +def complement(input_seq): + output = '' + for c in input_seq: + output += DNA_COMPLEMENT.get(c, 'x') + return output + + +def is_nuc_acid(input_seq): + try: + return all(nuc in VALID_NUC_ACID for nuc in input_seq.upper()) + except Exception: + return False + + +def RNA2DNA(input_seq): + if is_nuc_acid(input_seq): + return input_seq.replace('U', 'T').replace('u', 't') + else: + raise Exception("Input sequence {} is not valid nucleic acid sequence".format(input_seq)) + + +def DNA2RNA(input_seq): + if is_nuc_acid(input_seq): + return input_seq.replace('T', 'U').replace('t', 'u') + else: + raise Exception("Input sequence {} is not valid nucleic acid sequence".format(input_seq)) diff --git a/ice/__version__.py b/ice/__version__.py index b3ddbc4..58d478a 100644 --- a/ice/__version__.py +++ b/ice/__version__.py @@ -1 +1 @@ -__version__ = '1.1.1' +__version__ = '1.2.0' diff --git a/ice/tests/test_alignment.py 
b/ice/tests/test_alignment.py index ac4a729..4f4d236 100644 --- a/ice/tests/test_alignment.py +++ b/ice/tests/test_alignment.py @@ -5,9 +5,9 @@ donor_alignment_noncontiguous_insert, donor_alignment_insert_and_deletion, donor_alignment_deletion -def test_windowed_alignment(): - pa = example_alignment() - pa.align_with_window([0,3]) +def test_windowed_alignment(example_alignment): + pa = example_alignment + pa.align_with_window([0, 3]) assert pa.aln_seqs[0] == "AAT---------" assert pa.aln_seqs[1] == "AATGTATGATAG" @@ -21,7 +21,7 @@ def test_coord_conversion(): seq1 = 'GGGAATGTCCTGATAG' seq2 = 'AATGTTGATAG' pa = PairAlignment(seq1, seq2) - pa.align_with_window([0,14]) + pa.align_with_window([0, 14]) assert 2 == pa.ctrl2sample_coords(5) assert 0 == pa.ctrl2sample_coords(3) assert 9 == pa.ctrl2sample_coords(14) @@ -41,33 +41,33 @@ def test_no_aln_possible(): assert 'No alignment found' in msg -def test_no_aln_done(): - pa = example_alignment() +def test_no_aln_done(example_alignment): + pa = example_alignment with pytest.raises(Exception): pa.ctrl2sample_coords(5) -def test_all_alignment(): - pa = example_alignment() +def test_all_alignment(example_alignment): + pa = example_alignment pa.align_all() assert "AATGT-ATGATAG" in pa.all_aligned_clustal with pytest.raises(Exception): pa.ctrl2sample_coords(None) - #only windowed alignment results in alingment_pairs + # only windowed alignment results in alingment_pairs with pytest.raises(Exception): assert 7 == pa.ctrl2sample_coords(8) -def test_aln_str(): - pa = example_alignment() +def test_aln_str(example_alignment): + pa = example_alignment pa.align_all() assert (str(pa) == 'AATGTAATGATAG\nAATGT-ATGATAG') or \ - (str(pa) == 'AATGTAATGATAG\nAATGTA-TGATAG') + (str(pa) == 'AATGTAATGATAG\nAATGTA-TGATAG') -def test_aln_data(): +def test_aln_data(example_alignment): ''' The alignment after windowing should ignore the last base of the ref seq @@ -75,13 +75,13 @@ def test_aln_data(): edited AATGTATGATAG ''' - pa = 
example_alignment() + pa = example_alignment pa.align_with_window([0, 3]) for idx, pair in enumerate(pa.alignment_pairs): assert pair[0] == pair[1] -def test_aln_shorter_sample(): +def test_aln_shorter_sample(shorter_alignment): ''' The alignment after windowing should have gaps @@ -89,7 +89,7 @@ def test_aln_shorter_sample(): AATGTATGATAGTGGG ''' - pa = shorter_alignment() + pa = shorter_alignment pa.align_with_window([0, 3]) @@ -100,18 +100,21 @@ def test_aln_shorter_sample(): assert pair[0] == None -def test_no_aln(): +def test_no_aln(no_alignment): ''' This alignment should be None ''' - pa = no_alignment() + pa = no_alignment pa.align_all() assert pa.alignment_pairs is None -def test_donor_alignment_hdr_indel_size(): - assert donor_alignment_contiguous_insert().hdr_indel_size == 5 - assert donor_alignment_noncontiguous_insert().hdr_indel_size == 7 - assert donor_alignment_insert_and_deletion().hdr_indel_size == 3 - assert donor_alignment_deletion().hdr_indel_size == -4 +def test_donor_alignment_hdr_indel_size(donor_alignment_contiguous_insert, + donor_alignment_noncontiguous_insert, + donor_alignment_insert_and_deletion, + donor_alignment_deletion): + assert donor_alignment_contiguous_insert.hdr_indel_size == 5 + assert donor_alignment_noncontiguous_insert.hdr_indel_size == 7 + assert donor_alignment_insert_and_deletion.hdr_indel_size == 3 + assert donor_alignment_deletion.hdr_indel_size == -4 diff --git a/ice/tests/test_analysis.py b/ice/tests/test_analysis.py index bda4074..e0be493 100644 --- a/ice/tests/test_analysis.py +++ b/ice/tests/test_analysis.py @@ -9,56 +9,54 @@ from ice.tests.fixtures import data_dir, temp_dir -def test_high_quality_sample(temp_dir): +def test_high_quality_sample(temp_dir, data_dir): """ Running a good sample""" - control = os.path.join(data_dir(), "good_example_control.ab1") - sample = os.path.join(data_dir(), "good_example_edited.ab1") + control = os.path.join(data_dir, "good_example_control.ab1") + sample = 
os.path.join(data_dir, "good_example_edited.ab1") output_path = os.path.join(temp_dir, 'high_quality') guide = "AACCAGTTGCAGGCGCCCCA" job_args = (control, sample, output_path, guide) - job_kwargs = {'verbose': True, 'allprops':True} - + job_kwargs = {'verbose': True, 'allprops': True} results = single_sanger_analysis(*job_args, **job_kwargs) pp(results) assert results['status'] == 'succeeded' - assert results['ice'] == 78 + assert results['ice'] == 77 allproposals = os.path.join(temp_dir, 'high_quality.allproposals.json') assert os.path.exists(allproposals) -def test_lowercase_guide(temp_dir): +def test_lowercase_guide(temp_dir, data_dir): """ Running a good sample""" - control = os.path.join(data_dir(), "good_example_control.ab1") - sample = os.path.join(data_dir(), "good_example_edited.ab1") + control = os.path.join(data_dir, "good_example_control.ab1") + sample = os.path.join(data_dir, "good_example_edited.ab1") output_path = os.path.join(temp_dir, 'high_quality_lowercase') guide = "aaccagttgcaggcgcccca" job_args = (control, sample, output_path, guide) - job_kwargs = {'verbose': True, 'allprops':True} - + job_kwargs = {'verbose': True, 'allprops': True} results = single_sanger_analysis(*job_args, **job_kwargs) pp(results) assert results['status'] == 'succeeded' - assert results['ice'] == 78 + assert results['ice'] == 77 allproposals = os.path.join(temp_dir, 'high_quality_lowercase.allproposals.json') assert os.path.exists(allproposals) -def test_low_quality_control(temp_dir): - #Low quality control but analysis still possible - control = os.path.join(data_dir(), "low_quality_control2.ab1") - sample = os.path.join(data_dir(), "low_quality_edited2.ab1") +def test_low_quality_control(temp_dir, data_dir): + # Low quality control but analysis still possible + control = os.path.join(data_dir, "low_quality_control2.ab1") + sample = os.path.join(data_dir, "low_quality_edited2.ab1") guide = 'CUGGUCGCGCACGAUCAGGG' output_path = os.path.join(temp_dir, 'low_quality') @@ 
-69,14 +67,14 @@ def test_low_quality_control(temp_dir): assert 'Low quality control trace' in results['notes'] -def test_low_quality_control_fail(temp_dir): - #Low quality control trace --> no analysis possible - bad_quality = os.path.join(data_dir(), "low_quality_control.ab1") - sample = os.path.join(data_dir(), "good_example_edited.ab1") +def test_low_quality_control_fail(temp_dir, data_dir): + # Low quality control trace --> no analysis possible + bad_quality = os.path.join(data_dir, "low_quality_control.ab1") + sample = os.path.join(data_dir, "good_example_edited.ab1") guide = 'CCCGCCTCGGCCTCCCTAA' output_path = os.path.join(temp_dir, 'low_quality') - job_args = (bad_quality, sample , output_path, guide) + job_args = (bad_quality, sample, output_path, guide) job_kwargs = {'verbose': True} results = single_sanger_analysis(*job_args, **job_kwargs) @@ -84,11 +82,11 @@ def test_low_quality_control_fail(temp_dir): assert 'Control ab1 trace quality scores too low' in results['notes'] -def test_sanger_analysis_bad_path(temp_dir): +def test_sanger_analysis_bad_path(temp_dir, data_dir): sa = SangerAnalysis() - bad_file = os.path.join(data_dir(), "no_file_here.xyz") - control = os.path.join(data_dir(), "good_example_control.ab1") - sample = os.path.join(data_dir(), "good_example_edited.ab1") + bad_file = os.path.join(data_dir, "no_file_here.xyz") + control = os.path.join(data_dir, "good_example_control.ab1") + sample = os.path.join(data_dir, "good_example_edited.ab1") guide = "CGGCUCAAAAGGACCAGACU" with pytest.raises(Exception): @@ -101,11 +99,11 @@ def test_sanger_analysis_bad_path(temp_dir): sa.initialize_with(control, sample, "1234") -def test_low_quality_sample(temp_dir): +def test_low_quality_sample(temp_dir, data_dir): """ Running a low quality sample""" - control = os.path.join(data_dir(), "low_quality_control.ab1") - sample = os.path.join(data_dir(), "low_quality_edited.ab1") + control = os.path.join(data_dir, "low_quality_control.ab1") + sample = 
os.path.join(data_dir, "low_quality_edited.ab1") output_path = os.path.join(temp_dir, 'low_quality') guide = "CGGCUCAAAAGGACCAGACU" @@ -116,15 +114,15 @@ def test_low_quality_sample(temp_dir): job_args = (control, sample, output_path, guide) job_kwargs = {'verbose': True} - #low quality sample + # low quality sample results = single_sanger_analysis(*job_args, **job_kwargs) assert results['status'] == 'succeeded' assert 'Sample ab1 low_quality_edited.ab1 quality scores too low' in results['notes'] -def test_bad_paths(temp_dir): - control = os.path.join(data_dir(), "does_not_exist.abcxyz") - sample = os.path.join(data_dir(), "alignment_fail_edited.ab1") +def test_bad_paths(temp_dir, data_dir): + control = os.path.join(data_dir, "does_not_exist.abcxyz") + sample = os.path.join(data_dir, "alignment_fail_edited.ab1") output_path = os.path.join(temp_dir, 'bad_paths') guide = "AACCAGTTGCAGGCGCCCCA" @@ -134,18 +132,18 @@ def test_bad_paths(temp_dir): with pytest.raises(Exception): results = single_sanger_analysis(*job_args, **job_kwargs) - good_control = os.path.join(data_dir(), "low_quality_control.ab1") - bad_sample = os.path.join(data_dir(), "does_not_exist.abcxyz") + good_control = os.path.join(data_dir, "low_quality_control.ab1") + bad_sample = os.path.join(data_dir, "does_not_exist.abcxyz") job_args = (good_control, bad_sample, output_path, guide) job_kwargs = {'verbose': True} with pytest.raises(Exception): results = single_sanger_analysis(*job_args, **job_kwargs) -def test_alignment_fail(temp_dir): +def test_alignment_fail(temp_dir, data_dir): - control = os.path.join(data_dir(), "alignment_fail_control.ab1") - sample = os.path.join(data_dir(), "alignment_fail_edited.ab1") + control = os.path.join(data_dir, "alignment_fail_control.ab1") + sample = os.path.join(data_dir, "alignment_fail_edited.ab1") output_path = os.path.join(temp_dir, 'alignment_fail') guide = "AACCAGTTGCAGGCGCCCCA" @@ -163,10 +161,10 @@ def test_alignment_fail(temp_dir): assert "No alignment found 
between control and edited sample" in results['notes'] -def test_sequence_not_found(temp_dir): +def test_sequence_not_found(temp_dir, data_dir): - control = os.path.join(data_dir(), "bad_guide_sequence_control.ab1") - sample = os.path.join(data_dir(), "bad_guide_sequence_edited.ab1") + control = os.path.join(data_dir, "bad_guide_sequence_control.ab1") + sample = os.path.join(data_dir, "bad_guide_sequence_edited.ab1") output_path = os.path.join(temp_dir, 'bad_sequence') guide = "AACCAGTTGCAGGCGCCCCA" @@ -184,11 +182,11 @@ def test_sequence_not_found(temp_dir): assert 'guide AACCAGTTGCAGGCGCCCCA not found in control sequence' in results['notes'] -def test_donor_example(temp_dir): +def test_donor_example(temp_dir, data_dir): """ Running a good sample with HDR """ - control = os.path.join(data_dir(), 'donor_example_control.ab1') - sample = os.path.join(data_dir(), 'donor_example_knockin.ab1') + control = os.path.join(data_dir, 'donor_example_control.ab1') + sample = os.path.join(data_dir, 'donor_example_knockin.ab1') output_path = os.path.join(temp_dir, 'donor_example') @@ -204,13 +202,13 @@ def test_donor_example(temp_dir): pp(results) assert results['status'] == 'succeeded' - assert pytest.approx(results['hdr_pct']) == 29.70283309122449 + assert pytest.approx(results['hdr_pct']) == 33 -def test_donor_substitution_example(temp_dir): +def test_donor_substitution_example(temp_dir, data_dir): """ Running a good sample with donor containing only a 2bp substitution """ - control = os.path.join(data_dir(), 'donor_sub_example_control.ab1') - sample = os.path.join(data_dir(), 'donor_sub_example_edited.ab1') + control = os.path.join(data_dir, 'donor_sub_example_control.ab1') + sample = os.path.join(data_dir, 'donor_sub_example_edited.ab1') output_path = os.path.join(temp_dir, 'donor_example') @@ -226,14 +224,14 @@ def test_donor_substitution_example(temp_dir): pp(results) assert results['status'] == 'succeeded' - assert pytest.approx(results['hdr_pct']) == 49.42660823273503 + 
assert pytest.approx(results['hdr_pct']) == 44 -def test_multiplex_0_0(temp_dir): +def test_multiplex_0_0(temp_dir, data_dir): """ Running a good multiplex sample""" - control = os.path.join(data_dir(), "multiplex0_control.ab1") - sample = os.path.join(data_dir(), "multiplex0_edited.ab1") + control = os.path.join(data_dir, "multiplex0_control.ab1") + sample = os.path.join(data_dir, "multiplex0_edited.ab1") output_path = os.path.join(temp_dir, 'multiplex0') guide = "UGCCAGGAUCACCUCCGAGA" @@ -249,11 +247,11 @@ def test_multiplex_0_0(temp_dir): assert results['ice_d'] == 66 -def test_multiplex_0_1(temp_dir): +def test_multiplex_0_1(temp_dir, data_dir): """ Running a good multiplex sample""" - control = os.path.join(data_dir(), "multiplex0_control.ab1") - sample = os.path.join(data_dir(), "multiplex0_edited.ab1") + control = os.path.join(data_dir, "multiplex0_control.ab1") + sample = os.path.join(data_dir, "multiplex0_edited.ab1") output_path = os.path.join(temp_dir, 'multiplex0') guide = "CGAUAGGGGGGCCUUCUCGG" @@ -269,11 +267,11 @@ def test_multiplex_0_1(temp_dir): assert results['ice_d'] == 66 -def test_multiplex_0_2(temp_dir): +def test_multiplex_0_2(temp_dir, data_dir): """ Running a good multiplex sample""" - control = os.path.join(data_dir(), "multiplex0_control.ab1") - sample = os.path.join(data_dir(), "multiplex0_edited.ab1") + control = os.path.join(data_dir, "multiplex0_control.ab1") + sample = os.path.join(data_dir, "multiplex0_edited.ab1") output_path = os.path.join(temp_dir, 'multiplex0') guide = "GCGUCCUCUUAUCUUCUGCC" @@ -281,7 +279,6 @@ def test_multiplex_0_2(temp_dir): job_args = (control, sample, output_path, guide) job_kwargs = {'verbose': True} - results = single_sanger_analysis(*job_args, **job_kwargs) pp(results) @@ -290,11 +287,11 @@ def test_multiplex_0_2(temp_dir): assert results['ice_d'] == 66 -def test_multiplex_1_0(temp_dir): +def test_multiplex_1_0(temp_dir, data_dir): """ Running a good multiplex sample""" - control = 
os.path.join(data_dir(), "multiplex1_control.ab1") - sample = os.path.join(data_dir(), "multiplex1_edited.ab1") + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_edited.ab1") output_path = os.path.join(temp_dir, 'multiplex1') guide = "UUCCCCUUAUUUAGAUAACU" @@ -310,11 +307,11 @@ def test_multiplex_1_0(temp_dir): assert results['ice_d'] == 67 -def test_multiplex_1_1(temp_dir): +def test_multiplex_1_1(temp_dir, data_dir): """ Running a good multiplex sample""" - control = os.path.join(data_dir(), "multiplex1_control.ab1") - sample = os.path.join(data_dir(), "multiplex1_edited.ab1") + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_edited.ab1") output_path = os.path.join(temp_dir, 'multiplex1') guide = "UCCCCUUAUUUAGAUAACUC" @@ -322,7 +319,6 @@ def test_multiplex_1_1(temp_dir): job_args = (control, sample, output_path, guide) job_kwargs = {'verbose': True} - results = single_sanger_analysis(*job_args, **job_kwargs) pp(results) @@ -331,11 +327,11 @@ def test_multiplex_1_1(temp_dir): assert results['ice_d'] == 67 -def test_multiplex_three_guides(temp_dir): +def test_multiplex_three_guides(temp_dir, data_dir): """ Running a good multiplex sample""" - control = os.path.join(data_dir(), "multiplex1_control.ab1") - sample = os.path.join(data_dir(), "multiplex1_edited.ab1") + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_edited.ab1") output_path = os.path.join(temp_dir, 'multiplex1') guide = "UUCCCCUUAUUUAGAUAACU,UCCCCUUAUUUAGAUAACUC,UGUUAACCAAAUUAUGAUGA" @@ -343,33 +339,30 @@ def test_multiplex_three_guides(temp_dir): job_args = (control, sample, output_path, guide) job_kwargs = {'verbose': True} - results = single_sanger_analysis(*job_args, **job_kwargs) pp(results) assert results['status'] == 'succeeded' - assert results['ice'] == 69 + assert results['ice'] == 63 assert len(results['guides']) == 3 -def 
test_multiplex_three_guides_no_editing(temp_dir): +def test_multiplex_three_guides_no_editing(temp_dir, data_dir): """ Running a zero editing multiplex sample, with guides that are far apart """ - control = os.path.join(data_dir(), "multiplex1_control.ab1") - sample = os.path.join(data_dir(), "multiplex1_control.ab1") + control = os.path.join(data_dir, "multiplex1_control.ab1") + sample = os.path.join(data_dir, "multiplex1_control.ab1") output_path = os.path.join(temp_dir, 'multiplex_no_editing_far_guides') - guide = "UUCCCCUUAUUUAGAUAACU,UCCCCUUAUUUAGAUAACUC,TGAGTTTTTTTGTAAGTAGC" job_args = (control, sample, output_path, guide) job_kwargs = {'verbose': True} - results = single_sanger_analysis(*job_args, **job_kwargs) pp(results) diff --git a/ice/tests/test_batch.py b/ice/tests/test_batch.py index 6d39138..d2f722f 100644 --- a/ice/tests/test_batch.py +++ b/ice/tests/test_batch.py @@ -7,17 +7,18 @@ from ice.tests.fixtures import data_dir, temp_dir from ice.classes.edit_proposal_creator import EditProposalCreator -def test_batch_analysis(temp_dir): + +def test_batch_analysis(temp_dir, data_dir): """ Execute a batch analysis""" - definition_file = os.path.join(data_dir(), 'batch_example.xlsx') - data_directory = data_dir() + definition_file = os.path.join(data_dir, 'batch_example.xlsx') + data_directory = data_dir output_dir = os.path.join(temp_dir, 'batch_analysis') job_args = (definition_file, output_dir) job_kwargs = { 'verbose': True, - 'data_dir' : data_directory + 'data_dir': data_directory } results = multiple_sanger_analysis(*job_args, **job_kwargs) pp(results) @@ -37,21 +38,20 @@ def test_batch_analysis(temp_dir): assert result['guides'][0]['sequence'] == 'CCAGAGGCTGATGCTCACCA' if idx == 8: msg = "Could not analyze homologous recombination case: " - msg += "Homology arms of length {} not found in control sequence".format(EditProposalCreator.MIN_HOMOLOGY_ARM) + msg += "Homology arms of length {} not found in control sequence".format( + 
EditProposalCreator.MIN_HOMOLOGY_ARM) assert result['notes'] == msg -def test_bad_batch(temp_dir): - definition_file = os.path.join(data_dir(), 'bad_batch_definition.xlsx') - data_directory = data_dir() + +def test_bad_batch(temp_dir, data_dir): + definition_file = os.path.join(data_dir, 'bad_batch_definition.xlsx') + data_directory = data_dir output_dir = os.path.join(temp_dir, 'bad_batch_analysis') job_args = (definition_file, output_dir) job_kwargs = { 'verbose': True, - 'data_dir' : data_directory + 'data_dir': data_directory } results = multiple_sanger_analysis(*job_args, **job_kwargs) assert not results - - - diff --git a/ice/tests/test_edit_proposal_creator.py b/ice/tests/test_edit_proposal_creator.py index 0107a87..08ff764 100644 --- a/ice/tests/test_edit_proposal_creator.py +++ b/ice/tests/test_edit_proposal_creator.py @@ -56,13 +56,13 @@ def test_human_readable_sequence_multiplex_proposal(): far_apart_dropout_proposal = epc.multiplex_proposal(30, 90, 'g1', 'g2', dropout=True) seq = far_apart_dropout_proposal.human_readable_sequence() assert seq == ('AAATCCGCTTTACAGTCACTAGACT|------------------------------------------------------------|GCTATGGAAAGC' - 'CTTTGGGTTATTT') + 'CTTTGGGTTATTT') # test far apart insertion default padding far_apart_dropout_proposal = epc.multiplex_proposal(30, 90, 'g1', 'g2', dropout=False, cut1_ins=2, cut2_ins=4) seq = far_apart_dropout_proposal.human_readable_sequence() assert seq == ('ATCCGCTTTACAGTCACTAGACTnn|AGGACCAATTTCACGACAGGGACTAGACCTTGCTGGATAGCACCTGTCCCCATAGAAATGnnnn|GCTATGGA' - 'AAGCCTTTGGGTTATTT') + 'AAGCCTTTGGGTTATTT') def test_multiplex_cut(): @@ -71,18 +71,18 @@ def test_multiplex_cut(): epc = EditProposalCreator(wt_basecalls, use_ctrl_trace=False) dropout_proposal = epc.multiplex_proposal(10, 35, "test1", "test2", dropout=True) - assert dropout_proposal.sequence == "GCACCTGTCCTTGGGTTATTTGCG" + assert dropout_proposal.sequence == "GCACCTGTCCTTGGGTTATTTGCGnnnnnnnnnnnnnnnnnnnnnnnnn" assert 
dropout_proposal.summary.startswith('-25:md') dropout_ins_proposal = epc.multiplex_proposal(10, 35, "test1", "test2", dropout=True, cut1_ins=2, cut2_ins=3) - assert dropout_ins_proposal.sequence == "GCACCTGTCCnnTTGGGTTATTTGCG" + assert dropout_ins_proposal.sequence == "GCACCTGTCCnnTTGGGTTATTTGCGnnnnnnnnnnnnnnnnnnnnnnnnn" indep_cut = epc.multiplex_proposal(10, 35, "test1", "test2", cut1_del=(4, 0), cut2_del=(0, 3)) - assert indep_cut.sequence == "GCACCTCCATAGAAATGGCTATGGAAAGCCTGGTTATTTGCG" + assert indep_cut.sequence == "GCACCTCCATAGAAATGGCTATGGAAAGCCTGGTTATTTGCGnnnnnnn" assert indep_cut.summary.startswith('-7:m-4[test1]') indep_cut2 = epc.multiplex_proposal(10, 35, "test1", "test2", cut1_del=(-4, 0), cut2_del=(3, 0)) - assert indep_cut2.sequence == "GCACCTGTCCCCATAGAAATGGCTATGGAAAGTTGGGTTATTTGCG" + assert indep_cut2.sequence == "GCACCTGTCCCCATAGAAATGGCTATGGAAAGTTGGGTTATTTGCGnnn" def test_bad_multiplex(): @@ -110,7 +110,7 @@ def test_generated_trace(): epc = EditProposalCreator(seq, use_ctrl_trace=False) - expected_trace = {'A':[], 'T':[], 'C':[], 'G':[]} + expected_trace = {'A': [], 'T': [], 'C': [], 'G': []} for base in seq: if base == 'N': for color in expected_trace.keys(): diff --git a/requirements.txt b/requirements.txt index 7704827..8287fa9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pytest==3.4.2 +pytest==5.2.2 biopython>=1.70 coverage>=4.4.1 pytest-cov==2.5.1