# Dual CRISPR Screen Analysis
# Step 3: Construct Counting
Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

## Instructions

To run this notebook reproducibly, follow these steps:
1. Click **Kernel** > **Restart & Clear Output**
2. When prompted, click the red **Restart & clear all outputs** button
3. Fill in the values for your analysis for each of the variables in the [Input Parameters](#Input-Parameters) section
4. Click **Cell** > **Run All**

## Input Parameters

In [1]:
# Processor count passed to the parallel fastq-pair processing call.
g_num_processors = 3

# Directory holding the filtered fastq files that will be counted.
g_filtered_fastqs_dir = (
    "/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/"
    "src/python/test_files/small_notebook_test/notebook2_small_notebook_test"
)

# Library file from which construct names and gRNA name/sequence pairs are extracted.
g_library_fp = (
    "/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/"
    "src/python/test_files/small_notebook_test/CV4_2spacers_w_probe_names_wo_duplicate.txt"
)

# Length the probe sequences are trimmed to, also given to the matcher as its expected length.
g_len_of_seq_to_match = 19

# Mismatch limit; the same value is applied to the forward read and to the reverse read.
g_num_allowed_mismatches = 1

# Run prefix that becomes part of each counts file name.
g_fastq_counts_run_prefix = "test_small_notebook"

# Directory the counts files are placed in.
g_fastq_counts_dir = (
    "/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/"
    "src/python/test_files/small_notebook_test/notebook3_small_notebook_test"
)

# Directory appended to sys.path so that the ccbbucsd package can be imported.
g_code_location = "/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/"

## Automated Set-Up

In [2]:
import inspect
import sys
sys.path.append(g_code_location)

import ccbbucsd.utilities.analysis_run_prefixes as ns_runs
import ccbbucsd.utilities.files_and_paths as ns_files
import ccbbucsd.utilities.notebook_logging as ns_logs


def describe_var_list(input_var_name_list):
    """Build a printable summary of the named variables.

    Args:
        input_var_name_list: iterable of strings, each the name of a variable to look up.

    Returns:
        A single string made of one "name: value" line (newline-terminated) per entry, in input order;
        the empty string when no names are given.
    """
    # NOTE: eval() is what resolves each name to its current value, so entries must be trusted,
    # developer-supplied variable names -- never text that comes from outside the notebook.
    return "".join("{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list)


ns_logs.set_stdout_info_logger()

In [3]:
# Pass the user-supplied output directory and run prefix through ns_runs.check_or_set, offering the
# filtered-fastq directory and a newly generated run prefix as the respective second arguments.
# NOTE(review): presumably check_or_set returns its second argument when the first is unset -- confirm in
# ccbbucsd.utilities.analysis_run_prefixes.
g_fastq_counts_dir = ns_runs.check_or_set(g_fastq_counts_dir, g_filtered_fastqs_dir)
g_fastq_counts_run_prefix = ns_runs.check_or_set(g_fastq_counts_run_prefix, ns_runs.generate_run_prefix())
# Echo the resolved values so they are recorded in the notebook output.
print(describe_var_list(['g_fastq_counts_run_prefix', 'g_fastq_counts_dir']))
# NOTE(review): going by its name, this creates the output directory when it does not exist yet -- confirm in
# ccbbucsd.utilities.files_and_paths.
ns_files.verify_or_make_dir(g_fastq_counts_dir)

g_fastq_counts_dir: /Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/test_files/small_notebook_test/notebook3_small_notebook_test
g_fastq_counts_run_prefix: test_small_notebook



## Construct Counting Functions

In [4]:
import ccbbucsd.malicrispr.count_filterer as ns_filter
# Print the source of get_filtered_file_suffix; its return value is the file suffix handed to the parallel
# processing call later in this notebook.
print(inspect.getsource(ns_filter.get_filtered_file_suffix))

def get_filtered_file_suffix():
    return "_len_filtered.fastq"



In [5]:
import ccbbucsd.malicrispr.construct_file_extracter as ns_extractor
# Print the whole module's source; count_constructs_for_one_fastq_pair calls its
# extract_construct_and_grna_info and trim_probes functions.
print(inspect.getsource(ns_extractor))

# third-party libraries
import pandas

# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import trim_seq

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"

_CONSTRUCT_ID = "construct_id"
_PROBE_A_SEQ = "probe_a_seq"
_PROBE_B_SEQ = "probe_b_seq"
_PROBE_A_NAME = "probe_a_id"
_PROBE_B_NAME = "probe_b_id"
_TARGET_A_NAME = "target_a_id"
_TARGET_B_NAME = "target_b_id"
_TARGET_PAIR_ID = "target_pair_id"
_PROBE_PAIR_ID = "probe_pair_id"
_HEADER_DIVIDER = "_"


def get_potential_annotation_headers():
    return [_CONSTRUCT_ID, _PROBE_A_SEQ, _PROBE_B_SEQ, _PROBE_A_NAME, _PROBE_B_NAME, _TARGET_A_NAME, _TARGET_B_NAME,
            _TARGET_PAIR_ID, _PROBE_PAIR_ID]


def get_header_divider():
    return _HEADER_DIVIDER


def get_construct_header():
    return _CONSTRUCT_ID


def get_probe_id_header(probe_letter):
    return _PROBE_A_NAME if _is_letter_a(probe_letter) else _PROBE_B_NAME


def get_probe_seq_header

In [6]:
import ccbbucsd.malicrispr.grna_position_matcher as ns_matcher
# Print the whole module's source; count_constructs_for_one_fastq_pair instantiates its GrnaPositionMatcher class.
print(inspect.getsource(ns_matcher))

# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import rev_comp_canonical_dna_seq

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"


class GrnaPositionMatcher:
    @staticmethod
    def _generate_seqs_to_check(fw_whole_seq, rv_whole_seq):
        rc_whole_rv_seq = rev_comp_canonical_dna_seq(rv_whole_seq)
        return fw_whole_seq, rc_whole_rv_seq

    def __init__(self, grna_names_and_seqs, expected_len, num_allowed_fw_mismatches, num_allowed_rv_mismatches):
        self._grna_names_and_seqs = grna_names_and_seqs
        self._num_allowed_fw_mismatches = num_allowed_fw_mismatches
        self._num_allowed_rv_mismatches = num_allowed_rv_mismatches
        self._seq_len = expected_len

    @property
    def num_allowed_fw_mismatches(self):
        return self._num_allowed_fw_mismatches

    @property
    def num_allowed_rv_mismatches(self):
        return self._num_allowed_rv_mismatches

    

In [7]:
import ccbbucsd.malicrispr.construct_counter as ns_counter
# Print the whole module's source; count_constructs_for_one_fastq_pair calls its get_counts_file_suffix and
# generate_construct_counts functions.
print(inspect.getsource(ns_counter))

"""This module counts almost-perfect matches of small sequences within forward and reverse fastq sequence pairs."""

# standard libraries
import csv
import datetime
import logging

# ccbb libraries
from ccbbucsd.utilities.basic_fastq import FastqHandler, paired_fastq_generator

# project-specific libraries
from ccbbucsd.malicrispr.construct_file_extracter import compose_probe_pair_id_from_probe_ids

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"


def get_counts_file_suffix():
    return "counts.txt"


def get_construct_header():
    return "construct_id"


def get_counter_from_names(names_to_count):
    return {x: 0 for x in names_to_count}


def generate_construct_counts(grna_matcher, construct_names, output_fp, fw_fastq_fp, rv_fastq_fp):
    counts_info_tuple = _match_and_count_constructs_from_files(grna_matcher, construct_names, fw_fastq_fp, rv_fastq_fp)
    counts_by_construct = counts_info_tuple[

In [8]:
def count_constructs_for_one_fastq_pair(curr_base, run_prefix, seq_len, num_allowed_mismatches, constructs_fp,
                                        output_dir, fw_fastq_fp, rv_fastq_fp):
    """Count library constructs for one forward/reverse fastq pair.

    Extracts the construct names and gRNA name/sequence pairs from constructs_fp, trims the probes with
    ns_extractor.trim_probes using seq_len, builds a GrnaPositionMatcher from the trimmed pairs, and passes
    that matcher to ns_counter.generate_construct_counts along with the two fastq paths and an output path
    assembled from output_dir, curr_base, run_prefix and the counts file suffix.

    The same num_allowed_mismatches value currently serves as the mismatch limit for both the forward read
    and the reverse read; give the matcher two different values here if the limits ever need to differ.

    Returns None. Whatever generate_construct_counts returns is discarded (presumably it writes the counts
    to the output path -- confirm in ccbbucsd.malicrispr.construct_counter).
    """
    library_construct_names, full_probe_pairs = ns_extractor.extract_construct_and_grna_info(constructs_fp)

    # One shared limit for both read directions (see docstring).
    fw_mismatch_limit = rv_mismatch_limit = num_allowed_mismatches
    matcher = ns_matcher.GrnaPositionMatcher(ns_extractor.trim_probes(full_probe_pairs, seq_len), seq_len,
                                             fw_mismatch_limit, rv_mismatch_limit)

    counts_fp_parts = [curr_base, run_prefix, ns_counter.get_counts_file_suffix()]
    counts_fp = ns_files.build_multipart_fp(output_dir, counts_fp_parts)
    ns_counter.generate_construct_counts(matcher, library_construct_names, counts_fp, fw_fastq_fp, rv_fastq_fp)

In [9]:
import ccbbucsd.utilities.parallel_process_fastqs as ns_parallel

# Apply count_constructs_for_one_fastq_pair to the paired fastq files in g_filtered_fastqs_dir, passing the
# filtered-file suffix and g_num_processors as the processor count.
# The list supplies the run_prefix, seq_len, num_allowed_mismatches, constructs_fp and output_dir arguments of
# count_constructs_for_one_fastq_pair, in that order.
# NOTE(review): curr_base and the forward/reverse fastq paths are presumably filled in per pair by
# parallel_process_paired_reads, and the meaning of the trailing True is defined there -- confirm in
# ccbbucsd.utilities.parallel_process_fastqs.
g_parallel_results = ns_parallel.parallel_process_paired_reads(g_filtered_fastqs_dir, 
    ns_filter.get_filtered_file_suffix(), g_num_processors, count_constructs_for_one_fastq_pair, 
    [g_fastq_counts_run_prefix, g_len_of_seq_to_match, g_num_allowed_mismatches, g_library_fp,
     g_fastq_counts_dir], True)

Starting parallel processing at 2017-03-23 14:53:15.988404
Starting A549-CV4-100-d21-1_S3_L001_001_trimmed53_len_filtered at 2017-03-23 14:53:16.005416
Starting A549-CV4-100-d21-2_S4_L001_001_trimmed53_len_filtered at 2017-03-23 14:53:16.005721
Starting A549-CV4-100-d28-1_S5_L001_001_trimmed53_len_filtered at 2017-03-23 14:53:16.006993
A549-CV4-100-d21-2_S4_L001_001_trimmed53_len_filtered elapsed time: 0:00:00
Starting A549-CV4-100-d28-2_S6_L001_001_trimmed53_len_filtered at 2017-03-23 14:53:16.742590
A549-CV4-100-d28-1_S5_L001_001_trimmed53_len_filtered elapsed time: 0:00:00
A549-CV4-100-d21-1_S3_L001_001_trimmed53_len_filtered elapsed time: 0:00:00
A549-CV4-100-d28-2_S6_L001_001_trimmed53_len_filtered elapsed time: 0:00:00
parallel processing elapsed time: 0:00:01


In [10]:
# Print the combined results of the parallel run so the outcome for each sample is recorded in the notebook.
print(ns_parallel.concatenate_parallel_results(g_parallel_results))

A549-CV4-100-d21-1_S3_L001_001_trimmed53_len_filtered: finished
A549-CV4-100-d21-2_S4_L001_001_trimmed53_len_filtered: finished
A549-CV4-100-d28-1_S5_L001_001_trimmed53_len_filtered: finished
A549-CV4-100-d28-2_S6_L001_001_trimmed53_len_filtered: finished

