# Dual CRISPR Screen Analysis
# Step 2: Construct Filter
Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

## Instructions

To run this notebook reproducibly, follow these steps:
1. Click **Kernel** > **Restart & Clear Output**
2. When prompted, click the red **Restart & clear all outputs** button
3. Fill in the values for your analysis for each of the variables in the [Input Parameters](#Input-Parameters) section
4. Click **Cell** > **Run All**

## Input Parameters

In [1]:
# Number of processors, passed to ns_parallel.parallel_process_paired_reads below.
g_num_processors = 3
# Directory that is searched for the trimmed fastq files to be filtered.
# NOTE: this and the other paths here are machine-specific; set them for your own environment.
g_trimmed_fastqs_dir = ('/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/'
    'test_files/known_goods/trimmed_fastq_20160706_HeLa_A549_CV4')
# Output directory for this notebook; it is resolved through ns_runs.check_or_set during set-up
# and then handed to fltr.filter_pair_by_len as its output_dir argument.
g_filtered_fastas_dir = ('/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/'
    'test_files/test_outputs/notebook2_20160706_HeLa_A549_CV4')
# Supplied to fltr.filter_pair_by_len as min_len.
g_min_trimmed_grna_len = 19
# Supplied to fltr.filter_pair_by_len as max_len.
g_max_trimmed_grna_len = 21
# Supplied to fltr.filter_pair_by_len as retain_len.
g_len_of_seq_to_match = 19
# Appended to sys.path during set-up so that the ccbbucsd package can be imported.
g_code_location = '/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/'

## Automated Set-Up

In [2]:
import inspect
import sys
sys.path.append(g_code_location)

import ccbbucsd.utilities.analysis_run_prefixes as ns_runs
import ccbbucsd.utilities.files_and_paths as ns_files
import ccbbucsd.utilities.notebook_logging as ns_logs


def describe_var_list(input_var_name_list):
    """Build a printable "name: value" listing for the named variables.

    Each entry of input_var_name_list is a string that is passed to eval()
    to obtain the value shown, so entries must be trusted variable names
    written in the notebook itself (never text taken from outside it).

    Returns one string holding a "name: value" line per entry, each ending
    in a newline; an empty list yields an empty string.
    """
    return "".join(
        "{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list
    )


# Configure logging through the project helper; presumably this routes INFO-level
# messages to stdout so they appear in cell output (confirm in notebook_logging).
ns_logs.set_stdout_info_logger()

In [4]:
# Resolve the output directory through the project helper, giving it the configured value
# and the input directory (presumably the latter is used to derive a default when no
# value was set -- confirm in analysis_run_prefixes).
g_filtered_fastas_dir = ns_runs.check_or_set(g_filtered_fastas_dir, g_trimmed_fastqs_dir)
# Echo the resolved value so the saved notebook records which directory was used.
print(describe_var_list(['g_filtered_fastas_dir']))
# Presumably creates the directory when it does not exist yet (confirm in files_and_paths).
ns_files.verify_or_make_dir(g_filtered_fastas_dir)

g_filtered_fastas_dir: /Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/test_files/test_outputs/notebook2_20160706_HeLa_A549_CV4



## Construct Filtering Functions

In [5]:
# Load the trimming module (used below for the trimmed-file suffix) and print its source
# so the saved notebook records the exact code this run depended on.
import ccbbucsd.malicrispr.scaffold_trim as trim
print(inspect.getsource(trim))

# standard libraries
import enum

# third-party libraries
import cutadapt.scripts.cutadapt

# ccbb libraries
from ccbbucsd.utilities.files_and_paths import get_file_name_pieces, make_file_path

__author__ = 'Amanda Birmingham'
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"


class TrimType(enum.Enum):
    FIVE = "5"
    THREE = "3"
    FIVE_THREE = "53"


def get_trimmed_suffix(trimtype):
    return "_trimmed{0}.fastq".format(trimtype.value)


def trim_linked_scaffold(output_dir, fastq_fp, scaffold_seq_5p, scaffold_seq_3p, quiet=True):
    args = ["-a", "{0}...{1}".format(scaffold_seq_5p,scaffold_seq_3p)]
    return _run_cutadapt(output_dir, fastq_fp, TrimType.FIVE_THREE, args, quiet)


def trim_global_scaffold(output_dir, fastq_fp, scaffold_seq_5p=None, scaffold_seq_3p=None, quiet=True):
    curr_fastq_fp = fastq_fp

    if scaffold_seq_5p is not None:
        curr_fastq_fp = _run_cutadapt_global(output_dir, curr_fastq_fp, scaffold_seq

In [6]:
# Load the module that provides filter_pair_by_len and print its source so the saved
# notebook records the exact filtering code this run depended on.
import ccbbucsd.malicrispr.count_filterer as fltr
print(inspect.getsource(fltr))

# standard libraries
import logging

# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import trim_seq
from ccbbucsd.utilities.basic_fastq import FastqHandler, paired_fastq_generator
from ccbbucsd.utilities.files_and_paths import transform_path

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"


def get_filtered_file_suffix():
    return "_len_filtered.fastq"


def filter_pair_by_len(min_len, max_len, retain_len, output_dir, fw_fastq_fp, rv_fastq_fp):
    fw_fastq_handler = FastqHandler(fw_fastq_fp)
    rv_fastq_handler = FastqHandler(rv_fastq_fp)
    fw_out_handle, rv_out_handle = _open_output_file_pair(fw_fastq_fp, rv_fastq_fp, output_dir)
    counters = {"num_pairs": 0, "num_pairs_passing": 0}

    filtered_fastq_records = _filtered_fastq_generator(fw_fastq_handler, rv_fastq_handler, min_len, max_len, retain_len,
                                                       counters)
    for fw_reco

In [7]:
import ccbbucsd.utilities.parallel_process_fastqs as ns_parallel

# Run fltr.filter_pair_by_len over the files in g_trimmed_fastqs_dir that carry the
# "_trimmed53.fastq" suffix, using g_num_processors processors.
# The trailing list lines up with the first four parameters of filter_pair_by_len
# (min_len, max_len, retain_len, output_dir); presumably the helper supplies the
# forward and reverse fastq paths for each pair itself -- confirm in parallel_process_fastqs.
g_parallel_results = ns_parallel.parallel_process_paired_reads(g_trimmed_fastqs_dir, 
    trim.get_trimmed_suffix(trim.TrimType.FIVE_THREE), g_num_processors, 
    fltr.filter_pair_by_len, [g_min_trimmed_grna_len, g_max_trimmed_grna_len, 
    g_len_of_seq_to_match, g_filtered_fastas_dir])

Starting parallel processing at 2017-03-11 13:41:54.256524
Starting A549-CV4-d21-1_S3_L002_001_trimmed53 at 2017-03-11 13:41:54.331087
Starting A549-CV4-d21-1_S3_L001_001_trimmed53 at 2017-03-11 13:41:54.331089
Starting A549-CV4-d21-2_S4_L001_001_trimmed53 at 2017-03-11 13:41:54.333641
A549-CV4-d21-2_S4_L001_001_trimmed53 elapsed time: 0:07:49
Starting A549-CV4-d21-2_S4_L002_001_trimmed53 at 2017-03-11 13:49:43.833308
A549-CV4-d21-1_S3_L001_001_trimmed53 elapsed time: 0:07:53
Starting A549-CV4-d28-1_S5_L001_001_trimmed53 at 2017-03-11 13:49:47.438210
A549-CV4-d21-1_S3_L002_001_trimmed53 elapsed time: 0:07:53
Starting A549-CV4-d28-1_S5_L002_001_trimmed53 at 2017-03-11 13:49:47.857085
A549-CV4-d21-2_S4_L002_001_trimmed53 elapsed time: 0:07:22
Starting A549-CV4-d28-2_S6_L001_001_trimmed53 at 2017-03-11 13:57:06.810958
A549-CV4-d28-1_S5_L001_001_trimmed53 elapsed time: 0:08:44
Starting A549-CV4-d28-2_S6_L002_001_trimmed53 at 2017-03-11 13:58:31.515856
A549-CV4-d28-1_S5_L002_001_trimmed53 e

In [8]:
# Print the combined per-file results; the recorded output shows one line per input file
# with its num_pairs and num_pairs_passing counts.
print(ns_parallel.concatenate_parallel_results(g_parallel_results))

A549-CV4-d21-1_S3_L001_001_trimmed53: num_pairs:13985001,num_pairs_passing:11090180
A549-CV4-d21-1_S3_L002_001_trimmed53: num_pairs:13985929,num_pairs_passing:11059401
A549-CV4-d21-2_S4_L001_001_trimmed53: num_pairs:13860989,num_pairs_passing:11013990
A549-CV4-d21-2_S4_L002_001_trimmed53: num_pairs:14020200,num_pairs_passing:11113434
A549-CV4-d28-1_S5_L001_001_trimmed53: num_pairs:16699890,num_pairs_passing:13303189
A549-CV4-d28-1_S5_L002_001_trimmed53: num_pairs:16920809,num_pairs_passing:13448977
A549-CV4-d28-2_S6_L001_001_trimmed53: num_pairs:20070590,num_pairs_passing:15414999
A549-CV4-d28-2_S6_L002_001_trimmed53: num_pairs:20197369,num_pairs_passing:15481427

