# Dual CRISPR Screen Analysis
# Step 1: Construct Scaffold Trimming
Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

## Instructions

To run this notebook reproducibly, follow these steps:
1. Click **Kernel** > **Restart & Clear Output**
2. When prompted, click the red **Restart & clear all outputs** button
3. Fill in the values for your analysis for each of the variables in the [Input Parameters](#Input-Parameters) section
4. Click **Cell** > **Run All**

## Input Parameters

In [1]:
# Processor count handed to the parallel paired-read processing step.
g_num_processors = 3

# Directory holding the input .fastq.gz files; they are gunzipped and flattened in place.
g_fastqs_dir = (
    '/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/'
    'src/python/test_files/known_goods/fastq_gz_20160706_HeLa_A549_CV4'
)

# Directory for the trimmed fastq output (passed through ns_runs.check_or_set before use).
g_trimmed_fastqs_dir = (
    '/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/'
    'src/python/test_files/test_outputs/notebook1_20160706_HeLa_A549_CV4'
)

# Scaffold sequences given to cutadapt as linked adapters (5p...3p) for the forward (r1) reads.
g_full_5p_r1 = 'TATATATCTTGTGGAAAGGACGAAACACCG'
g_full_3p_r1 = 'GTTTCAGAGCTATGCTGGAAACTGCATAGCAAGTTGAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTACTGAG'

# Scaffold sequences given to cutadapt as linked adapters (5p...3p) for the reverse (r2) reads.
g_full_5p_r2 = 'CCTTATTTTAACTTGCTATTTCTAGCTCTAAAAC'
g_full_3p_r2 = 'CAAACAAGGCTTTTCTCCAAGGGATATTTATAGTCTCAAAACACACAATTACTTTACAGTTAGGGTGAGTTTCCTTTTGTGCTGTTTTTTAAAATA'

# Whether to keep the .gz files alongside the expanded fastqs.
# True only works for gzip 1.6+ (apparently not available on AWS linux).
g_keep_gzs = False

# Appended to sys.path so that the ccbbucsd package can be imported.
g_code_location = '/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/'

## Automated Set-Up

In [3]:
import inspect
import sys
sys.path.append(g_code_location)

import ccbbucsd.utilities.analysis_run_prefixes as ns_runs
import ccbbucsd.utilities.files_and_paths as ns_files
import ccbbucsd.utilities.notebook_logging as ns_logs


def describe_var_list(input_var_name_list):
    """Build a report with one "name: value" line per requested variable.

    Args:
        input_var_name_list: iterable of strings; each string is evaluated with
            eval() to obtain the value reported next to it (in this notebook the
            strings are names of global variables).

    Returns:
        One string containing a newline-terminated "name: value" entry for every
        input string, in input order; the empty string if no names are given.
    """
    # NOTE: eval() executes whatever text it is handed, so only pass trusted,
    # hard-coded names -- never strings that originate outside the notebook.
    return "".join("{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list)


# Configure logging through the ccbbucsd notebook_logging helper (presumably sends
# INFO-level messages to stdout so they appear in cell output -- confirm in that module).
ns_logs.set_stdout_info_logger()

In [4]:
# Resolve the output directory from the configured value and the input directory
# (the exact fallback rule is defined by check_or_set in analysis_run_prefixes -- confirm there).
# NOTE: this overwrites the g_trimmed_fastqs_dir input parameter with the resolved value.
g_trimmed_fastqs_dir = ns_runs.check_or_set(g_trimmed_fastqs_dir, g_fastqs_dir)
# Echo the resolved path so it is recorded in the notebook output.
print(describe_var_list(['g_trimmed_fastqs_dir']))
# Presumably creates the directory if it does not exist yet -- confirm in files_and_paths.
ns_files.verify_or_make_dir(g_trimmed_fastqs_dir)

g_trimmed_fastqs_dir: /Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/test_files/known_outputs/notebook1_20160706_HeLa_A549_CV4



## Scaffold Trimming Functions

In [20]:
# Print the source of the scaffold_trim module so the notebook output records
# the trimming code that this run used.
import ccbbucsd.malicrispr.scaffold_trim as trim
print(inspect.getsource(trim))

# standard libraries
import enum

# third-party libraries
import cutadapt.scripts.cutadapt

# ccbb libraries
from ccbbucsd.utilities.files_and_paths import get_file_name_pieces, make_file_path

__author__ = 'Amanda Birmingham'
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"


class TrimType(enum.Enum):
    FIVE = "5"
    THREE = "3"
    FIVE_THREE = "53"


def get_trimmed_suffix(trimtype):
    return "_trimmed{0}.fastq".format(trimtype.value)


def trim_linked_scaffold(output_dir, fastq_fp, scaffold_seq_5p, scaffold_seq_3p, quiet=True):
    args = ["-a", "{0}...{1}".format(scaffold_seq_5p,scaffold_seq_3p)]
    return _run_cutadapt(output_dir, fastq_fp, TrimType.FIVE_THREE, args, quiet)


def trim_global_scaffold(output_dir, fastq_fp, scaffold_seq_5p=None, scaffold_seq_3p=None, quiet=True):
    curr_fastq_fp = fastq_fp

    if scaffold_seq_5p is not None:
        curr_fastq_fp = _run_cutadapt_global(output_dir, curr_fastq_fp, scaffold_seq

In [15]:
def trim_fw_and_rv_reads(output_dir, full_5p_r1, full_3p_r1, full_5p_r2, full_3p_r2, fw_fastq_fp, rv_fastq_fp):
    """Run linked-scaffold trimming on one forward/reverse pair of fastq files.

    Args:
        output_dir: directory passed to trim.trim_linked_scaffold as its output location.
        full_5p_r1: 5' scaffold sequence used for the forward-read file.
        full_3p_r1: 3' scaffold sequence used for the forward-read file.
        full_5p_r2: 5' scaffold sequence used for the reverse-read file.
        full_3p_r2: 3' scaffold sequence used for the reverse-read file.
        fw_fastq_fp: path of the forward-read fastq file.
        rv_fastq_fp: path of the reverse-read fastq file.

    Returns:
        None; the values returned by trim.trim_linked_scaffold are discarded.
    """
    # The forward file is processed first, then the reverse file, each with its own scaffold pair.
    per_read_inputs = (
        (fw_fastq_fp, full_5p_r1, full_3p_r1),
        (rv_fastq_fp, full_5p_r2, full_3p_r2),
    )
    for fastq_fp, scaffold_5p, scaffold_3p in per_read_inputs:
        trim.trim_linked_scaffold(output_dir, fastq_fp, scaffold_5p, scaffold_3p)

## Gzipped FASTQ Filenames

In [9]:
# Extension of the uncompressed sequence files handled by this notebook.
g_seq_file_ext_name = ".fastq"
# Extension appended to the sequence-file extension for the gzipped inputs (".fastq" + ".gz").
g_gzip_ext_name = ".gz"

In [10]:
# List the input files whose names end in ".fastq.gz" (empty prefix = no prefix filter);
# all_subdirs=True presumably extends the search to subdirectories -- confirm in files_and_paths.
print(ns_files.summarize_filenames_for_prefix_and_suffix(g_fastqs_dir, "", 
                                                "{0}{1}".format(g_seq_file_ext_name, g_gzip_ext_name), 
                                                all_subdirs=True))

A549-CV4-d21-1_S3_L001_R1_001.fastq.gz
A549-CV4-d21-1_S3_L001_R2_001.fastq.gz
A549-CV4-d21-1_S3_L002_R1_001.fastq.gz
A549-CV4-d21-1_S3_L002_R2_001.fastq.gz
A549-CV4-d21-2_S4_L001_R1_001.fastq.gz
A549-CV4-d21-2_S4_L001_R2_001.fastq.gz
A549-CV4-d21-2_S4_L002_R1_001.fastq.gz
A549-CV4-d21-2_S4_L002_R2_001.fastq.gz
A549-CV4-d28-1_S5_L001_R1_001.fastq.gz
A549-CV4-d28-1_S5_L001_R2_001.fastq.gz
A549-CV4-d28-1_S5_L002_R1_001.fastq.gz
A549-CV4-d28-1_S5_L002_R2_001.fastq.gz
A549-CV4-d28-2_S6_L001_R1_001.fastq.gz
A549-CV4-d28-2_S6_L001_R2_001.fastq.gz
A549-CV4-d28-2_S6_L002_R1_001.fastq.gz
A549-CV4-d28-2_S6_L002_R2_001.fastq.gz


## FASTQ Gunzip Execution

In [11]:
import ccbbucsd.utilities.files_and_paths as ns_files

def unzip_and_flatten_seq_files(top_fastqs_dir, ext_name, gzip_ext_name, keep_gzs):
    """Gunzip every compressed sequence file under a directory, then flatten the results.

    Args:
        top_fastqs_dir: top-level directory that is searched, and that also receives
            the expanded files.
        ext_name: extension of the uncompressed sequence files (e.g. ".fastq").
        gzip_ext_name: extension added by compression (e.g. ".gz").
        keep_gzs: forwarded to ns_files.gunzip_wildpath; False = don't keep the gz
            files as well as expanding, True = do keep them.

    Returns:
        None.
    """
    gzipped_suffix = ext_name + gzip_ext_name
    search_recursively = True

    # Step 1: recursively expand every matching gzipped file anywhere under the input dir.
    ns_files.gunzip_wildpath(top_fastqs_dir, gzipped_suffix, keep_gzs, search_recursively)

    # Step 2: move the expanded files up to the top-level directory so later steps
    # do not have to work recursively.
    ns_files.move_to_dir_and_flatten(top_fastqs_dir, top_fastqs_dir, ext_name)

In [12]:
# Expand the gzipped fastqs under g_fastqs_dir and flatten them into that same directory.
# g_keep_gzs: False = don't keep gzs as well as expanding, True = do keep them (True only works for gzip 1.6+)
unzip_and_flatten_seq_files(g_fastqs_dir, g_seq_file_ext_name, g_gzip_ext_name, g_keep_gzs)  

## FASTQ Filenames

In [13]:
# List the expanded ".fastq" files now in the input directory (empty prefix = no prefix filter).
print(ns_files.summarize_filenames_for_prefix_and_suffix(g_fastqs_dir, "", g_seq_file_ext_name))

A549-CV4-d21-1_S3_L001_R1_001.fastq
A549-CV4-d21-1_S3_L001_R2_001.fastq
A549-CV4-d21-1_S3_L002_R1_001.fastq
A549-CV4-d21-1_S3_L002_R2_001.fastq
A549-CV4-d21-2_S4_L001_R1_001.fastq
A549-CV4-d21-2_S4_L001_R2_001.fastq
A549-CV4-d21-2_S4_L002_R1_001.fastq
A549-CV4-d21-2_S4_L002_R2_001.fastq
A549-CV4-d28-1_S5_L001_R1_001.fastq
A549-CV4-d28-1_S5_L001_R2_001.fastq
A549-CV4-d28-1_S5_L002_R1_001.fastq
A549-CV4-d28-1_S5_L002_R2_001.fastq
A549-CV4-d28-2_S6_L001_R1_001.fastq
A549-CV4-d28-2_S6_L001_R2_001.fastq
A549-CV4-d28-2_S6_L002_R1_001.fastq
A549-CV4-d28-2_S6_L002_R2_001.fastq


## Scaffold Trim Execution

In [17]:
# Run trim_fw_and_rv_reads over the paired fastq files in g_fastqs_dir using g_num_processors.
# The list supplies trim_fw_and_rv_reads' leading arguments (output dir and the four scaffold
# sequences); the forward/reverse file paths are presumably supplied per pair by
# parallel_process_paired_reads -- confirm in parallel_process_fastqs.
import ccbbucsd.utilities.parallel_process_fastqs as ns_parallel
g_parallel_results = ns_parallel.parallel_process_paired_reads(g_fastqs_dir, g_seq_file_ext_name, g_num_processors, 
                                                   trim_fw_and_rv_reads, [g_trimmed_fastqs_dir, g_full_5p_r1, 
                                                                          g_full_3p_r1, g_full_5p_r2, g_full_3p_r2])

Starting parallel processing at 2017-03-10 15:58:27.581617
Starting A549-CV4-d21-1_S3_L002_001 at 2017-03-10 15:58:27.663852
Starting A549-CV4-d21-2_S4_L001_001 at 2017-03-10 15:58:27.665574
Starting A549-CV4-d21-1_S3_L001_001 at 2017-03-10 15:58:27.665756
This is cutadapt 1.10 with Python 3.4.5
This is cutadapt 1.10 with Python 3.4.5
This is cutadapt 1.10 with Python 3.4.5
Command line parameters: -a TATATATCTTGTGGAAAGGACGAAACACCG...GTTTCAGAGCTATGCTGGAAACTGCATAGCAAGTTGAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTACTGAG --quiet -o /Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/test_files/known_outputs/notebook1_20160706_HeLa_A549_CV4/A549-CV4-d21-2_S4_L001_R1_001_trimmed53.fastq /Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/test_files/known_inputs/notebook1_20160706_HeLa_A549_CV4/A549-CV4-d21-2_S4_L001_R1_001.fastq
Command line parameters: -a TATATATCTTGTGGAAAGGACGAAACACCG...GTTTCAGAGC

In [18]:
# Print the combined results collected from the parallel trimming run.
print(ns_parallel.concatenate_parallel_results(g_parallel_results))

A549-CV4-d21-1_S3_L001_001: finished
A549-CV4-d21-1_S3_L002_001: finished
A549-CV4-d21-2_S4_L001_001: finished
A549-CV4-d21-2_S4_L002_001: finished
A549-CV4-d28-1_S5_L001_001: finished
A549-CV4-d28-1_S5_L002_001: finished
A549-CV4-d28-2_S6_L001_001: finished
A549-CV4-d28-2_S6_L002_001: finished



## Trimmed FASTQ Filenames

In [21]:
# List the output files whose names end in the suffix that scaffold_trim gives to
# 5'+3' (linked) trimming results, i.e. "_trimmed53.fastq".
print(ns_files.summarize_filenames_for_prefix_and_suffix(g_trimmed_fastqs_dir, "", 
    trim.get_trimmed_suffix(trim.TrimType.FIVE_THREE)))

A549-CV4-d21-1_S3_L001_R1_001_trimmed53.fastq
A549-CV4-d21-1_S3_L001_R2_001_trimmed53.fastq
A549-CV4-d21-1_S3_L002_R1_001_trimmed53.fastq
A549-CV4-d21-1_S3_L002_R2_001_trimmed53.fastq
A549-CV4-d21-2_S4_L001_R1_001_trimmed53.fastq
A549-CV4-d21-2_S4_L001_R2_001_trimmed53.fastq
A549-CV4-d21-2_S4_L002_R1_001_trimmed53.fastq
A549-CV4-d21-2_S4_L002_R2_001_trimmed53.fastq
A549-CV4-d28-1_S5_L001_R1_001_trimmed53.fastq
A549-CV4-d28-1_S5_L001_R2_001_trimmed53.fastq
A549-CV4-d28-1_S5_L002_R1_001_trimmed53.fastq
A549-CV4-d28-1_S5_L002_R2_001_trimmed53.fastq
A549-CV4-d28-2_S6_L001_R1_001_trimmed53.fastq
A549-CV4-d28-2_S6_L001_R2_001_trimmed53.fastq
A549-CV4-d28-2_S6_L002_R1_001_trimmed53.fastq
A549-CV4-d28-2_S6_L002_R2_001_trimmed53.fastq
