# Dual CRISPR Screen Analysis
# Step 4: Count Combination
Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

## Instructions

To run this notebook reproducibly, follow these steps:
1. Click **Kernel** > **Restart & Clear Output**
2. When prompted, click the red **Restart & clear all outputs** button
3. In the [Input Parameters](#Input-Parameters) section, set each variable to the value appropriate for your analysis
4. Click **Cell** > **Run All**

## Input Parameters

In [1]:
# Dataset name; the set-up cell passes it to ns_runs.generate_run_prefix to
# build a default run prefix.
g_dataset_name = 'small_notebook_test'

# Run prefix and directory of the input count files that this notebook reads.
g_fastq_counts_run_prefix = 'test_small_notebook'
g_fastq_counts_dir = (
    "/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/"
    "mali-dual-crispr-pipeline/src/python/"
    "test_files/small_notebook_test/notebook3_small_notebook_test"
)

# Run prefix and directory used for the collapsed count files.
g_collapsed_counts_run_prefix = 'test_small_notebook'
g_collapsed_counts_dir = (
    "/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/"
    "mali-dual-crispr-pipeline/src/python/"
    "test_files/small_notebook_test/notebook4_small_notebook_test"
)

# Directory and run prefix used for the combined counts file. Both are left
# empty here; the set-up cell replaces them with the collapsed-counts values
# through ns_runs.check_or_set.
g_combined_counts_dir = ''
g_combined_counts_run_prefix = ''

# Directory appended to sys.path by the set-up cell before the ccbbucsd imports.
g_code_location = (
    "/Users/Birmingham/Work/Repositories/ccbb_tickets_2017/"
    "mali-dual-crispr-pipeline/src/python/"
)

## Automated Set-Up

In [2]:
import inspect
import sys
# Add g_code_location (set in the Input Parameters cell) to the module search
# path. This has to run before the ccbbucsd imports below.
# NOTE(review): presumably g_code_location is the directory that contains the
# ccbbucsd package -- confirm.
sys.path.append(g_code_location)

# Project modules, aliased with an ns_ prefix and used by the cells that follow.
import ccbbucsd.utilities.analysis_run_prefixes as ns_runs
import ccbbucsd.utilities.files_and_paths as ns_files
import ccbbucsd.utilities.notebook_logging as ns_logs


def describe_var_list(input_var_name_list):
    """Build a "name: value" report for a list of notebook-level variables.

    Args:
        input_var_name_list: iterable of strings, each the name of (or an
            expression over) variables defined in this module's global
            namespace.

    Returns:
        One string holding a newline-terminated "<name>: <value>" line per
        entry, in input order; the empty string when the list is empty.

    Raises:
        NameError: if an entry refers to a name that is not defined.
    """
    # Evaluate against the module globals only. A bare eval(name) consults the
    # enclosing local scope first, so an entry such as "name" would be resolved
    # to the loop variable rather than to the notebook variable being described.
    # NB: eval executes arbitrary expressions; pass only trusted, hard-coded names.
    module_namespace = globals()
    return "".join(
        "{0}: {1}\n".format(name, eval(name, module_namespace))
        for name in input_var_name_list
    )


# Configure logging through the project's notebook_logging helper.
# NOTE(review): judging by its name, this routes INFO-level log messages to
# stdout so they appear in cell output -- confirm in notebook_logging.
ns_logs.set_stdout_info_logger()

In [3]:
# Resolve each output setting with ns_runs.check_or_set(value, default).
# Going by the values this cell prints, a non-empty value is kept and an empty
# string is replaced by the default.
# NOTE(review): confirm the exact rule in analysis_run_prefixes.
# Order matters: the combined-counts settings default to the collapsed-counts
# settings, so the collapsed ones must be resolved first.
g_collapsed_counts_run_prefix = ns_runs.check_or_set(g_collapsed_counts_run_prefix, 
                                                     ns_runs.generate_run_prefix(g_dataset_name))
g_collapsed_counts_dir = ns_runs.check_or_set(g_collapsed_counts_dir, g_fastq_counts_dir)
g_combined_counts_run_prefix = ns_runs.check_or_set(g_combined_counts_run_prefix, g_collapsed_counts_run_prefix)
g_combined_counts_dir = ns_runs.check_or_set(g_combined_counts_dir, g_collapsed_counts_dir)

# Show the resolved settings so the run is self-documenting.
print(describe_var_list(['g_collapsed_counts_run_prefix','g_collapsed_counts_dir',
                         'g_combined_counts_run_prefix','g_combined_counts_dir']))
# NOTE(review): presumably creates each output directory if it does not exist
# yet -- confirm in files_and_paths.
ns_files.verify_or_make_dir(g_collapsed_counts_dir)
ns_files.verify_or_make_dir(g_combined_counts_dir)

g_collapsed_counts_run_prefix: test_small_notebook
g_collapsed_counts_dir: /Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/test_files/small_notebook_test/notebook4_small_notebook_test
g_combined_counts_run_prefix: test_small_notebook
g_combined_counts_dir: /Users/Birmingham/Work/Repositories/ccbb_tickets_2017/mali-dual-crispr-pipeline/src/python/test_files/small_notebook_test/notebook4_small_notebook_test



## Count Combination Functions

In [4]:
# Import the construct counter module and print the source of the function that
# supplies the input count-file suffix, so the notebook records the code used.
import ccbbucsd.malicrispr.construct_counter as ns_counter
print(inspect.getsource(ns_counter.get_counts_file_suffix))

def get_counts_file_suffix():
    """Return the filename suffix used for count files."""
    counts_suffix = "counts.txt"
    return counts_suffix



In [5]:
# Import the count combination module and print its full source, so the
# notebook records the code that the execution cells below call.
import ccbbucsd.malicrispr.count_combination as ns_combine
print(inspect.getsource(ns_combine))

# ccbb libraries
from ccbbucsd.utilities.analysis_run_prefixes import strip_run_prefix
from ccbbucsd.utilities.files_and_paths import build_multipart_fp, group_files, get_filepaths_by_prefix_and_suffix

# project-specific libraries
import ccbbucsd.malicrispr.count_files_and_dataframes as ns_counts

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"


def get_collapsed_counts_file_suffix():
    """Return the filename suffix used for collapsed count files."""
    collapsed_suffix = "collapsed.txt"
    return collapsed_suffix


def get_combined_counts_file_suffix():
    """Return the filename suffix used for the combined counts file."""
    combined_suffix = "counts_combined.txt"
    return combined_suffix


def group_lane_and_set_files(filepaths):
    """Group file paths with group_files, keyed on the lane/set designator.

    Args:
        filepaths: the file paths to group; handed to group_files unchanged.

    Returns:
        Whatever group_files returns for these paths.
    """
    # NB: this regex assumes read designator has *already* been removed
    # and replaced with _ as done by group_read_pairs
    # The pattern matches designators such as "_L001_001". It is written as a
    # raw string because "\d" in a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about; the value is unchanged.
    lane_and_set_regex = r"_L\d\d\d_\d\d\d"
    # NOTE(review): the empty third argument is presumably the replacement for
    # the matched text -- confirm against group_files.
    return group_files(filepaths, lane_and_set_regex, "")


def combine_count_files(counts_fp_for_dataset, run_prefix):
    combined_df = None

    for curr_counts_fp in counts_fp_for_dataset:
        count_header, curr_counts_df = ns_counts.get_counts_df(cur

## Input Count Filenames

In [6]:
# List the input count files: those in g_fastq_counts_dir selected by the
# fastq-counts run prefix and the count-file suffix.
# NOTE(review): the exact matching rule lives in files_and_paths -- confirm.
print(ns_files.summarize_filenames_for_prefix_and_suffix(g_fastq_counts_dir, g_fastq_counts_run_prefix, 
                                                         ns_counter.get_counts_file_suffix()))

A549-CV4-100-d21-1_S3_L001_001_trimmed53_len_filtered_test_small_notebook_counts.txt
A549-CV4-100-d21-2_S4_L001_001_trimmed53_len_filtered_test_small_notebook_counts.txt
A549-CV4-100-d28-1_S5_L001_001_trimmed53_len_filtered_test_small_notebook_counts.txt
A549-CV4-100-d28-2_S6_L001_001_trimmed53_len_filtered_test_small_notebook_counts.txt


## Count Combination Execution

In [7]:
# Produce the collapsed count files from the input count files.
# NOTE(review): the source of write_collapsed_count_files is not visible in this
# notebook's output. The argument roles (input dir, output dir, output run
# prefix, input run prefix, input suffix, output suffix) are inferred from the
# variable names -- confirm against count_combination.
ns_combine.write_collapsed_count_files(g_fastq_counts_dir, g_collapsed_counts_dir, g_collapsed_counts_run_prefix, 
                            g_fastq_counts_run_prefix, ns_counter.get_counts_file_suffix(), 
                            ns_combine.get_collapsed_counts_file_suffix())

In [8]:
# Produce the combined counts file from the collapsed count files.
# NOTE(review): the source of write_combined_count_file is not visible in this
# notebook's output. The argument roles (input dir, output dir, input run
# prefix, output run prefix, input suffix, output suffix) are inferred from the
# variable names -- confirm against count_combination.
ns_combine.write_combined_count_file(g_collapsed_counts_dir, g_combined_counts_dir, g_collapsed_counts_run_prefix, 
                          g_combined_counts_run_prefix, ns_combine.get_collapsed_counts_file_suffix(), 
                          ns_combine.get_combined_counts_file_suffix())