# Run Analyses

Given run data, this notebook outputs PDF analysis summaries and, for each run, a directory of PNG outputs.

In [2]:
import csv
import gzip
import os
import shutil
import sys

import matplotlib.pyplot as plt
import pandas as pd
from Bio import pairwise2
from Bio.Seq import Seq
from Bio.SeqIO import parse
# from fpdf import FPDF
from tqdm import tqdm # for progress bar

# NOTE(review): the cells below call gz_extract, csv_to_dict, align_reads, etc.
# unqualified, but this statement only binds the top-level `primo` package name.
# Confirm how those helpers are meant to be brought into scope.
import primo.analysis.run_analysis_functions

In [5]:
# Alignment score threshold. Rule of thumb: use len(alignment_seq) - 3 for
# queries of 20 nt or longer, and a smaller value for shorter queries.
# Only sequences whose alignment score is greater than the threshold are
# returned, and a 20 nt query can reach a maximum alignment score of 20.
SCORE_THRESH = 17

# Longest read that will be analyzed. The sequence aligner often chokes on
# long sequences, so any read longer than this is ignored entirely.
MAX_READ_LEN = 500

## Set the baseline
This analyzes run data for the baseline pool. You may skip this section if you have already run it before.

In [None]:
# Command-line inputs for the baseline analysis: sys.argv[1] is later passed
# to csv_to_dict(), and sys.argv[2] is the directory handed to gz_extract(),
# align_reads() and get_read_lens().
SEQS_TO_ALIGN = sys.argv[1]
INITIAL_POOL_DIR_PATH = sys.argv[2]

# The label is whatever precedes the first underscore in the directory's own
# name. normpath() drops a trailing separator first; a plain split('/')[-1]
# returns '' for a path such as 'runs/20210101_pool/', which would leave every
# output file without a label.
date_label = os.path.basename(os.path.normpath(INITIAL_POOL_DIR_PATH)).split('_')[0]

# Working directory at start-up; passed to create_data_dir() and the plot_* helpers.
CURRENT_DIR = os.getcwd()

In [None]:
# Step 1: decompress every fastq file in the FASTQ_PASS directory.
print('Unzipping Fastq files')
gz_extract(INITIAL_POOL_DIR_PATH)

# Step 2: load the target sequences (the sequences reads are aligned to)
# into a dictionary.
print('Reading in target sequences')
target_dict = csv_to_dict(SEQS_TO_ALIGN)

# Step 3: check which FASTQ_PASS nanopore reads align to any target sequence.
# NOTE: a read that aligns to more than one sequence is discarded in step 4.
print('Aligning reads to target sequences')
aligned_reads = align_reads(target_dict, INITIAL_POOL_DIR_PATH)
aligned_count = len(aligned_reads)
aligned_reads_text = f'Number of aligned reads (with duplicate reads):{aligned_count}'

# Step 4: drop every read that aligned to multiple sequences.
print('Removing all reads with multiple alignments')
aligned_reads = remove_duplicate_reads(aligned_reads)
deduplicated_count = len(aligned_reads)
cleaned_aligned_reads_text = f'Number of aligned reads (no duplicate reads):{deduplicated_count}'

# Step 5: save the remaining alignment data as a csv file.
print('Writing aligned read data to csv')
baseline_data_dir = create_data_dir(CURRENT_DIR, date_label)
csv_name = f'{baseline_data_dir}/baseline_pool_data.csv'
aligned_reads.to_csv(csv_name)

In [None]:
print('Data is being analyzed')

# Bar plot of how many reads each sequence received.
barfig_name = plot_target_freq(aligned_reads, CURRENT_DIR, date_label)

# Lengths of all reads in the directory, plus a descriptive string of the
# total number of reads.
all_read_lengths_in_dir = get_read_lens(INITIAL_POOL_DIR_PATH)
total_reads_str = get_total_dir_reads(all_read_lengths_in_dir)

# Read length distribution for the FASTQ_PASS directory: the full plot first,
# then the one restricted to lengths 0 to max_read_len.
len_dist_name = plot_len_distribution(all_read_lengths_in_dir, CURRENT_DIR, date_label)
len_dist_zoomed_name = plot_len_distribution_zoomed(all_read_lengths_in_dir, CURRENT_DIR, date_label)

# Build the PDF: the three summary lines first, then one image per plot.
print('Making PDF (this is a slow process, give it a minute or two)')
pdf = initialize_pdf_expt0(date_label)
text = '\n'.join([total_reads_str, aligned_reads_text, cleaned_aligned_reads_text])
pdf.multi_cell(200, 30, txt=text, align='L')

default_image_width = 150
default_image_height = 110
for figure_path in (barfig_name, len_dist_name, len_dist_zoomed_name):
    pdf.image(figure_path, w=default_image_width, h=default_image_height)

summary_dir = create_data_dir(CURRENT_DIR, date_label)
pdf.output(f'{summary_dir}/{date_label}_css_analysis_summary.pdf')

## Analyze Similarity Search Run Data
This analyzes run data where an aliquot of the baseline pool has been queried with Cas9.

In [None]:
# Command-line inputs for the similarity-search analysis: sys.argv[1] is later
# passed to csv_to_dict(), sys.argv[2] is read with pd.read_csv() as the
# baseline pool data, and sys.argv[3] is the directory handed to gz_extract(),
# align_reads() and get_read_lens().
SEQS_TO_ALIGN = sys.argv[1]
INITIAL_POOL_INFO = sys.argv[2]
EXPERIMENT_DIR = sys.argv[3]

# The label is whatever precedes the first underscore in the directory's own
# name. normpath() drops a trailing separator first; a plain split('/')[-1]
# returns '' for a path such as 'runs/20210101_expt/', which would leave every
# output file without a label.
date_label = os.path.basename(os.path.normpath(EXPERIMENT_DIR)).split('_')[0]

# Working directory at start-up; passed to create_data_dir() and the plot_* helpers.
CURRENT_DIR = os.getcwd()

In [None]:
# Load the previously saved baseline pool information.
unmodified_df = pd.read_csv(INITIAL_POOL_INFO)

# Step 1: decompress every fastq file in the FASTQ_PASS directory.
print('Unzipping Fastq files')
gz_extract(EXPERIMENT_DIR)

# Step 2: load the target sequences (the sequences reads are aligned to)
# into a dictionary.
print('Reading in target sequences')
target_dict = csv_to_dict(SEQS_TO_ALIGN)

# Step 3: check which FASTQ_PASS nanopore reads align to any target sequence.
# NOTE: a read that aligns to more than one sequence is discarded in step 4.
print('Aligning reads to target sequences')
aligned_reads = align_reads(target_dict, EXPERIMENT_DIR)
aligned_count = len(aligned_reads)
aligned_reads_text = f'Number of aligned reads (with duplicate alignments):{aligned_count}'

# Step 4: drop every read that aligned to multiple sequences.
print('Removing all reads with multiple alignments')
aligned_reads = remove_duplicate_reads(aligned_reads)
deduplicated_count = len(aligned_reads)
cleaned_aligned_reads_text = f'Number of aligned reads (no duplicate alignments):{deduplicated_count}'


In [None]:
print('Data is being analyzed')

# Bar plot of the raw number of reads each sequence received.
barfig_name = plot_target_freq(aligned_reads, CURRENT_DIR, date_label)

# Lengths of all reads in the directory, plus a descriptive string of the
# total number of reads.
all_read_lengths_in_dir = get_read_lens(EXPERIMENT_DIR)
total_reads_str = get_total_dir_reads(all_read_lengths_in_dir)

# Read length distribution for the FASTQ_PASS directory: the full plot first,
# then the one restricted to lengths 0 to max_read_len.
len_dist_name = plot_len_distribution(all_read_lengths_in_dir, CURRENT_DIR, date_label)
len_dist_zoomed_name = plot_len_distribution_zoomed(all_read_lengths_in_dir, CURRENT_DIR, date_label)

# Enrichment scores: computed once from the baseline pool data and once from
# this run's aligned reads, then plotted together.
initial_ratios = calculate_enrichment_scores(unmodified_df)
expt_ratios = calculate_enrichment_scores(aligned_reads)
enrichment_score_barfig_name = plot_es_barplot(
    expt_ratios,
    initial_ratios,
    CURRENT_DIR,
    date_label,
)

In [None]:
# Build the PDF: the three summary lines first, then one image per plot.
print('Making PDF (this is a slow process, give it a minute or two)')
pdf = initialize_pdf(date_label)
text = '\n'.join([total_reads_str, aligned_reads_text, cleaned_aligned_reads_text])
pdf.multi_cell(200, 30, txt=text, align='L')

default_image_width = 150
default_image_height = 110
# Images are added in the order they should appear in the report.
report_figures = (
    barfig_name,
    enrichment_score_barfig_name,
    len_dist_name,
    len_dist_zoomed_name,
)
for figure_path in report_figures:
    pdf.image(figure_path, w=default_image_width, h=default_image_height)

summary_dir = create_data_dir(CURRENT_DIR, date_label)
pdf.output(f'{summary_dir}/{date_label}_css_analysis_summary.pdf')