# Run Analyses

Given run data, outputs PDF analysis summaries and, for each run, a directory of png outputs. 

In [1]:
import primo.analysis.run_analysis_functions
from primo.models.simulator import Simulator
import pandas as pd
from multiprocessing import Pool
import numpy as np
import os

# for looking up images
import io
import zipfile
from PIL import Image

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-vhsg9qoy because the default path (/tf/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


## Read in the csv file with the oligo sequences

In [2]:
# Load the oligo table and give the CSV's unnamed first column a real name
seq_df = (
    pd.read_csv('../03_simulation/oligos.csv')
    .rename(columns={'Unnamed: 0': 'Photo_ID'})
)

In [3]:
# Preview the oligo table (pandas truncates the display of large frames)
seq_df

Unnamed: 0,Photo_ID,FeatureSequence,ID,random_padding,Oligo
0,000002b66c9c498e,ACCGGTAAGGCACAAAAACG,TTTGCCAAGTTGGTGCACAC,CCTTCCGTAGGCGACATTTCTCGGCACGCGCTATCTTATAGTCGTC...,TACTCGCTGCGTGCAATTTATTTGCCAAGTTGGTGCACACACCGGT...
1,000002b97e5471a0,ATTTGCAAGGAACAAAAACG,CCATTGCTATCTGTCTCTAG,AGATAAGGTTCCTCGGCGCATCCGCCTAATTAACATGCCCGAACTC...,TACTCGCTGCGTGCAATTTACCATTGCTATCTGTCTCTAGATTTGC...
2,000002c707c9895e,ACCTGTAAGGCACAAAAACG,AGTCGGGATAATACGTCAAG,GCCTGTGGGCCAACACAGCCTACCGCCGATCCCACCCTCTAGTTAG...,TACTCGCTGCGTGCAATTTAAGTCGGGATAATACGTCAAGACCTGT...
3,0000048549557964,AATTGCAAGGAACAAAAACG,ATTCGGATGTGCTAAGACGA,GTCGAGAGGGATGAGCGACCAAGTGGTTCCACACTGTAAACAGCTC...,TACTCGCTGCGTGCAATTTAATTCGGATGTGCTAAGACGAAATTGC...
4,000004f4400f6ec5,ACCGGTAAGGTACAAAAACG,ACCGCAATACTTTGTATAAT,TCGTCCACAAGAGCTTACGTAGTTCCGTCAGACTGATATGCAACCC...,TACTCGCTGCGTGCAATTTAACCGCAATACTTTGTATAATACCGGT...
...,...,...,...,...,...
1743037,ffffd9716fd38279,ATTTGCAAGGAACAAAAACG,CCATTGCTATCTGTCTCTAG,AGATAAGGTTCCTCGGCGCATCCGCCTAATTAACATGCCCGAACTC...,TACTCGCTGCGTGCAATTTACCATTGCTATCTGTCTCTAGATTTGC...
1743038,ffffda81903d6bb7,AGGGGTAAGGTACAAAAACG,ATCGCGGTCGGATAAACAAA,GCCGTGCCGTACCGGCACGCAAAGTGTTCCGACGTTGATGACAACT...,TACTCGCTGCGTGCAATTTAATCGCGGTCGGATAAACAAAAGGGGT...
1743039,ffffeac7d2f37489,AATCGCAAGGAACAAAAACG,ATTCGCTGTGCTTTCGCTAC,AATACGACGAGTAGTAGACACTAATTTATCCATACTGTGTACAGCG...,TACTCGCTGCGTGCAATTTAATTCGCTGTGCTTTCGCTACAATCGC...
1743040,ffffebddbb8bba21,ACCTGTAAGGCACAGAAACG,AGGAGGGCACGACCATTTCT,CGACCACCTGAATAAGATCGCCTTGTTAGGGCTGAAGTTGGGTGGG...,TACTCGCTGCGTGCAATTTAAGGAGGGCACGACCATTTCTACCTGT...


#### Some cells to help with looking up photos in the future

In [4]:
# Unique ID sequences from the oligo table; reads are aligned against these
seqs_to_align = seq_df['ID'].unique()

# Map an integer label (the position in seqs_to_align) to each ID's DNA sequence
seqs_to_align_dict = dict(enumerate(seqs_to_align))

In [5]:
def get_image_from_zip(imgid):
    """
    Load one photo out of the zipped image archives.

    The archive and the folder inside it are both selected by the first
    character of `imgid` (`train_<c>.zip` containing `train_<c>/<imgid>.jpg`).

    Parameters
    ----------
    imgid : str
        Photo ID (presumably a value from the `Photo_ID` column of `seq_df`).

    Returns
    -------
    PIL.Image.Image
        Image opened from an in-memory copy of the JPEG bytes.

    Raises
    ------
    FileNotFoundError
        If the archive for this ID does not exist.
    KeyError
        If the archive has no entry for `imgid`.
    """
    shard = imgid[:1]
    zip_path = f'/tf/open_images/targets/images/train_{shard}.zip'
    # Close the archive as soon as the bytes are read; leaving it open leaks a
    # file handle on every lookup.
    with zipfile.ZipFile(zip_path, 'r') as zip_file:
        img_bytes = zip_file.read(f"train_{shard}/{imgid}.jpg")
    return Image.open(io.BytesIO(img_bytes))

def get_images_with_ID(ID, seq_df):
    """
    Return the `Photo_ID` values of every row in `seq_df` whose `ID` equals `ID`.

    The result is a pandas Series that keeps the original row index.
    """
    id_matches = seq_df['ID'] == ID
    return seq_df.loc[id_matches, 'Photo_ID']

## Set the baseline
This analyzes run data for the baseline pool.

In [7]:
# Alignment score threshold.
# Rule of thumb: len(alignment_seq) - 3 for queries >= 20 nt, and lower for
# shorter queries. Only sequences with an alignment score above the threshold
# are returned, and a 20 nt query can score at most 20.
SCORE_THRESH = 17

# TODO: make a MIN_READ_LEN and update relevant functions in primo/analysis/run_analysis_functions.py
# Maximum length a read may have in order to be analyzed. The sequence aligner
# often chokes on long sequences, so longer reads are ignored entirely.
MAX_READ_LEN = 500


In [8]:
# Directory of the baseline-pool sequencing run
INITIAL_POOL_DIR_PATH = '../../data/sequencing/20220907_1411_MN21390_FAU64496_dec56e17'

# Label for this run's outputs: the run folder name up to its first underscore
baseline_run_name = INITIAL_POOL_DIR_PATH.rsplit('/', 1)[-1]
date_label = baseline_run_name.partition('_')[0]

# The percent of fastq files you'd like to use for this data analysis.
# NOTE(review): this is .6 here but 100 in the similarity-search section;
# confirm which scale align_reads expects.
PRCNT_DATA = .6

In [None]:
# Unzip all fastq files from FASTQ_PASS dir of the baseline run so they can be
# read by the alignment step
print('Unzipping Fastq files')
primo.analysis.run_analysis_functions.gz_extract(INITIAL_POOL_DIR_PATH)

In [None]:
# TODO: parallelize alignment across fastq files. Disabled sketch, not wired in yet:
# files_to_analyze = primo.analysis.run_analysis_functions.list_of_fastqs_to_analyze(PRCNT_DATA)
# print(f"Analyzing {len(files_to_analyze)} files")

# pool = Pool(13, initializer=np.random.seed)
# NOTE: pool.map needs the function object itself, so it is passed without parentheses.
# pool.map(primo.analysis.run_analysis_functions.threaded_align_reads, files_to_analyze)
    

Only run the cell below if you need to perform alignment! You should only have to do this once.

In [None]:
# Align the FASTQ_PASS nanopore reads against every target ID sequence.
# NOTE- any reads aligning to more than one seq are discarded further down.
raf = primo.analysis.run_analysis_functions

print('Aligning reads to target sequences')
aligned_reads = raf.align_reads(
    seqs_to_align_dict,
    INITIAL_POOL_DIR_PATH,
    SCORE_THRESH,
    MAX_READ_LEN,
    PRCNT_DATA,
)
aligned_reads_text = f'Number of aligned reads (with duplicate reads):{len(aligned_reads)}'

# Drop reads that aligned to multiple sequences, then the stray 'index' column
print('Removing all reads with multiple alignments')
aligned_reads = raf.remove_duplicate_reads(aligned_reads).drop(['index'], axis=1)
cleaned_aligned_reads_text = f'Number of aligned reads (no duplicate reads):{len(aligned_reads)}'

# Save the cleaned alignments in this run's analysis directory
print('Writing aligned read data to csv')
baseline_data_dir = raf.create_data_dir(date_label)
csv_name = f'{baseline_data_dir}/baseline_pool_data.csv'
aligned_reads.to_csv(csv_name)

The cell below is to get you into the fastq_pass directory

In [None]:
# Move into the baseline run's fastq_pass directory. The path is built from
# INITIAL_POOL_DIR_PATH so it cannot drift from the run configured earlier.
# The path is relative, so this expects the notebook's own directory as cwd.
print(os.getcwd())
os.chdir(os.path.join(INITIAL_POOL_DIR_PATH, 'fastq_pass'))

In [None]:
raf = primo.analysis.run_analysis_functions

# Load the cleaned baseline alignments from csv. The directory name is derived
# from date_label (not a hard-coded date) so this cell follows the configured
# run. The relative path expects the cwd to be the run's fastq_pass directory.
print('Reading in .csv data')
csv_name = f'{date_label}_css_analysis/baseline_pool_data.csv'
aligned_reads = pd.read_csv(csv_name)
print(aligned_reads)
# Only the de-duplicated count can be recomputed from the csv; the count that
# includes duplicates (aligned_reads_text) is produced by the alignment cell
# and is therefore left out of the summary here.
cleaned_aligned_reads_text = f'Number of aligned reads (no duplicate reads):{len(aligned_reads)}'


print('Data is being analyzed')
# Plot number of reads each sequence got
barfig_name = raf.plot_target_freq(aligned_reads, date_label)

# Get a list of all the read lengths in the directory
all_read_lengths_in_dir = raf.get_read_lens()

# Get a descriptive string of the total number of reads in the directory
total_reads_str = raf.get_total_dir_reads(all_read_lengths_in_dir)

# Plot read length distribution for reads in FASTQ_PASS dir
len_dist_name = raf.plot_len_distribution(all_read_lengths_in_dir, date_label)
# Plot read length distribution of reads length 0 to max_read_len
len_dist_zoomed_name = raf.plot_len_distribution_zoomed(all_read_lengths_in_dir, date_label, MAX_READ_LEN)

# Make PDF and write data to it
print('Making PDF (this is a slow process, give it a minute or two)')
pdf = raf.initialize_pdf_expt0(date_label)
text = total_reads_str + '\n' + cleaned_aligned_reads_text
pdf.multi_cell(200,30, txt=text, align='L')

# Every figure is placed at the same size
default_image_width = 150
default_image_height = 110
for figure_name in (barfig_name, len_dist_name, len_dist_zoomed_name):
    pdf.image(figure_name, w=default_image_width, h=default_image_height)

pdf.output(f'{raf.create_data_dir(date_label)}/{date_label}_css_analysis_summary.pdf')


## Analyze Similarity Search Run Data
This analyzes run data where an aliquot of the baseline pool has been queried with Cas9. 

Use the cell below to get into the `data/sequencing` directory

In [None]:
# Show the current working directory so you can confirm where you are
print(os.getcwd())

In [22]:
# Run folder of the similarity-search experiment
EXPERIMENT_DIR = '20220909_1608_MN21390_FAU67022_4dc6e799'
# Label for this run's outputs: the run folder name up to its first underscore
expt_run_name = EXPERIMENT_DIR.rsplit('/', 1)[-1]
expt_date_label = expt_run_name.partition('_')[0]

# The percent of fastq files you'd like to use for this data analysis.
# NOTE(review): the baseline section sets PRCNT_DATA = .6; confirm which scale is intended.
PRCNT_DATA = 100
#TODO make MIN_READ_LEN and update functions accordingly
MAX_READ_LEN = 500

# Query as [sequence, name]; leave exactly one of these uncommented
QUERY = ['ACCGGTAAGGCACAGAAACG', 'cat'] # cat
# QUERY = ['ACCTGTAAGGCACAGAAACG', 'lego'] # lego
# QUERY = ['ATTTGCAAGGAACAAAAACG', 'building'] # building


Use the cell below to make a dataframe with the query's name and sequence (`Query_Name`, `Query_Seq`), one column for the target (`Target_Seq`, one of the unique `FeatureSequence` values in `seq_df`) and one for the predicted Cas9 cleavage score between the two (`wtCas9_Predicted_Activity`; 0 is minimum rate of cleavage, 1 is maximum rate of cleavage).

In [18]:
def get_activation_score(seq1, seq2):
    """
    Predict the Cas9 activity between two DNA sequences of length 20.

    `seq1` is handed to the simulator as the target and `seq2` as the query;
    per the original note, the order doesn't matter.

    Returns a float between 0 and 1, where 0 means minimum cleavage and
    1 means max cleavage.
    """
    pair_df = pd.DataFrame({"target_features": [seq1], "query_features": [seq2]})
    scores = Simulator().simulate(pair_df)
    # One row went in, so the first entry is this pair's score (a float)
    return scores[0]

# Every distinct cas site present in the seq_df dataframe
cas_sites = seq_df['FeatureSequence'].unique()

# Predicted activity of the query against each site, in cas_sites order
cas9_activity = [get_activation_score(site, QUERY[0]) for site in cas_sites]

# One row per target: query name, query sequence, target sequence, predicted activity
activity_rows = [
    (QUERY[1], QUERY[0], site, score)
    for site, score in zip(cas_sites, cas9_activity)
]
predicted_cas9_activity_df = pd.DataFrame(
    activity_rows,
    columns=['Query_Name', 'Query_Seq', 'Target_Seq', 'wtCas9_Predicted_Activity'],
)
    

In [19]:
# Inspect the predicted-activity table (one row per target sequence)
predicted_cas9_activity_df

Unnamed: 0,Query_Name,Query_Seq,Target_Seq,wtCas9_Predicted_Activity
0,cat,ACCGGTAAGGCACAGAAACG,ACCGGTAAGGCACAAAAACG,0.025195
1,cat,ACCGGTAAGGCACAGAAACG,ATTTGCAAGGAACAAAAACG,0.000200
2,cat,ACCGGTAAGGCACAGAAACG,ACCTGTAAGGCACAAAAACG,0.007386
3,cat,ACCGGTAAGGCACAGAAACG,AATTGCAAGGAACAAAAACG,0.000200
4,cat,ACCGGTAAGGCACAGAAACG,ACCGGTAAGGTACAAAAACG,0.002594
...,...,...,...,...
452,cat,ACCGGTAAGGCACAGAAACG,TCCGGTAAGGCACAGAAACG,1.000000
453,cat,ACCGGTAAGGCACAGAAACG,ACGGGTAAGGAACAAAAACG,0.001281
454,cat,ACCGGTAAGGCACAGAAACG,GGCGGTAAGGCACAAAAACG,0.023131
455,cat,ACCGGTAAGGCACAGAAACG,ACTTGCAAGGCACAGAAACG,0.016613


### Read in baseline pool information

In [21]:
def _find_sequencing_dir(start_dir):
    """
    Walk upward from `start_dir` until a `data/sequencing` directory is found.

    Returns the absolute path of that directory. Raises FileNotFoundError if
    the filesystem root is reached without finding one.
    """
    current = os.path.abspath(start_dir)
    while True:
        candidate = os.path.join(current, 'data', 'sequencing')
        if os.path.isdir(candidate):
            return candidate
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root
            raise FileNotFoundError(f"no 'data/sequencing' directory above {start_dir}")
        current = parent


# Make sure you're in the `data/sequencing` dir. A fixed relative hop
# ('../../data/sequencing') only works from the notebook's own directory, so
# the directory is searched for instead; this keeps the cell safe to re-run
# and to run after an earlier cell has changed the cwd.
print(os.getcwd())
os.chdir(_find_sequencing_dir(os.getcwd()))
print(os.getcwd()) # the output of this should be `/tf/primo/data/sequencing`

# Cleaned baseline-pool alignments written by the baseline section
INITIAL_POOL_INFO = '20220907_1411_MN21390_FAU64496_dec56e17/fastq_pass/20220907_css_analysis/baseline_pool_data.csv'
unmodified_df = pd.read_csv(INITIAL_POOL_INFO)

/tf/primo/notebooks/04_experiments
/tf/primo/data/sequencing


### Read in new run information and align 
You can skip the next two cells if you've already run them for this run

In [None]:
raf = primo.analysis.run_analysis_functions

# Unzip all fastq files from FASTQ_PASS dir
print('Unzipping Fastq files')
print(os.getcwd())
raf.gz_extract(EXPERIMENT_DIR)

# Align the FASTQ_PASS nanopore reads against every target ID sequence.
# NOTE- any read aligning to more than one seq will eventually be discarded
print('Aligning reads to target sequences')
aligned_reads = raf.align_reads(
    seqs_to_align_dict,
    EXPERIMENT_DIR,
    SCORE_THRESH,
    MAX_READ_LEN,
    PRCNT_DATA,
)
n_raw_alignments = len(aligned_reads)
aligned_reads_text = f'Number of aligned reads (with duplicate alignments):{n_raw_alignments}'

# Drop every read that aligned to multiple sequences
print('Removing all reads with multiple alignments')
aligned_reads = raf.remove_duplicate_reads(aligned_reads)
n_clean_alignments = len(aligned_reads)
cleaned_aligned_reads_text = f'Number of aligned reads (no duplicate alignments):{n_clean_alignments}'


In [None]:
# Save the aligned reads as csv in this run's analysis directory
print(os.getcwd())
print('Writing aligned read data to csv')
expt_data_dir = primo.analysis.run_analysis_functions.create_data_dir(expt_date_label)
csv_name = f'{expt_data_dir}/read_data.csv'
aligned_reads.to_csv(csv_name)

### Analyze run information

In [40]:
raf = primo.analysis.run_analysis_functions

print('Reading in alignment data from .csv')
csv_name = f'{expt_date_label}_css_analysis/read_data.csv'
# If you are not already in the run's fastq_pass directory, move there first:
# print(os.getcwd())
# os.chdir('fastq_pass/')
print(os.getcwd()) # this should be /tf/primo/data/sequencing/2022XXXX_minion_run_label_XXXXXXXX/fastq_pass
aligned_reads = pd.read_csv(csv_name)
print(aligned_reads)

print('Data is being analyzed')
# Bar plot of the raw number of reads each sequence got
barfig_name = raf.plot_target_freq(aligned_reads, expt_date_label)

# All read lengths in the directory (assumes you're in the fastq_pass dir),
# plus a descriptive string of the total number of reads
all_read_lengths_in_dir = raf.get_read_lens()
total_reads_str = raf.get_total_dir_reads(all_read_lengths_in_dir)

# Read length distribution for reads in FASTQ_PASS dir
len_dist_name = raf.plot_len_distribution(all_read_lengths_in_dir, expt_date_label)

# Read length distribution of reads length 0 to max_read_len
len_dist_zoomed_name = raf.plot_len_distribution_zoomed(
    all_read_lengths_in_dir,
    expt_date_label,
    MAX_READ_LEN,
)

# Enrichment scores for the baseline pool and for this run, then their bar plot
initial_ratios = raf.calculate_enrichment_scores(unmodified_df)
expt_ratios = raf.calculate_enrichment_scores(aligned_reads)
enrichment_score_barfig_name = raf.plot_es_barplot(
    expt_ratios,
    initial_ratios,
    expt_date_label,
)

# TODO 
# MAKE A BAR PLOT SHOWING THE QUERY IN A DIFFERENT COLOR
# MAKE A PLOT OF PREDICTION VS REALITY

Reading in alignment data from .csv
/tf/primo/data/sequencing/20220909_1608_MN21390_FAU67022_4dc6e799/fastq_pass
       Unnamed: 0  index                               read_id  highest_score  \
0               0      0  0af39dc5-cd1d-4f30-8dcc-b94cf03206ef           20.0   
1               1      1  df031195-1c0b-4e7e-9d7f-2144269fb2aa           19.0   
2               2      0  72f1b988-91d0-4578-a78b-6e8828604726           18.0   
3               3      0  d4fe383e-fcb6-4352-b056-a3e0994d20f0           19.0   
4               4      0  06113b27-b31c-4dd2-8177-46532f456bfb           20.0   
...           ...    ...                                   ...            ...   
10838       10842      2  19a01fdd-5834-4a0b-951d-64b092833e5e           20.0   
10839       10843      3  a49b70bf-b4a9-4ff5-b2ab-9dec687b342e           20.0   
10840       10844      4  5a7eae81-ae25-4585-8b18-1d0d0c34a0d0           18.0   
10841       10845      5  99cb93ec-9379-4b41-a857-315dca84f186           20.0

In [None]:
raf = primo.analysis.run_analysis_functions

# Make PDF and write data to it
print('Making PDF (this is a slow process, give it a minute or two)')
# The helpers live in run_analysis_functions, so they are called through the
# module; the bare names are never imported in this notebook.
# NOTE(review): assumes the module exposes initialize_pdf alongside
# initialize_pdf_expt0 — confirm.
# This report is labeled with the experiment's date, not the baseline's.
pdf = raf.initialize_pdf(expt_date_label)

# The count that includes duplicate alignments only exists if the alignment
# cell ran in this kernel, so it is included only when available. The
# de-duplicated count is recomputed from the loaded data so it always describes
# this run.
# NOTE(review): a leftover aligned_reads_text from the baseline alignment cell
# would be picked up here — confirm it came from this run.
summary_lines = [total_reads_str]
if 'aligned_reads_text' in globals():
    summary_lines.append(aligned_reads_text)
summary_lines.append(f'Number of aligned reads (no duplicate alignments):{len(aligned_reads)}')
text = '\n'.join(summary_lines)
pdf.multi_cell(200,30, txt=text, align='L')

# Every figure is placed at the same size
default_image_width = 150
default_image_height = 110
for figure_name in (barfig_name, enrichment_score_barfig_name, len_dist_name, len_dist_zoomed_name):
    pdf.image(figure_name, w=default_image_width, h=default_image_height)

# create_data_dir is called with the label only, as everywhere else in this notebook
pdf.output(f'{raf.create_data_dir(expt_date_label)}/{expt_date_label}_css_analysis_summary.pdf')