<a id='0.1'></a>
## 0.1 load required packages

In [1]:
%run "..\..\Startup_py3.py"
sys.path.append(r"..\..\..\..\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

# library design specific tools
from ImageAnalysis3.library_tools import LibraryDesigner as ld
from ImageAnalysis3.library_tools import LibraryTools as lt
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

23484


In [2]:
# Pool directory: root folder (UNC network path) that holds one sub-folder per sub-library
pool_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain'
# Reuse the folder when it is already there, otherwise create it (including parents).
if os.path.exists(pool_folder):
    print(f"Use pool_folder: {pool_folder}")
else:
    print(f"Create pool_folder: {pool_folder}")
    os.makedirs(pool_folder)

Use pool_folder: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain


In [3]:
# All merged / summary outputs of this notebook are written into this sub-folder of the pool.
summary_folder = os.path.join(pool_folder, 'Summary_files')
if os.path.exists(summary_folder):
    print(f"Use summary_folder: {summary_folder}")
else:
    print(f"Create summary_folder: {summary_folder}")
    os.makedirs(summary_folder)

# Switch read by the guarded save cells: an existing file is rewritten only when this is True.
overwrite = False
print(f"overwrite: {overwrite}")

Use summary_folder: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files
overwrite: False


# Scan probe files

In [4]:
# Every immediate sub-directory of the pool folder is a candidate sub-library.
sub_fds = []
for _entry in os.listdir(pool_folder):
    _entry_path = os.path.join(pool_folder, _entry)
    if os.path.isdir(_entry_path):
        sub_fds.append(_entry_path)

# Parallel lists: files[i] is the probe fasta of library_names[i].
files = []
library_names = []
for _sub_fd in sub_fds:
    _contents = os.listdir(_sub_fd)
    # library name = folder name with underscores replaced by dashes
    _base_name = os.path.basename(_sub_fd).replace('_', '-')
    # main probes: take blast_full_probes.fasta when present, else filtered_full_probes.fasta
    _main_file = None
    if 'blast_full_probes.fasta' in _contents:
        _main_file = 'blast_full_probes.fasta'
    elif 'filtered_full_probes.fasta' in _contents:
        _main_file = 'filtered_full_probes.fasta'
    if _main_file is not None:
        files.append(os.path.join(_sub_fd, _main_file))
        library_names.append(_base_name)
    # sequential controls: registered as a separate library with an '-internal-sequential' suffix
    if 'Sequential' in _contents:
        _sequential_fd = os.path.join(_sub_fd, 'Sequential')
        if 'blast_sequential_full_probes.fasta' in os.listdir(_sequential_fd):
            files.append(os.path.join(_sequential_fd, 'blast_sequential_full_probes.fasta'))
            library_names.append(_base_name + '-internal-sequential')

print(files)
print(library_names)

['\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain\\human_genome_library\\blast_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain\\human_brain_promoter_enhancer_v2\\blast_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain\\mouse_brain_long_contacts\\blast_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain\\mouse_long_5kb\\blast_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain\\mouse_long_5kb\\Sequential\\blast_sequential_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain\\human_NRNX1_5kb\\blast_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain\\human_NRNX1_5kb\\Sequential\\blast_sequential_full_probes.fasta', '\\\\10.245.74.212\\Chromatin_NAS_2\\Chromatin_Libraries\\CTP-14_human_brain

## load sub-libraries

In [5]:
# Length every probe sequence is padded up to before merging.
final_pb_len = 162
# library name -> list of records parsed from that library's fasta file
library_2_pbs = {}

for _fasta_file, _library in zip(files, library_names):
    print(f"loading probes from file: {_fasta_file}")
    _library_records = []
    with open(_fasta_file, 'r') as _fasta_handle:
        for _pb in SeqIO.parse(_fasta_handle, "fasta"):
            # tag the probe id with its library of origin and blank the other header fields
            _pb.id = f"{_pb.id}_library_{_library}"
            _pb.name = ''
            _pb.description = ''
            # probes shorter than final_pb_len are right-padded with 'A';
            # longer probes are kept unchanged (no trimming happens here)
            _missing = final_pb_len - len(_pb.seq)
            if _missing > 0:
                print("**", _fasta_file)
                _pb.seq = _pb.seq + Seq('A' * _missing)

            _library_records.append(_pb)

    library_2_pbs[_library] = _library_records

loading probes from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\human_genome_library\blast_full_probes.fasta
loading probes from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\human_brain_promoter_enhancer_v2\blast_full_probes.fasta
loading probes from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\mouse_brain_long_contacts\blast_full_probes.fasta
loading probes from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\mouse_long_5kb\blast_full_probes.fasta
loading probes from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\mouse_long_5kb\Sequential\blast_sequential_full_probes.fasta
loading probes from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\human_NRNX1_5kb\blast_full_probes.fasta
loading probes from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\human_NRNX1_5kb\Sequential\bl

## check number of loci

In [20]:
from ImageAnalysis3.library_tools.quality_check import split_probe_by_gene

In [22]:
# library name -> number of loci, taken as len() of the split_probe_by_gene result
# (presumably a gene -> probes mapping; confirm against quality_check.split_probe_by_gene)
library_2_num_loci = {}
for _library in library_2_pbs:
    gene_2_pbs = split_probe_by_gene(library_2_pbs[_library])
    library_2_num_loci[_library] = len(gene_2_pbs)

In [23]:
# Display the library name -> number of loci mapping built in the previous cell.
library_2_num_loci

{'human-genome-library': 1352,
 'human-brain-promoter-enhancer-v2': 1388,
 'mouse-brain-long-contacts': 28,
 'mouse-long-5kb': 1097,
 'mouse-long-5kb-internal-sequential': 92,
 'human-NRNX1-5kb': 668,
 'human-NRNX1-5kb-internal-sequential': 24,
 'mouse-brain-relabel-chr7': 96,
 'mouse-long-5kb-sequential': 1097,
 'mouse-TSS-integrate-CTP13': 28}

## check primers

In [25]:
print(library_names)
# Per-library columns for the primer summary table, all in library_names order.
fwd_primers, rev_primers = [], []
num_probes = []
num_loci = []
for _library in library_names:
    _library_pbs = library_2_pbs[_library]
    # Primer names are read from the first probe only; its id is expected to contain
    # "primers_[<forward>,<reverse>]".
    _after_tag = _library_pbs[0].id.split('primers_[')[1]
    _primer_field = _after_tag.split(']')[0]
    _fwd, _rev = _primer_field.split(',')
    fwd_primers.append(_fwd)
    rev_primers.append(_rev)
    num_probes.append(len(_library_pbs))
    num_loci.append(library_2_num_loci[_library])

['human-genome-library', 'human-brain-promoter-enhancer-v2', 'mouse-brain-long-contacts', 'mouse-long-5kb', 'mouse-long-5kb-internal-sequential', 'human-NRNX1-5kb', 'human-NRNX1-5kb-internal-sequential', 'mouse-brain-relabel-chr7', 'mouse-long-5kb-sequential', 'mouse-TSS-integrate-CTP13']


In [26]:
import pandas as pd
# One row per library: probe count, locus count and the forward / reverse primer names.
_primer_summary_columns = dict(
    library=library_names,
    number_probes=num_probes,
    number_loci=num_loci,
    forward_primers=fwd_primers,
    reverse_primers=rev_primers,
)
primer_summary_df = pd.DataFrame(_primer_summary_columns)
primer_summary_df

Unnamed: 0,library,number_probes,number_loci,forward_primers,reverse_primers
0,human-genome-library,159879,1352,W1A03_primer_2,W1A10_primer_9
1,human-brain-promoter-enhancer-v2,165445,1388,W1A07_primer_6,W1A12_primer_11
2,mouse-brain-long-contacts,3244,28,W1B05_primer_16,W1B04_primer_15
3,mouse-long-5kb,58492,1097,W1B09_primer_20,W1B10_primer_21
4,mouse-long-5kb-internal-sequential,1731,92,W1B09_primer_20,W1B10_primer_21
5,human-NRNX1-5kb,31808,668,W1A01_primer_0,W1A02_primer_1
6,human-NRNX1-5kb-internal-sequential,335,24,W1A01_primer_0,W1A02_primer_1
7,mouse-brain-relabel-chr7,7608,96,W1A05_primer_4,W1A04_primer_3
8,mouse-long-5kb-sequential,50852,1097,W1A09_primer_8,W1A08_primer_7
9,mouse-TSS-integrate-CTP13,3360,28,W1B07_primer_18,W1A06_primer_5


In [29]:
# Write the primer summary table to CSV; an existing file is kept unless overwrite is set.
primer_summary_filename = os.path.join(summary_folder, 'Primer_summary.csv')
print(f"primer_summary_filename: {primer_summary_filename}")
if not os.path.exists(primer_summary_filename) or overwrite:
    print("save")
    primer_summary_df.to_csv(primer_summary_filename, index=False)

primer_summary_filename: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\Primer_summary.csv
save


## merge and save

In [9]:
# Concatenate the records of every library, in library_2_pbs insertion order, into one list.
final_pool_records = []
for _library in library_2_pbs:
    _library_pbs = library_2_pbs[_library]
    print(f"appending {len(_library_pbs)} probes from library: {_library}")
    final_pool_records += _library_pbs

appending 159879 probes from library: human-genome-library
appending 165445 probes from library: human-brain-promoter-enhancer-v2
appending 3244 probes from library: mouse-brain-long-contacts
appending 58492 probes from library: mouse-long-5kb
appending 1731 probes from library: mouse-long-5kb-internal-sequential
appending 31808 probes from library: human-NRNX1-5kb
appending 335 probes from library: human-NRNX1-5kb-internal-sequential
appending 7608 probes from library: mouse-brain-relabel-chr7
appending 50852 probes from library: mouse-long-5kb-sequential
appending 3360 probes from library: mouse-TSS-integrate-CTP13


In [10]:
# Merged fasta holding every probe of the pool.
save_filename = os.path.join(summary_folder, 'merged_CTP14_DNA.fasta')
# NOTE: the count is printed whether or not the file is actually written below.
print(f"number of probes saved: {len(final_pool_records)}")
print(f"save_filename: {save_filename}")

# An existing file is kept unless overwrite is set.
if not os.path.exists(save_filename) or overwrite:
    print("save")
    with open(save_filename, 'w') as _fasta_out:
        SeqIO.write(final_pool_records, _fasta_out, "fasta")

number of probes saved: 482754
save_filename: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\merged_CTP14_DNA.fasta
save


### Convert into dataframe

In [11]:
## Save csv version to fit SureDesign in Agilent
import pandas as pd

# One row per probe: record id, sequence as a plain string, and a Replication of 1 for every probe.
library_dict = {
    'SequenceName': [_pb.id for _pb in final_pool_records],
    'Sequence': [str(_pb.seq) for _pb in final_pool_records],
    'Replication': [1] * len(final_pool_records),
}
library_df = pd.DataFrame(library_dict)
library_df

Unnamed: 0,SequenceName,Sequence,Replication
0,loc_1:1235001-1265000_gene_0_pb_123_pos_8469_s...,CCCGCAATGGCTGACAACCGGTTTGCTCTGCGGACGCGGTGTTTGC...,1
1,loc_1:1235001-1265000_gene_0_pb_124_pos_8513_s...,CCCGCAATGGCTGACAACCGGAATCTATCGAAGAGGCATCGAATCT...,1
2,loc_1:1235001-1265000_gene_0_pb_125_pos_8629_s...,CCCGCAATGGCTGACAACCGGTGCACGCCGTCGAGATACCGTGCAC...,1
3,loc_1:1235001-1265000_gene_0_pb_126_pos_8680_s...,CCCGCAATGGCTGACAACCGGTTTGCTCTGCGGACGCGGTGTTTGC...,1
4,loc_1:1235001-1265000_gene_0_pb_127_pos_8736_s...,CCCGCAATGGCTGACAACCGGAATCTATCGAAGAGGCATCGAATCT...,1
...,...,...,...
482749,loc_11:78489091-78509091_gene_Vtn_pb_187_pos_1...,CACGTGGCCTCTCGCACATCTACGGATCCTACGAATACGATACGGA...,1
482750,loc_11:78489091-78509091_gene_Vtn_pb_188_pos_1...,CACGTGGCCTCTCGCACATCCCATTTCGTGCGAAGCGATACCATTT...,1
482751,loc_11:78489091-78509091_gene_Vtn_pb_189_pos_1...,CACGTGGCCTCTCGCACATCATAACGACTTGCGGATGCCAATAACG...,1
482752,loc_11:78489091-78509091_gene_Vtn_pb_190_pos_1...,CACGTGGCCTCTCGCACATCTACGGATCCTACGAATACGATACGGA...,1


In [12]:
# Full table (name, sequence, replication) as comma-separated CSV; existing file kept unless overwrite.
library_csv_filename = os.path.join(summary_folder, 'merged_CTP14_full.csv')
print(f"library_csv_filename: {library_csv_filename}")
if not os.path.exists(library_csv_filename) or overwrite:
    print("save")
    library_df.to_csv(library_csv_filename, index=False)

library_csv_filename: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\merged_CTP14_full.csv
save


In [13]:
# Same full table, written tab-separated into a .txt file; existing file kept unless overwrite.
library_txt_filename = os.path.join(summary_folder, 'merged_CTP14_DNA.txt')
print(f"library_txt_filename: {library_txt_filename}")
if not os.path.exists(library_txt_filename) or overwrite:
    print("save")
    library_df.to_csv(library_txt_filename, sep='\t', index=False)

library_txt_filename: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\merged_CTP14_DNA.txt
save


In [14]:
# Independent copy of the 'Sequence' column only (a pandas Series, despite the _df suffix).
library_sequence_df = library_df['Sequence'].copy()

# Target path for the sequence-only export written in the next cell.
library_sequence_filename = os.path.join(summary_folder, 'merged_CTP14_sequence.csv')
library_sequence_df

0         CCCGCAATGGCTGACAACCGGTTTGCTCTGCGGACGCGGTGTTTGC...
1         CCCGCAATGGCTGACAACCGGAATCTATCGAAGAGGCATCGAATCT...
2         CCCGCAATGGCTGACAACCGGTGCACGCCGTCGAGATACCGTGCAC...
3         CCCGCAATGGCTGACAACCGGTTTGCTCTGCGGACGCGGTGTTTGC...
4         CCCGCAATGGCTGACAACCGGAATCTATCGAAGAGGCATCGAATCT...
                                ...                        
482749    CACGTGGCCTCTCGCACATCTACGGATCCTACGAATACGATACGGA...
482750    CACGTGGCCTCTCGCACATCCCATTTCGTGCGAAGCGATACCATTT...
482751    CACGTGGCCTCTCGCACATCATAACGACTTGCGGATGCCAATAACG...
482752    CACGTGGCCTCTCGCACATCTACGGATCCTACGAATACGATACGGA...
482753    CACGTGGCCTCTCGCACATCCCATTTCGTGCGAAGCGATACCATTT...
Name: Sequence, Length: 482754, dtype: object

In [15]:
# Sequence-only export (single column, tab as separator); existing file kept unless overwrite.
library_sequence_filename = os.path.join(summary_folder, 'merged_CTP14_sequence.csv')
print(f"library_sequence_filename: {library_sequence_filename}")
if not os.path.exists(library_sequence_filename) or overwrite:
    print("save")
    library_sequence_df.to_csv(library_sequence_filename, sep='\t', index=False)

library_sequence_filename: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\merged_CTP14_sequence.csv
save


# split library_sequence_df

## load from here

In [16]:
import pandas as pd
# Restart point: rebuild library_df from the full CSV on disk instead of from the in-memory records.
library_csv_filename = os.path.join(summary_folder, 'merged_CTP14_full.csv')
print(library_csv_filename)
library_df = pd.read_csv(library_csv_filename, sep=',')

\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\merged_CTP14_full.csv


In [17]:
# part 1 sequence: the first `limit` rows of the merged library, sequence column only
# `limit` is also read by the part 2 cell, so it is always defined here.
limit = 244000
library_sequence_part1_filename = os.path.join(summary_folder, 'merged_CTP14_sequence_part1.csv')
print(library_sequence_part1_filename)
library_df_part1 = library_df.iloc[:limit]
print(len(library_df_part1))
# Guard the write like every other save cell in this notebook, so that re-running the
# cell cannot silently replace an already exported file unless overwrite is set.
if overwrite or not os.path.exists(library_sequence_part1_filename):
    print("save")
    library_df_part1['Sequence'].to_csv(library_sequence_part1_filename, index=False, sep='\t')

\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\merged_CTP14_sequence_part1.csv
244000


In [18]:
# part 2 sequence: every row from index `limit` onwards, sequence column only
# (`limit` is defined in the part 1 cell, which must have been run first)
library_sequence_part2_filename = os.path.join(summary_folder, 'merged_CTP14_sequence_part2.csv')
print(library_sequence_part2_filename)
library_df_part2 = library_df.iloc[limit:]
print(len(library_df_part2))
# Guard the write like every other save cell in this notebook, so that re-running the
# cell cannot silently replace an already exported file unless overwrite is set.
if overwrite or not os.path.exists(library_sequence_part2_filename):
    print("save")
    library_df_part2['Sequence'].to_csv(library_sequence_part2_filename, index=False, sep='\t')

\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-14_human_brain\Summary_files\merged_CTP14_sequence_part2.csv
238754


# Check OligoPools

In [12]:
import pandas as pd
# Read the part 1 export back from disk so the check runs on what was actually written.
library_sequence_part1_filename = os.path.join(summary_folder, 'merged_CTP14_sequence_part1.csv')
lib_df_part1 = pd.read_csv(library_sequence_part1_filename)

# Tally probes per primer: the first primer_len characters of each sequence are the key.
primerSeq_2_part1PbNum = {}
primer_len = 20
for _pb_seq in lib_df_part1['Sequence']:
    _primer = _pb_seq[:primer_len]
    primerSeq_2_part1PbNum[_primer] = primerSeq_2_part1PbNum.get(_primer, 0) + 1

primerSeq_2_part1PbNum

{'CCCGCAATGGCTGACAACCG': 159879, 'CGCAAACTGGTGCGGAAGGC': 84121}

In [11]:
# Read the part 2 export back from disk so the check runs on what was actually written.
library_sequence_part2_filename = os.path.join(summary_folder, 'merged_CTP14_sequence_part2.csv')
lib_df_part2 = pd.read_csv(library_sequence_part2_filename)

# Tally probes per primer: the first primer_len characters of each sequence are the key.
primerSeq_2_part2PbNum = {}
primer_len = 20
for _pb_seq in lib_df_part2['Sequence']:
    _primer = _pb_seq[:primer_len]
    primerSeq_2_part2PbNum[_primer] = primerSeq_2_part2PbNum.get(_primer, 0) + 1

primerSeq_2_part2PbNum

{'CGCAAACTGGTGCGGAAGGC': 81324,
 'CGGGTTTCGTTGCGCACACC': 3244,
 'TAGGCGTGTCGGCCAACCAG': 60223,
 'CGGCTCGCAGCGTGTAAACG': 32143,
 'CATTCAGCATTGCGCAACGG': 7608,
 'TTGTTGAGGCGGCGGAAGTC': 50852,
 'CACGTGGCCTCTCGCACATC': 3360}

{'CCCGCAATGGCTGACAACCG': 159879, 'CGCAAACTGGTGCGGAAGGC': 84121}

### check probe lengths for all sub-libraries

In [19]:
# Distinct probe lengths across the merged pool; a single printed value means all probes share one length.
probe_lengths = list(map(len, final_pool_records))
print(np.unique(probe_lengths))

[162]


### check readout usage

In [None]:
# for each library, check all the used readouts
from ImageAnalysis3.library_tools.quality_check import check_read