# create reference files for adding adaptors and analysis

for library CTP-10 Aire (209 gene)

by Pu Zheng

2021.06.07

In [12]:
%run "..\Startup_py3.py"
sys.path.append(r"..\..\..\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

# other required parameters
from ImageAnalysis3.classes import _allowed_kwds

43364


In [13]:
probe_filename = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\merged_Aire_209gene.fasta'
if not os.path.isfile(probe_filename):
    raise IOError(f"input probe file: {probe_filename} doesn't exist.")


In [16]:
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

In [18]:
pb_records = []
with open(probe_filename, 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"):
        pb_records.append(record)

In [19]:
pb_records[0].id

'Genes-intronic-RNA_loc_9:104326978-104337542_gene_Acpp_pb_5_pos_213_strand_-_readouts_[Stv_59_c,Stv_59_c,Stv_59_c,Stv_59_c]'

In [21]:
# extract library names
lib_names = []
lib_splitter = 'loc'
for _r in pb_records:
    _lib_name = _r.id.split('_'+lib_splitter+'_')[0]
    if _lib_name not in lib_names:
        lib_names.append(_lib_name)

In [23]:
# extract library probes

lib_records_dict = {_n:[] for _n in lib_names}
for _r in pb_records:
    _lib_name = _r.id.split('_'+lib_splitter+'_')[0]
    if _lib_name == lib_names[lib_id]:
        lib_records_dict[_lib_name].append(_r)

In [26]:
len(lib_records_dict['Genes-intronic-RNA'])

14704

In [27]:
pb_dict = library_tools.quality_check.split_probe_by_gene(lib_records_dict['Genes-intronic-RNA'])

In [30]:
len(pb_dict.keys())

196

<module 'ImageAnalysis3.library_tools.sequences' from '..\\..\\..\\Documents\\ImageAnalysis3\\library_tools\\sequences.py'>

In [45]:
readout_usage_file = os.path.join(r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_intronic_RNA', 'readout_usage.pkl')
readout_dict = pickle.load(open(readout_usage_file, 'rb'))

In [52]:
selected_readout_names = [_r.id for _r in readout_dict[_allowed_kwds[lib_type]]]

## save adaptor_sequences.csv for adding adaptors

In [48]:
readout_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Readouts'
ref_files = [_fl for _fl in os.listdir(readout_folder) if 'designed_readouts' in _fl]

ref_readout_dict = {}
for _fl in ref_files:
    _channel = int(_fl.split('designed_readouts_')[1].split('.fasta')[0])
    _ref_readout_names = []
    with open(os.path.join(readout_folder, _fl), 'r') as _rd_handle:
        for _readout in SeqIO.parse(_rd_handle, "fasta"):
            _ref_readout_names.append(_readout.id)
    ref_readout_dict[_channel] = _ref_readout_names

In [60]:
ref_readout_dict

{561: ['Stv_91',
  'Stv_92',
  'Stv_94',
  'Stv_95',
  'Stv_99',
  'Stv_100',
  'Stv_101',
  'Stv_104',
  'Stv_105',
  'Stv_106',
  'Stv_107',
  'Stv_109',
  'Stv_118',
  'Stv_119',
  'Stv_120',
  'Stv_121',
  'Stv_125',
  'Stv_127',
  'Stv_129',
  'Stv_130',
  'Stv_131',
  'Stv_133',
  'Stv_136',
  'Stv_145',
  'Stv_182',
  'NDB_3',
  'NDB_6',
  'NDB_9',
  'NDB_12',
  'NDB_15',
  'NDB_18',
  'NDB_21',
  'NDB_24',
  'NDB_27',
  'NDB_30',
  'NDB_33',
  'NDB_36',
  'NDB_39',
  'NDB_42',
  'NDB_45',
  'NDB_48',
  'NDB_51',
  'NDB_54',
  'NDB_57',
  'NDB_60',
  'NDB_63',
  'NDB_66',
  'NDB_69',
  'NDB_72',
  'NDB_75',
  'NDB_78',
  'NDB_81',
  'NDB_84',
  'NDB_87',
  'NDB_90',
  'NDB_93',
  'NDB_96',
  'NDB_99',
  'NDB_102',
  'NDB_105',
  'NDB_108',
  'NDB_111',
  'NDB_114',
  'NDB_117',
  'NDB_120',
  'NDB_123',
  'NDB_126',
  'NDB_129',
  'NDB_132',
  'NDB_135',
  'NDB_138',
  'NDB_141',
  'NDB_144',
  'NDB_147',
  'NDB_150',
  'NDB_153',
  'NDB_156',
  'NDB_159',
  'NDB_162',
  'NDB_16

In [81]:
# sort regions with readout types
readout_by_channel = {_c:[] for _c in ref_readout_dict}
for _rd in readout_dict[_allowed_kwds[lib_type]]:
    for _c in readout_by_channel:
        if _rd.id in ref_readout_dict[_c]:
            readout_by_channel[_c].append(_rd.id)
readout_by_channel = {_k:_v for _k,_v in sorted(readout_by_channel.items(), key=lambda v:-int(v[0])) if len(_v) > 0}

In [82]:
readout_by_channel

{750: ['Stv_3',
  'Stv_4',
  'Stv_5',
  'Stv_6',
  'Stv_7',
  'Stv_8',
  'Stv_9',
  'Stv_10',
  'Stv_11',
  'Stv_12',
  'Stv_13',
  'Stv_14',
  'Stv_16',
  'Stv_19',
  'Stv_20',
  'Stv_21',
  'Stv_22',
  'Stv_23',
  'Stv_25',
  'Stv_26',
  'Stv_27',
  'Stv_28',
  'Stv_29',
  'Stv_30',
  'Stv_31'],
 647: ['Stv_32',
  'Stv_33',
  'Stv_35',
  'Stv_36',
  'Stv_37',
  'Stv_39',
  'Stv_40',
  'Stv_42',
  'Stv_44',
  'Stv_45',
  'Stv_46',
  'Stv_48',
  'Stv_50',
  'Stv_53',
  'Stv_54',
  'Stv_59',
  'Stv_60',
  'Stv_61',
  'Stv_63',
  'Stv_64',
  'Stv_65',
  'Stv_86',
  'Stv_87',
  'Stv_88',
  'Stv_90']}

In [83]:
readout_names = {}
for _r in pb_records:
    _rname = _r.id.split('gene_')[1].split('_')[0]
    _rd_names = _r.id.split('[')[1].split(']')[0].split(',')
    if _rname not in readout_names:
        readout_names[_rname] = []
    for _rd in _rd_names:
        if _rd not in readout_names[_rname]:
            if '_'+_allowed_kwds[lib_type] not in _rd:
                readout_names[_rname].append(_rd)
            else:
                readout_names[_rname].append(_rd.split('_'+_allowed_kwds[lib_type])[0])

In [140]:
lib_type

'combo'

In [94]:
readout_usage_file = os.path.join(r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Genes_TSS_DNA', 'readout_usage.pkl')
readout_dict = pickle.load(open(readout_usage_file, 'rb'))

# load reference
readout_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Readouts'
ref_files = [_fl for _fl in os.listdir(readout_folder) if 'designed_readouts' in _fl]

ref_readout_dict = {}
for _fl in ref_files:
    _channel = int(_fl.split('designed_readouts_')[1].split('.fasta')[0])
    _ref_readout_names = []
    with open(os.path.join(readout_folder, _fl), 'r') as _rd_handle:
        for _readout in SeqIO.parse(_rd_handle, "fasta"):
            _ref_readout_names.append(_readout.id)
    ref_readout_dict[_channel] = _ref_readout_names

# sort regions with readout types
readout_by_channel = {_c:[] for _c in ref_readout_dict}
for _rd in readout_dict[_allowed_kwds[lib_type]]:
    for _c in readout_by_channel:
        if _rd.id in ref_readout_dict[_c]:
            readout_by_channel[_c].append(_rd.id)
readout_by_channel = {_k:_v for _k,_v in sorted(readout_by_channel.items(), key=lambda v:-int(v[0])) if len(_v) > 0}


dict_sizes = [len(_v) for _k,_v in readout_by_channel.items()]
print(dict_sizes)

import csv

save_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Summary_tables'
with open(os.path.join(save_folder, f'TSS_DNA_{lib_type}_adaptor_sequences.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    
    _header = ['group', 'hyb', ]
    for _ch in readout_by_channel:
        _header += [f"{_ch}_bits", f"{_ch}_readouts"]
        
    csvwriter.writerow(_header)
    
    for _i in range(max(dict_sizes)):
        _row = [int((_i)/32)+1, _i%32+1,] 
        for _j, (_ch, _names) in enumerate(readout_by_channel.items()):
            if _i >= len(_names):
                _row += ['', '']
            else:
                _row += [f"b{_i*len(readout_by_channel)+_j}", _names[_i]]      
        csvwriter.writerow(_row)


[25, 25]


In [111]:
adaptors[0]

SeqRecord(seq=Seq('CCGCTTGCGAGTAGGGCAATGATCCGATTGGAACCGTCCCGATCCGATTGGAACCGTCCC'), id='Stv_15_2xStv_1', name='Stv_15_2xStv_1', description='Stv_15_2xStv_1', dbxrefs=[])

In [129]:
from copy import copy

In [133]:
from copy import copy
selected_adaptors = {}

for _ch, _rnames in readout_by_channel.items():
    selected_adaptors[_ch] = []
    for _rname in _rnames:
        _matched_adaptors = [_adt for _adt in adaptors if _rname in _adt.id]
        if len(_matched_adaptors) == 1:
            _matched_adaptor = copy(_matched_adaptors[0])
            _matched_adaptor.id = _matched_adaptor.id+'rc'
            _matched_adaptor.description = ""

            selected_adaptors[_ch].append(_matched_adaptor)
        else:
            print(_rname)

type

In [143]:
readout_usage_folder = os.path.dirname(readout_usage_file)
# generate csv file to order in IDT
import csv

with open(os.path.join(readout_usage_folder, f'{lib_type}_adaptor.csv'), 'w') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', lineterminator='\n',
                       quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    _header = ['Name', 'Sequence', 'Scale', 'Purification']
    csvwriter.writerow(_header)
    # write sequence
    for _ch, _adaptors in selected_adaptors.items():
        for _adaptor in _adaptors:
            _info = [_adaptor.id, str(_adaptor.seq), '25nm', 'STD']
            csvwriter.writerow(_info)

In [136]:
selected_adaptors

{750: [SeqRecord(seq=Seq('TAGAAATATCCGAGTGGCATTTTGCACTGCCGTCCTTGACTTTGCACTGCCGTCCTTGAC'), id='NDB_784_2xStv_82rc', name='NDB_784_2xStv_82', description='', dbxrefs=[]),
  SeqRecord(seq=Seq('ATATGTCGATGTCCTTAGACTTTGCACTGCCGTCCTTGACTTTGCACTGCCGTCCTTGAC'), id='NDB_826_2xStv_82rc', name='NDB_826_2xStv_82', description='', dbxrefs=[]),
  SeqRecord(seq=Seq('GATCCCGGTCGGATATGTGATTTGCACTGCCGTCCTTGACTTTGCACTGCCGTCCTTGAC'), id='NDB_865_2xStv_82rc', name='NDB_865_2xStv_82', description='', dbxrefs=[]),
  SeqRecord(seq=Seq('GAACTCGTTGACTAAGCATTTTTGCACTGCCGTCCTTGACTTTGCACTGCCGTCCTTGAC'), id='NDB_817_2xStv_82rc', name='NDB_817_2xStv_82', description='', dbxrefs=[]),
  SeqRecord(seq=Seq('TAGGCGCTTTAGTAGGTACCTTTGCACTGCCGTCCTTGACTTTGCACTGCCGTCCTTGAC'), id='NDB_652_2xStv_82rc', name='NDB_652_2xStv_82', description='', dbxrefs=[]),
  SeqRecord(seq=Seq('CGTATTGCAGGATCCTTATGTTTGCACTGCCGTCCTTGACTTTGCACTGCCGTCCTTGAC'), id='NDB_718_2xStv_82rc', name='NDB_718_2xStv_82', description='', dbxrefs=[]),
  SeqRecord

In [132]:
# find adaptors
reload(library_tools.sequences)
#library_tools.sequences.fasta_reader()

adaptor_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Adaptors'
adaptor_files = [os.path.join(adaptor_folder, _fl) for _fl in os.listdir(adaptor_folder) 
                 if _fl.split(os.extsep)[-1]=='fasta' and _fl.split(os.extsep)[-2][-9:] == '_adaptors']
adaptors = library_tools.sequences.fasta_reader(adaptor_files, True).load()

loading 3 fasta files
- loading from file: \\10.245.74.212\Chromatin_NAS_2\Libraries\Adaptors\20200121_extend_stv_adaptors.fasta
- loading from file: \\10.245.74.212\Chromatin_NAS_2\Libraries\Adaptors\NDB_adaptors.fasta
- loading from file: \\10.245.74.212\Chromatin_NAS_2\Libraries\Adaptors\Stv_adaptors.fasta


In [100]:
adaptor_files

['\\\\10.245.74.212\\Chromatin_NAS_2\\Libraries\\Adaptors\\20200121_extend_stv_adaptors.fasta',
 '\\\\10.245.74.212\\Chromatin_NAS_2\\Libraries\\Adaptors\\NDB_adaptors.fasta',
 '\\\\10.245.74.212\\Chromatin_NAS_2\\Libraries\\Adaptors\\Stv_adaptors.fasta']

In [85]:
dict_sizes = [len(_v) for _k,_v in readout_by_channel.items()]
print(dict_sizes)

[25, 25]


In [86]:
import csv

save_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-10_Aire\Summary_tables'
with open(os.path.join(save_folder, f'Intronic_RNA_{lib_type}_adaptor_sequences.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    
    _header = ['group', 'hyb', ]
    for _ch in readout_by_channel:
        _header += [f"{_ch}_bits", f"{_ch}_readouts"]
        
    csvwriter.writerow(_header)
    
    for _i in range(max(dict_sizes)):
        _row = [int((_i)/32)+1, _i%32+1,] 
        for _j, (_ch, _names) in enumerate(readout_by_channel.items()):
            if _i >= len(_names):
                _row += ['', '']
            else:
                _row += [f"b{_i*len(readout_by_channel)+_j}", _names[_i]]      
        csvwriter.writerow(_row)


In [87]:
import csv

drift_channel = '488'
dapi_channel = '405'
chrom_labels = {}

with open(os.path.join(save_folder, f'Intronic_RNA_{lib_type}_Color_Usage.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    
    # write header
    _header = ['Hyb']
    for _ch in readout_by_channel:
        _header .append(str(_ch))
    _header.append(drift_channel)
    _header.append(dapi_channel)
    print(_header)
    csvwriter.writerow(_header)
    
    # write reference frame
    _ref_row = ['H0R0']
    for _ch in readout_by_channel:
        if str(_ch) in chrom_labels:
            _ref_row.append(chrom_labels[str(_ch)]+'_chrom')
        else:
            _ref_row.append("")
    _ref_row.append('beads')
    _ref_row.append('DAPI')
    print(_ref_row)
    csvwriter.writerow(_ref_row)
    
    
    for _i in range(max(dict_sizes)):
        _row = [f"H{int(_i)+1}C{int(_i)+1}",] 
        for _j, (_ch, _names) in enumerate(readout_by_channel.items()):
            if _i >= len(_names):
                _row += ['']
            else:
                _row += [f"{_allowed_kwds[lib_type]}{_i*len(readout_by_channel)+_j}"]
        _row.append("beads")
        print(_row)
        csvwriter.writerow(_row)


['Hyb', '750', '647', '488', '405']
['H0R0', '', '', 'beads', 'DAPI']
['H1C1', 'c0', 'c1', 'beads']
['H2C2', 'c2', 'c3', 'beads']
['H3C3', 'c4', 'c5', 'beads']
['H4C4', 'c6', 'c7', 'beads']
['H5C5', 'c8', 'c9', 'beads']
['H6C6', 'c10', 'c11', 'beads']
['H7C7', 'c12', 'c13', 'beads']
['H8C8', 'c14', 'c15', 'beads']
['H9C9', 'c16', 'c17', 'beads']
['H10C10', 'c18', 'c19', 'beads']
['H11C11', 'c20', 'c21', 'beads']
['H12C12', 'c22', 'c23', 'beads']
['H13C13', 'c24', 'c25', 'beads']
['H14C14', 'c26', 'c27', 'beads']
['H15C15', 'c28', 'c29', 'beads']
['H16C16', 'c30', 'c31', 'beads']
['H17C17', 'c32', 'c33', 'beads']
['H18C18', 'c34', 'c35', 'beads']
['H19C19', 'c36', 'c37', 'beads']
['H20C20', 'c38', 'c39', 'beads']
['H21C21', 'c40', 'c41', 'beads']
['H22C22', 'c42', 'c43', 'beads']
['H23C23', 'c44', 'c45', 'beads']
['H24C24', 'c46', 'c47', 'beads']
['H25C25', 'c48', 'c49', 'beads']


## save adaptor_sequences.csv for adding adaptors

In [29]:
import csv

save_folder = r'\\10.245.74.69\Chromatin_NAS_2\Libraries\SI16\color_usage_info'
with open(os.path.join(save_folder, 'adaptor_sequences.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['group', 'hyb', 
                        '750_region', '750_readout',
                        '647_region', '647_readout',
                        '561_region', '561_readout',])
    
    chr_ids = np.array(list(readout_names.keys()))

    for _i in range(int(len(chr_ids)/3)):
        _ids = chr_ids[_i*3:_i*3+3]
        csvwriter.writerow([int((_i)/32)+1, _i%32+1,
                            _ids[0], readout_names[_ids[0]], 
                            _ids[1], readout_names[_ids[1]], 
                            _ids[2], readout_names[_ids[2]], ])

## save color_usage for analysis

In [30]:
import csv

with open(os.path.join(save_folder, 'Color_Usage.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    csvwriter.writerow(['Hyb', '750', '647', '561', '488', '405',])
    
    # write reference frame
    csvwriter.writerow(['H0R0', 'forward_chrom', 'reverse_chrom', '', 'beads', 'DAPI',])
    
    chr_ids = np.array(list(readout_names.keys()))
    for _i in range(int(len(chr_ids)/3)):
        _ids = chr_ids[_i*3:_i*3+3]
        csvwriter.writerow([f"H{int(_i)+1}R{int(_i)+1}",
                            f"u{_ids[0]}",
                            f"u{_ids[1]}",
                            f"u{_ids[2]}",
                            "beads",
                            "",])
        

## save region_positions for analysis

In [31]:
import csv

with open(os.path.join(save_folder, 'Region_Positions.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    csvwriter.writerow(['region', 'chr', 'start', 'end', 'midpoint',])
    
    for _i,_info in region_info.items():
        csvwriter.writerow([_i, 
                            _info['chr'], 
                            _info['start'],
                            _info['end'],
                            _info['mid'],
                           ])