# create reference files for adding adaptors and analysis

for library CTP-08 IgH (batch1)

by Pu Zheng

2020.07.29

In [1]:
# Run the author's personal startup script. Its contents are not visible here;
# presumably it provides common names used below such as os, sys and np -- TODO confirm.
%run "E:\Users\puzheng\Documents\Startup_py3.py"
# Add the author's Documents folder to the import search path
# (presumably where the ImageAnalysis3 package lives -- TODO confirm).
sys.path.append(r"E:\Users\puzheng\Documents")

import ImageAnalysis3 as ia
# IPython magic: select the interactive "notebook" matplotlib backend.
%matplotlib notebook

from ImageAnalysis3 import *
# Print the process id of this kernel.
print(os.getpid())

20460


In [2]:
import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

In [3]:
# FASTA file holding the final probes of batch 1 of the CTP-08 IgH library.
probe_filename = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\CTP-08_IgH\5kb\final_probes\batch_1_final_probes.fasta'
# Fail immediately when the probe file cannot be found, instead of failing
# later while parsing it.
_probe_file_exists = os.path.isfile(probe_filename)
if not _probe_file_exists:
    raise IOError(f"input probe file: {probe_filename} doesn't exist.")


In [4]:
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

## calculate probe number per region

In [5]:
# Read every record of the probe FASTA file into a list, in file order.
pb_records = []
with open(probe_filename, 'r') as handle:
    pb_records.extend(SeqIO.parse(handle, "fasta"))

In [6]:
# Group the probe records with ImageAnalysis3's split_probe_by_gene helper.
# The result is consumed below through .items() and len(value); presumably it
# maps a region/gene id to the list of its probe records -- confirm in the helper.
pb_dict = library_tools.quality_check.split_probe_by_gene(pb_records)

In [10]:
# Number of probes in each region, keyed exactly like pb_dict.
reg_size_dict = {}
for _region, _probes in pb_dict.items():
    reg_size_dict[_region] = len(_probes)
print(reg_size_dict)

{'41': 100, '42': 83, '43': 100, '44': 70, '45': 61, '46': 16, '47': 55, '48': 29, '49': 26, '50': 20, '51': 21, '52': 30, '53': 45, '54': 84, '55': 50, '56': 31, '57': 4, '59': 15, '60': 32, '61': 28, '62': 50, '63': 69, '64': 27, '65': 3, '66': 57, '67': 53, '68': 29, '69': 21, '70': 26, '71': 36, '72': 57, '73': 53, '74': 23, '75': 36, '76': 18, '77': 33, '78': 57, '79': 47, '80': 40, '81': 39, '82': 35, '83': 40, '84': 31, '85': 10, '86': 5, '87': 51, '88': 40, '89': 30, '90': 23, '91': 16, '92': 14, '93': 6, '94': 32, '95': 25, '96': 55, '97': 51, '98': 18, '99': 9, '100': 29, '101': 53, '102': 58, '103': 45, '104': 66, '105': 73, '106': 38, '107': 50, '108': 45, '109': 1, '110': 36, '111': 49, '112': 43, '113': 55, '114': 48, '115': 11, '321': 36, '323': 25, '324': 35, '325': 51, '326': 17, '327': 12, '328': 10, '329': 3, '330': 2, '331': 12, '332': 11, '333': 14, '334': 11, '335': 1, '337': 21, '339': 20, '340': 31, '341': 42, '342': 27, '343': 29, '344': 28, '345': 32, '346': 1

In [11]:
# Collect, for every region id, the readout names listed in the probe id.
# Only the first probe record seen for a region contributes to its list.
readout_names = {}
for _record in pb_records:
    _probe_id = _record.id
    # region id: the integer that follows 'gene_' in the probe id
    _region = int(_probe_id.split('gene_')[1].split('_')[0])
    # readout names: the comma-separated list between '[' and ']'
    _listed = _probe_id.split('[')[1].split(']')[0].split(',')
    if _region in readout_names:
        continue
    _kept = readout_names[_region] = []
    for _name in _listed:
        # NOTE(review): this membership test uses the raw name, while the stored
        # names have any '_u...' suffix removed, so suffixed names are never
        # recognised as duplicates (hence the repeated entries per region).
        if _name in _kept:
            continue
        # drop everything from the first '_u' on; names without it are kept as-is
        _kept.append(_name.split('_u')[0])
                    

In [12]:
# Display the readout names collected per region id.
readout_names

{41: ['Stv_19', 'Stv_19', 'Stv_19'],
 42: ['Stv_53', 'Stv_53', 'Stv_53'],
 43: ['Stv_119', 'Stv_119', 'Stv_119'],
 44: ['Stv_20', 'Stv_20', 'Stv_20'],
 45: ['Stv_54', 'Stv_54', 'Stv_54'],
 46: ['Stv_120', 'Stv_120', 'Stv_120'],
 47: ['Stv_21', 'Stv_21', 'Stv_21'],
 48: ['Stv_59', 'Stv_59', 'Stv_59'],
 49: ['Stv_121', 'Stv_121', 'Stv_121'],
 50: ['Stv_22', 'Stv_22', 'Stv_22'],
 51: ['Stv_60', 'Stv_60', 'Stv_60'],
 52: ['Stv_125', 'Stv_125', 'Stv_125'],
 53: ['Stv_23', 'Stv_23', 'Stv_23'],
 54: ['Stv_61', 'Stv_61', 'Stv_61'],
 55: ['Stv_127', 'Stv_127', 'Stv_127'],
 56: ['Stv_25', 'Stv_25', 'Stv_25'],
 57: ['Stv_63', 'Stv_63', 'Stv_63'],
 59: ['Stv_129', 'Stv_129', 'Stv_129'],
 60: ['Stv_26', 'Stv_26', 'Stv_26'],
 61: ['Stv_64', 'Stv_64', 'Stv_64'],
 62: ['Stv_130', 'Stv_130', 'Stv_130'],
 63: ['Stv_27', 'Stv_27', 'Stv_27'],
 64: ['Stv_65', 'Stv_65', 'Stv_65'],
 65: ['Stv_131', 'Stv_131', 'Stv_131'],
 66: ['Stv_28', 'Stv_28', 'Stv_28'],
 67: ['Stv_86', 'Stv_86', 'Stv_86'],
 68: ['Stv_133

In [13]:
# Parse the genomic location of every region from the probe ids.
# Only the first probe record seen for a region is parsed.
region_info = {}
for _record in pb_records:
    _rid = int(_record.id.split('gene_')[1].split('_')[0])
    if _rid in region_info:
        continue

    # text between the first 'chr' and '_gene', split below as '<name>:<start>-<end>'
    _reg_info = _record.id.split('_gene')[0].split('chr')[1]
    _fields = _reg_info.split(':')
    chr_name = 'chr' + _fields[0]
    _bounds = _fields[1].split('-')
    start = int(_bounds[0])
    end = int(_bounds[1])
    # midpoint of the region, truncated to an integer
    mid = int((start + end) / 2)
    region_info[_rid] = dict(chr=chr_name, start=start, end=end, mid=mid)

In [14]:
# Display the parsed chr/start/end/mid of every region.
region_info

{41: {'chr': 'chr12', 'start': 114654081, 'end': 114659081, 'mid': 114656581},
 42: {'chr': 'chr12', 'start': 114659081, 'end': 114664081, 'mid': 114661581},
 43: {'chr': 'chr12', 'start': 114664081, 'end': 114669081, 'mid': 114666581},
 44: {'chr': 'chr12', 'start': 114669081, 'end': 114674081, 'mid': 114671581},
 45: {'chr': 'chr12', 'start': 114674081, 'end': 114679081, 'mid': 114676581},
 46: {'chr': 'chr12', 'start': 114679081, 'end': 114684081, 'mid': 114681581},
 47: {'chr': 'chr12', 'start': 114684081, 'end': 114689081, 'mid': 114686581},
 48: {'chr': 'chr12', 'start': 114689081, 'end': 114694081, 'mid': 114691581},
 49: {'chr': 'chr12', 'start': 114694081, 'end': 114699081, 'mid': 114696581},
 50: {'chr': 'chr12', 'start': 114699081, 'end': 114704081, 'mid': 114701581},
 51: {'chr': 'chr12', 'start': 114704081, 'end': 114709081, 'mid': 114706581},
 52: {'chr': 'chr12', 'start': 114709081, 'end': 114714081, 'mid': 114711581},
 53: {'chr': 'chr12', 'start': 114714081, 'end': 114

## save adaptor_sequences.csv for adding adaptors

In [15]:
# Folder with the reference readout FASTA files.
readout_folder = r'\\10.245.74.212\Chromatin_NAS_2\Libraries\Readouts'
# Keep only the files whose name contains 'designed_readouts'.
ref_files = []
for _fl in os.listdir(readout_folder):
    if 'designed_readouts' in _fl:
        ref_files.append(_fl)

# channel (int) -> ids of all readouts found in that channel's reference file
ref_readout_dict = {}
for _fl in ref_files:
    # the channel is the integer between 'designed_readouts_' and '.fasta'
    _channel = int(_fl.split('designed_readouts_')[1].split('.fasta')[0])
    with open(os.path.join(readout_folder, _fl), 'r') as _rd_handle:
        _ref_readout_names = [_readout.id for _readout in SeqIO.parse(_rd_handle, "fasta")]
    ref_readout_dict[_channel] = _ref_readout_names

In [17]:
# sort regions with readout types
# channel -> {region id -> readout name}; one (initially empty) dict per reference channel
readout_by_channel = {_c: {} for _c in ref_readout_dict}
for _reg, _names in readout_names.items():
    # The first name in sorted order represents the region; it does not depend
    # on the channel, so it is computed once per region.
    _rd = np.unique(_names)[0]
    for _c in readout_by_channel:
        if _rd in ref_readout_dict[_c]:
            readout_by_channel[_c][_reg] = _rd
            # A region belongs to a single channel. The earlier guard
            # `_reg not in readout_by_channel.values()` compared the region id
            # with the per-channel dicts themselves, so it was always true and
            # never prevented a second assignment; stopping at the first
            # matching channel enforces the intended rule.
            break

In [18]:
# Display the region -> readout assignment of every channel.
readout_by_channel

{750: {41: 'Stv_19',
  44: 'Stv_20',
  47: 'Stv_21',
  50: 'Stv_22',
  53: 'Stv_23',
  56: 'Stv_25',
  60: 'Stv_26',
  63: 'Stv_27',
  66: 'Stv_28',
  69: 'Stv_29',
  72: 'Stv_30',
  75: 'Stv_31',
  78: 'NDB_1',
  81: 'NDB_4',
  84: 'NDB_7',
  87: 'NDB_10',
  90: 'NDB_13',
  93: 'NDB_16',
  96: 'NDB_19',
  99: 'NDB_22',
  102: 'NDB_25',
  105: 'NDB_28',
  108: 'NDB_31',
  111: 'NDB_34',
  114: 'NDB_37',
  323: 'NDB_241',
  326: 'NDB_244',
  329: 'NDB_247',
  332: 'NDB_250',
  335: 'NDB_253',
  339: 'NDB_256',
  342: 'NDB_259',
  345: 'NDB_262',
  348: 'NDB_265',
  351: 'NDB_268',
  354: 'NDB_271',
  357: 'NDB_274',
  360: 'NDB_277',
  363: 'NDB_280',
  366: 'NDB_283',
  369: 'NDB_286',
  372: 'NDB_289',
  375: 'NDB_292',
  379: 'NDB_295',
  382: 'NDB_298',
  388: 'NDB_304',
  391: 'NDB_307',
  394: 'NDB_310'},
 647: {42: 'Stv_53',
  45: 'Stv_54',
  48: 'Stv_59',
  51: 'Stv_60',
  54: 'Stv_61',
  57: 'Stv_63',
  61: 'Stv_64',
  64: 'Stv_65',
  67: 'Stv_86',
  70: 'Stv_87',
  73: 'Stv_88

In [19]:
# Number of regions assigned to each channel, in channel order.
dict_sizes = list(map(len, readout_by_channel.values()))
print(dict_sizes)

[48, 50, 46]


In [35]:
import csv

# Analysis folder of the experiment; adaptor_sequences.csv is written there.
save_folder = r'\\10.245.74.158\Chromatin_NAS_6\20200807-B_Dox-IAA-STI+_CTP-08_IgH\Analysis'
with open(os.path.join(save_folder, 'adaptor_sequences.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)

    # header: group, hyb, then a region column and a readout column per channel
    _header = ['group', 'hyb']
    for _ch in readout_by_channel:
        _header.extend([f"{_ch}_region", f"{_ch}_readout"])
    csvwriter.writerow(_header)

    # region ids of every channel in ascending order, sorted once up front
    _sorted_regions = {_ch: sorted(_dict) for _ch, _dict in readout_by_channel.items()}

    # one row per hybridization round; rounds are numbered 1..32 within each group
    for _i in range(max(dict_sizes)):
        _group, _hyb = divmod(_i, 32)
        _row = [_group + 1, _hyb + 1]
        for _ch, _dict in readout_by_channel.items():
            _regs = _sorted_regions[_ch]
            if _i < len(_regs):
                _reg = _regs[_i]
                _row.extend([_reg, _dict[_reg]])
            else:
                # this channel has no region left for this round
                _row.extend(['', ''])
        csvwriter.writerow(_row)


## save color_usage for analysis

In [37]:
import csv

# Channels that get fixed labels rather than region ids: '488' is labelled
# 'beads' in every row, '405' is labelled 'DAPI' in the reference row only.
drift_channel = '488'
dapi_channel = '405'
# Channels that get a '<label>_chrom' entry in the reference row.
chrom_labels = {'750': 'forward',
                '647': 'reverse',}

save_folder = r'\\10.245.74.158\Chromatin_NAS_6\20200807-B_Dox-IAA-STI+_CTP-08_IgH\Analysis'

with open(os.path.join(save_folder, 'Color_Usage.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)

    # header: 'Hyb', one column per readout channel, then drift and DAPI channels
    _header = ['Hyb'] + [str(_ch) for _ch in readout_by_channel]
    _header += [drift_channel, dapi_channel]
    print(_header)
    csvwriter.writerow(_header)

    # reference frame row
    _ref_row = ['H0R0']
    for _ch in readout_by_channel:
        _label = chrom_labels.get(str(_ch))
        _ref_row.append("" if _label is None else _label+'_chrom')
    _ref_row += ['beads', 'DAPI']
    print(_ref_row)
    csvwriter.writerow(_ref_row)

    # region ids of every channel in ascending order, sorted once up front
    _regions_by_channel = [sorted(_dict) for _dict in readout_by_channel.values()]

    # one row per hybridization round: 'u<region>' per channel, then 'beads'
    for _i in range(max(dict_sizes)):
        _hyb = _i + 1
        _row = [f"H{_hyb}R{_hyb}"]
        for _regs in _regions_by_channel:
            # channels that ran out of regions get an empty cell
            _row.append(f"u{_regs[_i]}" if _i < len(_regs) else '')
        _row.append("beads")
        print(_row)
        csvwriter.writerow(_row)


['Hyb', '750', '647', '561', '488', '405']
['H0R0', 'forward_chrom', 'reverse_chrom', '', 'beads', 'DAPI']
['H1R1', 'u41', 'u42', 'u43', 'beads']
['H2R2', 'u44', 'u45', 'u46', 'beads']
['H3R3', 'u47', 'u48', 'u49', 'beads']
['H4R4', 'u50', 'u51', 'u52', 'beads']
['H5R5', 'u53', 'u54', 'u55', 'beads']
['H6R6', 'u56', 'u57', 'u59', 'beads']
['H7R7', 'u60', 'u61', 'u62', 'beads']
['H8R8', 'u63', 'u64', 'u65', 'beads']
['H9R9', 'u66', 'u67', 'u68', 'beads']
['H10R10', 'u69', 'u70', 'u71', 'beads']
['H11R11', 'u72', 'u73', 'u74', 'beads']
['H12R12', 'u75', 'u76', 'u77', 'beads']
['H13R13', 'u78', 'u79', 'u80', 'beads']
['H14R14', 'u81', 'u82', 'u83', 'beads']
['H15R15', 'u84', 'u85', 'u86', 'beads']
['H16R16', 'u87', 'u88', 'u89', 'beads']
['H17R17', 'u90', 'u91', 'u92', 'beads']
['H18R18', 'u93', 'u94', 'u95', 'beads']
['H19R19', 'u96', 'u97', 'u98', 'beads']
['H20R20', 'u99', 'u100', 'u101', 'beads']
['H21R21', 'u102', 'u103', 'u104', 'beads']
['H22R22', 'u105', 'u106', 'u107', 'beads']
[

In [24]:
import csv

# Same layout as Color_Usage.csv, but each region cell holds the number of
# probes of that region (looked up in reg_size_dict) instead of 'u<region>'.
drift_channel = '488'
dapi_channel = '405'
chrom_labels = {'750': 'forward',
                '647': 'reverse',}

save_folder = r'\\10.245.74.158\Chromatin_NAS_6\20200807-B_Dox-IAA-STI+_CTP-08_IgH\Analysis'

with open(os.path.join(save_folder, 'Region_probe_num.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)

    # header: 'Hyb', one column per readout channel, then drift and DAPI channels
    _header = ['Hyb']
    _header.extend(str(_ch) for _ch in readout_by_channel)
    _header.extend([drift_channel, dapi_channel])
    print(_header)
    csvwriter.writerow(_header)

    # reference frame row
    _ref_row = ['H0R0']
    for _ch in readout_by_channel:
        _key = str(_ch)
        _ref_row.append(chrom_labels[_key]+'_chrom' if _key in chrom_labels else "")
    _ref_row.extend(['beads', 'DAPI'])
    print(_ref_row)
    csvwriter.writerow(_ref_row)

    # region ids of every channel in ascending order, sorted once up front
    _ordered_regions = [sorted(_dict) for _dict in readout_by_channel.values()]

    # one row per hybridization round: probe count per channel, then 'beads'
    for _i in range(max(dict_sizes)):
        _row = [f"H{_i+1}R{_i+1}"]
        for _regs in _ordered_regions:
            if _i < len(_regs):
                # reg_size_dict is keyed by the region id as a string
                _row.append(reg_size_dict[str(_regs[_i])])
            else:
                _row.append('')
        _row.append("beads")
        print(_row)
        csvwriter.writerow(_row)

['Hyb', '750', '647', '561', '488', '405']
['H0R0', 'forward_chrom', 'reverse_chrom', '', 'beads', 'DAPI']
['H1R1', 100, 83, 100, 'beads']
['H2R2', 70, 61, 16, 'beads']
['H3R3', 55, 29, 26, 'beads']
['H4R4', 20, 21, 30, 'beads']
['H5R5', 45, 84, 50, 'beads']
['H6R6', 31, 4, 15, 'beads']
['H7R7', 32, 28, 50, 'beads']
['H8R8', 69, 27, 3, 'beads']
['H9R9', 57, 53, 29, 'beads']
['H10R10', 21, 26, 36, 'beads']
['H11R11', 57, 53, 23, 'beads']
['H12R12', 36, 18, 33, 'beads']
['H13R13', 57, 47, 40, 'beads']
['H14R14', 39, 35, 40, 'beads']
['H15R15', 31, 10, 5, 'beads']
['H16R16', 51, 40, 30, 'beads']
['H17R17', 23, 16, 14, 'beads']
['H18R18', 6, 32, 25, 'beads']
['H19R19', 55, 51, 18, 'beads']
['H20R20', 9, 29, 53, 'beads']
['H21R21', 58, 45, 66, 'beads']
['H22R22', 73, 38, 50, 'beads']
['H23R23', 45, 1, 36, 'beads']
['H24R24', 49, 43, 55, 'beads']
['H25R25', 48, 11, 51, 'beads']
['H26R26', 25, 36, 10, 'beads']
['H27R27', 17, 35, 12, 'beads']
['H28R28', 3, 12, 11, 'beads']
['H29R29', 11, 2, 42

In [21]:
# Display the probe count of every region.
reg_size_dict

{'41': 100,
 '42': 83,
 '43': 100,
 '44': 70,
 '45': 61,
 '46': 16,
 '47': 55,
 '48': 29,
 '49': 26,
 '50': 20,
 '51': 21,
 '52': 30,
 '53': 45,
 '54': 84,
 '55': 50,
 '56': 31,
 '57': 4,
 '59': 15,
 '60': 32,
 '61': 28,
 '62': 50,
 '63': 69,
 '64': 27,
 '65': 3,
 '66': 57,
 '67': 53,
 '68': 29,
 '69': 21,
 '70': 26,
 '71': 36,
 '72': 57,
 '73': 53,
 '74': 23,
 '75': 36,
 '76': 18,
 '77': 33,
 '78': 57,
 '79': 47,
 '80': 40,
 '81': 39,
 '82': 35,
 '83': 40,
 '84': 31,
 '85': 10,
 '86': 5,
 '87': 51,
 '88': 40,
 '89': 30,
 '90': 23,
 '91': 16,
 '92': 14,
 '93': 6,
 '94': 32,
 '95': 25,
 '96': 55,
 '97': 51,
 '98': 18,
 '99': 9,
 '100': 29,
 '101': 53,
 '102': 58,
 '103': 45,
 '104': 66,
 '105': 73,
 '106': 38,
 '107': 50,
 '108': 45,
 '109': 1,
 '110': 36,
 '111': 49,
 '112': 43,
 '113': 55,
 '114': 48,
 '115': 11,
 '321': 36,
 '323': 25,
 '324': 35,
 '325': 51,
 '326': 17,
 '327': 12,
 '328': 10,
 '329': 3,
 '330': 2,
 '331': 12,
 '332': 11,
 '333': 14,
 '334': 11,
 '335': 1,
 '337': 2

In [20]:
# Print the region -> readout dict of the first channel only (nothing when empty).
_first_item = next(iter(readout_by_channel.items()), None)
if _first_item is not None:
    _ch, _dict = _first_item
    print(_dict)

{41: 'Stv_19', 44: 'Stv_20', 47: 'Stv_21', 50: 'Stv_22', 53: 'Stv_23', 56: 'Stv_25', 60: 'Stv_26', 63: 'Stv_27', 66: 'Stv_28', 69: 'Stv_29', 72: 'Stv_30', 75: 'Stv_31', 78: 'NDB_1', 81: 'NDB_4', 84: 'NDB_7', 87: 'NDB_10', 90: 'NDB_13', 93: 'NDB_16', 96: 'NDB_19', 99: 'NDB_22', 102: 'NDB_25', 105: 'NDB_28', 108: 'NDB_31', 111: 'NDB_34', 114: 'NDB_37', 323: 'NDB_241', 326: 'NDB_244', 329: 'NDB_247', 332: 'NDB_250', 335: 'NDB_253', 339: 'NDB_256', 342: 'NDB_259', 345: 'NDB_262', 348: 'NDB_265', 351: 'NDB_268', 354: 'NDB_271', 357: 'NDB_274', 360: 'NDB_277', 363: 'NDB_280', 366: 'NDB_283', 369: 'NDB_286', 372: 'NDB_289', 375: 'NDB_292', 379: 'NDB_295', 382: 'NDB_298', 388: 'NDB_304', 391: 'NDB_307', 394: 'NDB_310'}


## save region_positions for analysis

In [11]:
import csv

# Write one line per region with its chromosome, start, end and midpoint.
with open(os.path.join(save_folder, 'Region_Positions.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # header
    csvwriter.writerow(['region', 'chr', 'start', 'end', 'midpoint',])
    # data rows, in the iteration order of region_info
    csvwriter.writerows(
        [_region, _info['chr'], _info['start'], _info['end'], _info['mid']]
        for _region, _info in region_info.items()
    )