# create reference files for adding adaptors and analysis

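for library SI16 (chr21:34660000-36140000 at 5 kb resolution; this line originally read "CTP-08 IgH (batch1)", which does not match the paths and probe names used below)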

by Pu Zheng

2020.07.29

In [1]:
# Run the personal startup script in this kernel.
# NOTE(review): `sys` (next line) and `os` (last line) are not imported in this cell;
# presumably they are provided by this script or by the wildcard import below - confirm.
%run "E:\Users\puzheng\Documents\Startup_py3.py"
# Add the Documents folder to the module search path
# (presumably where ImageAnalysis3 is located - TODO confirm).
sys.path.append(r"E:\Users\puzheng\Documents")

import ImageAnalysis3 as ia
# Select the interactive notebook backend for matplotlib figures.
%matplotlib notebook

# Wildcard import: brings every public name of ImageAnalysis3 into the notebook namespace.
from ImageAnalysis3 import *
# Print the process id of this kernel.
print(os.getpid())

25832


In [2]:
import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

In [3]:
# FASTA file with the probes of the SI16 library (network share)
probe_filename = r'\\10.245.74.69\Chromatin_NAS_2\Libraries\SI16\SI16.fasta'
# abort right away when the probe file is not a regular file that can be reached
_probe_file_found = os.path.isfile(probe_filename)
if not _probe_file_found:
    raise IOError(f"input probe file: {probe_filename} doesn't exist.")


In [4]:
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

In [12]:
# collect every probe record whose id carries the chr21 region name
pb_records = []
with open(probe_filename, 'r') as _fasta_handle:
    pb_records.extend(
        _rec for _rec in SeqIO.parse(_fasta_handle, "fasta")
        if 'chr21_34660000_36140000_res5000' in _rec.id
    )

In [13]:
# show the id of the first selected probe to see how the fields are laid out
pb_records[0].id

'chr21_34660000_36140000_res5000_reg0_pb_0__A:W1A09__B:W1A06__R:NDB_784'

In [15]:
# split the id on 'reg': the second piece starts with the region index
pb_records[0].id.split('reg')

['chr21_34660000_36140000_res5000_', '0_pb_0__A:W1A09__B:W1A06__R:NDB_784']

In [16]:
# map region index -> readout name, keeping the readout of the first probe seen per region
readout_names = {}
for _record in pb_records:
    # the region index is the number directly after 'reg' in the probe id
    _region_id = int(_record.id.split('reg')[1].split('_')[0])
    # the readout name is everything after the 'R:' tag
    _readout = _record.id.split("R:")[1]
    # setdefault leaves an already registered region untouched
    readout_names.setdefault(_region_id, _readout)

In [27]:
# extract region info
_reg_info = pb_records[0].id.split('_res')[0].split('chr')[1]
_resolution = int(pb_records[0].id.split('_res')[1].split('_')[0])
print(_reg_info, _resolution)
_chr, _reg_start, _reg_end = _reg_info.split('_')
_chr_name = 'chr'+_chr
_reg_start = int(_reg_start)
_reg_end = int(_reg_end)

region_info = {}
for _r in pb_records:
    _rid = int(_r.id.split('reg')[1].split('_')[0])
    
    if _rid not in region_info:
        _start = _reg_start + _rid * _resolution
        _end = _start + _resolution
        _mid = int((_start+_end)/2)

        region_info[_rid] = {'chr': 'chr'+_chr,
                             'start': _start,
                             'end': _end,
                             'mid': _mid,
                            }

21_34660000_36140000 5000


In [28]:
# display the region table (region index -> chr / start / end / mid) built above
region_info

{0: {'chr': 'chr21', 'start': 34660000, 'end': 34665000, 'mid': 34662500},
 1: {'chr': 'chr21', 'start': 34665000, 'end': 34670000, 'mid': 34667500},
 2: {'chr': 'chr21', 'start': 34670000, 'end': 34675000, 'mid': 34672500},
 3: {'chr': 'chr21', 'start': 34675000, 'end': 34680000, 'mid': 34677500},
 4: {'chr': 'chr21', 'start': 34680000, 'end': 34685000, 'mid': 34682500},
 5: {'chr': 'chr21', 'start': 34685000, 'end': 34690000, 'mid': 34687500},
 6: {'chr': 'chr21', 'start': 34690000, 'end': 34695000, 'mid': 34692500},
 7: {'chr': 'chr21', 'start': 34695000, 'end': 34700000, 'mid': 34697500},
 8: {'chr': 'chr21', 'start': 34700000, 'end': 34705000, 'mid': 34702500},
 9: {'chr': 'chr21', 'start': 34705000, 'end': 34710000, 'mid': 34707500},
 10: {'chr': 'chr21', 'start': 34710000, 'end': 34715000, 'mid': 34712500},
 11: {'chr': 'chr21', 'start': 34715000, 'end': 34720000, 'mid': 34717500},
 12: {'chr': 'chr21', 'start': 34720000, 'end': 34725000, 'mid': 34722500},
 13: {'chr': 'chr21', 

## save adaptor_sequences.csv for adding adaptors

In [29]:
import csv

# folder that receives the reference csv files of this library
save_folder = r'\\10.245.74.69\Chromatin_NAS_2\Libraries\SI16\color_usage_info'

# one region per imaging channel (750, 647, 561) in every hyb
_num_colors = 3
# the hyb counter restarts and the group counter advances after this many hybs
_hybs_per_group = 32

with open(os.path.join(save_folder, 'adaptor_sequences.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    csvwriter.writerow(['group', 'hyb', 
                        '750_region', '750_readout',
                        '647_region', '647_readout',
                        '561_region', '561_readout',])
    
    chr_ids = np.array(list(readout_names.keys()))

    # Walk through the regions in steps of _num_colors. Stepping over the ids
    # (instead of looping over int(len/3) full triplets) keeps the last 1-2 regions
    # when the number of regions is not a multiple of three.
    for _i, _first in enumerate(range(0, len(chr_ids), _num_colors)):
        _ids = chr_ids[_first:_first+_num_colors]
        # both counters are 1-based in the file
        _group, _hyb = divmod(_i, _hybs_per_group)
        _row = [_group+1, _hyb+1]
        for _id in _ids:
            _row.extend([_id, readout_names[_id]])
        # a partial last hyb gets empty region/readout cells for its unused channels
        _row.extend([''] * (2 * (_num_colors - len(_ids))))
        csvwriter.writerow(_row)

## save Color_Usage.csv for analysis

In [30]:
import csv

# one region per imaging channel (750, 647, 561) in every hyb
_num_colors = 3

with open(os.path.join(save_folder, 'Color_Usage.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    csvwriter.writerow(['Hyb', '750', '647', '561', '488', '405',])
    
    # write reference frame
    csvwriter.writerow(['H0R0', 'forward_chrom', 'reverse_chrom', '', 'beads', 'DAPI',])
    
    chr_ids = np.array(list(readout_names.keys()))
    # Walk through the regions in steps of _num_colors. Stepping over the ids
    # (instead of looping over int(len/3) full triplets) keeps the last 1-2 regions
    # when the number of regions is not a multiple of three.
    for _i, _first in enumerate(range(0, len(chr_ids), _num_colors)):
        _ids = chr_ids[_first:_first+_num_colors]
        _channels = [f"u{_id}" for _id in _ids]
        # unused channels of a partial last hyb stay empty, like the empty 561 cell of H0R0
        _channels += [''] * (_num_colors - len(_ids))
        # hyb and readout round share the same 1-based counter; 488 holds beads, 405 is empty
        csvwriter.writerow([f"H{_i+1}R{_i+1}"] + _channels + ["beads", ""])
        

## save Region_Positions.csv for analysis

In [31]:
import csv

# one line per region: index, chromosome, start, end and midpoint
_region_csv = os.path.join(save_folder, 'Region_Positions.csv')
with open(_region_csv, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    csvwriter.writerow(['region', 'chr', 'start', 'end', 'midpoint',])
    
    # write all regions in the order they are stored in region_info
    csvwriter.writerows(
        [_reg_id, _entry['chr'], _entry['start'], _entry['end'], _entry['mid']]
        for _reg_id, _entry in region_info.items()
    )