In [1]:
import pandas as pd
import numpy as np
import os, sys, time
import re

# bioPython
from Bio.Seq import Seq

In [2]:
# Read the source probe table from the shared Excel workbook
probe_filename = r'/lab/solexa_weissman/puzheng/MERFISH_Probes/PE_TS/PL70-76_amplifier_pools_bits_pegfish_2lvl.xlsx'
probe_df = pd.read_excel(io=probe_filename)

In [3]:
# Distinct pool names and the number of probe rows in each
np.unique(probe_df['Pool name'].values, return_counts=True)


(array(['PL70_combined_bit_amps_15', 'PL71_combined_bit_amps_20',
        'PL72_emx1_bit_350', 'PL73_hek3_bit_350', 'PL74_rnf2_bit_350',
        'PL75_combined_bit_amps_20_btree_lvl1',
        'PL76_combined_bit_amps_20_btree_lvl2'], dtype=object),
 array([120, 120,  21,  21,  21, 120, 144]))

## Design Hek3 and Rnf2 probes

In [4]:
# Pools whose probes are rebuilt with a variable number of readout-site copies
sel_pools = ['PL73_hek3_bit_350', 'PL74_rnf2_bit_350']
readout_copy_nums = [4, 8, 13]  # one output pool is produced per copy number
T7_promoter = Seq('TAATACGACTCACTATAGGG')
T7_promoter_rc = str(T7_promoter.reverse_complement())
readout_size = 20  # number of leading lowercase bases kept as the readout site

# assemble
pool_2_seqs = {}  # "<pool prefix>-<n>_readouts" -> list of assembled sequences
readout_site_dict = {'name': [], 'seq': []}  # one (pool prefix, readout site) pair per source probe

for sel_pool in sel_pools:
    # Pool prefix without the '_bit_350' suffix, e.g. 'PL73_hek3'
    pool_prefix = sel_pool.split('_bit_350')[0]
    sel_probe_df = probe_df.loc[probe_df['Pool name'] == sel_pool]
    for seq in sel_probe_df['Sequence'].values:
        # Parse: the sequence must contain exactly two uppercase runs
        # (otherwise the unpacking below raises ValueError).
        fwd_seq, rev_primer_rc = re.findall('[ATCG]+', seq)
        # The first uppercase run is split at len(rev_primer_rc); this treats
        # the forward primer as having the same length as the reverse primer.
        fwd_primer = fwd_seq[:len(rev_primer_rc)]
        target_seq = fwd_seq[len(rev_primer_rc):]

        # The first lowercase run carries the readout; keep its first readout_size bases
        readout_seq = re.findall('[atcg]+', seq)[0]
        readout_site = readout_seq[:readout_size]

        # Record name and sequence in their own columns
        # (the sequence was previously appended to 'name' by mistake).
        readout_site_dict['name'].append(pool_prefix)
        readout_site_dict['seq'].append(readout_site)

        for _n in readout_copy_nums:
            _pool_name = f"{pool_prefix}-{_n}_readouts"
            # Layout: fwd primer | n x readout site | target | rev primer (rc) | T7 promoter (rc)
            _final_seq_list = [fwd_primer] + [readout_site]*_n + [target_seq] + [rev_primer_rc] + [T7_promoter_rc]
            _final_seq = ''.join(_final_seq_list)
            # append
            pool_2_seqs.setdefault(_pool_name, []).append(_final_seq)

In [5]:
# Show the generated pool names (one key per selected pool and readout copy number)
pool_2_seqs.keys()

dict_keys(['PL73_hek3-4_readouts', 'PL73_hek3-8_readouts', 'PL73_hek3-13_readouts', 'PL74_rnf2-4_readouts', 'PL74_rnf2-8_readouts', 'PL74_rnf2-13_readouts'])

In [6]:
# Flatten the pool -> sequences mapping into a two-column table,
# repeating each pool name once per sequence it contains.
summary_dict = {
    'Pool name': [_name for _name, _pool_seqs in pool_2_seqs.items() for _unused in _pool_seqs],
    'Sequence': [_one_seq for _pool_seqs in pool_2_seqs.values() for _one_seq in _pool_seqs],
}
summary_df = pd.DataFrame(summary_dict)

In [7]:
# Display the assembled table (columns: 'Pool name', 'Sequence')
summary_df

Unnamed: 0,Pool name,Sequence
0,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgaggcggattgagattcggtg...
1,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGcgatggtcgtcctcgtttcgc...
2,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgtttgcgtgtaatcgactctg...
3,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgccgtcgtcacgtgcgagtag...
4,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgatgcctcttcgatagattcg...
...,...,...
121,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGggcactaggataactttaggg...
122,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgtccatgatacgaggtgatag...
123,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgcacgtatgtcccgtccattg...
124,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGaagggcgatgtaacggcgcaa...


In [9]:
# save
overwrite = True  # when True, an existing output file is replaced
save_folder = r'/lab/solexa_weissman/puzheng/MERFISH_Probes/PE_LT/Edits'
# exist_ok avoids the race between a separate existence check and the creation
os.makedirs(save_folder, exist_ok=True)
# Read the clock once so year/month/day always describe the same instant
_save_time = time.localtime()
save_probe_filename = os.path.join(
    save_folder,
    f"{_save_time.tm_year}_{_save_time.tm_mon}_{_save_time.tm_mday}_variable_length_PL72.xlsx",
)

if overwrite or not os.path.exists(save_probe_filename):
    print(f"saving probes to file: {save_probe_filename}")
    # index=False: do not write the row index as a column
    summary_df.to_excel(save_probe_filename, index=False)

saving probes to file: /lab/solexa_weissman/puzheng/MERFISH_Probes/PE_LT/Edits/2023_8_27_variable_length_PL72.xlsx


In [10]:
# primers
# fwd_primer and rev_primer_rc are the values left over from the last probe
# parsed in the assembly loop -- presumably all probes share the same primers;
# TODO confirm.
_t7_rev_primer = str(Seq(rev_primer_rc + T7_promoter_rc).reverse_complement())
_primer_names = ['PL72_fwd', 'PL72_T7_rev', 'PL72_T7']
_primer_seqs = [fwd_primer, _t7_rev_primer, str(T7_promoter)]
primer_dict = {
    "Name": _primer_names,
    "Sequence": _primer_seqs,
    "Scale": ['25nm'] * len(_primer_names),
    "Purification": ['STD'] * len(_primer_names),
}
primer_df = pd.DataFrame(primer_dict)
primer_df

Unnamed: 0,Name,Sequence,Scale,Purification
0,PL72_fwd,CGCGCGCCTTTGGCGGGAAGTCCTG,25nm,STD
1,PL72_T7_rev,TAATACGACTCACTATAGGGAGTCGCATGCCGTGGCCGGCGACTT,25nm,STD
2,PL72_T7,TAATACGACTCACTATAGGG,25nm,STD


In [11]:
# Save the primer table next to the probe file (uses save_folder / overwrite set earlier).
# Read the clock once so year/month/day always describe the same instant.
_primer_save_time = time.localtime()
save_primer_filename = os.path.join(
    save_folder,
    f"{_primer_save_time.tm_year}_{_primer_save_time.tm_mon}_{_primer_save_time.tm_mday}_Primers_PL72.xlsx",
)

if overwrite or not os.path.exists(save_primer_filename):
    print(f"saving primers to file: {save_primer_filename}")
    # index=False: do not write the row index as a column
    primer_df.to_excel(save_primer_filename, index=False)

saving primers to file: /lab/solexa_weissman/puzheng/MERFISH_Probes/PE_LT/Edits/2023_8_27_Primers_PL72.xlsx


# Check readouts

In [17]:
# Length of the sequence window treated as one readout
readout_len = 20

# Load the reference table of known readouts
readout_folder = r'/lab/solexa_weissman/puzheng/References/Readouts'
readout_reference = os.path.join(readout_folder, 'Readout_summary.xlsx')
readout_table = pd.read_excel(io=readout_reference)

In [27]:
# Preview the table that is about to be annotated. annotated_summary_df is only
# created in a later cell, so referencing it here fails on a fresh top-to-bottom run.
summary_df

Unnamed: 0,Pool name,Sequence
0,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgaggcggattgagattcggtg...
1,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGcgatggtcgtcctcgtttcgc...
2,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgtttgcgtgtaatcgactctg...
3,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgccgtcgtcacgtgcgagtag...
4,PL73_hek3-4_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgatgcctcttcgatagattcg...
...,...,...
121,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGggcactaggataactttaggg...
122,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgtccatgatacgaggtgatag...
123,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGgcacgtatgtcccgtccattg...
124,PL74_rnf2-13_readouts,CGCGCGCCTTTGGCGGGAAGTCCTGaagggcgatgtaacggcgcaa...


In [29]:
# Annotate every probe with the reference readout found inside its sequence.
annotated_summary_df = summary_df.copy()

# Build the target -> name lookup once instead of scanning the reference table
# for every window. setdefault keeps the first table row for a duplicated
# target, which matches taking `.values[0]` of the filtered rows.
_target_2_name = {}
for _ref_target, _ref_name in zip(readout_table['Target'].values, readout_table['Name'].values):
    _target_2_name.setdefault(_ref_target, _ref_name)

probe_bits_list = []
for _seq in annotated_summary_df['Sequence']:
    # Windows are compared in upper case (the readout part of a probe is lowercase)
    _seq_upper = _seq.upper()
    _seq_bits = []
    # Slide a readout_len-wide window over the sequence, one base at a time
    for _i in range(0, len(_seq_upper) - readout_len + 1):
        _target = _seq_upper[_i:_i + readout_len]
        if _target in _target_2_name:
            _seq_bits.append(_target_2_name[_target])
    probe_bits_list.append(_seq_bits)

# Keep the first readout name in sorted order for each probe. A probe with no
# match gets None instead of raising IndexError on an empty result.
annotated_summary_df.loc[:, 'readout'] = [
    np.unique(_rds)[0] if len(_rds) > 0 else None
    for _rds in probe_bits_list
]

_n_unmatched = sum(1 for _rds in probe_bits_list if len(_rds) == 0)
if _n_unmatched > 0:
    print(f"warning: {_n_unmatched} probes matched no reference readout")

## save

In [30]:
# Write the annotated probe table into the save folder, without the row index
_annotated_filename = os.path.join(save_folder, 'PL72_Hek3_Rnf2_annotated.xlsx')
annotated_summary_df.to_excel(_annotated_filename, index=None)