In [2]:
# Env
import os
import pandas as pd
import numpy as np
from pathlib import Path


# Run configuration shared by every cell below.
# Raw string: the Windows path contains "\s" and "\d", which are invalid escape
# sequences in a normal string literal (a SyntaxWarning on recent Python versions).
DATASET_DIR = Path(r'F:\spatial_data\dataset_probe')
RUNID = '20240808_C.Y_VirusInMigrasomes'
# Folder holding this run's input spreadsheets and the generated probe table.
workdir = DATASET_DIR / RUNID
# Barcode panel; the cells below branch on 'PRISM' and 'SPRINTseq'.
PANEL = 'PRISM'
# Only 'mouse' and 'human' trigger gene-name case normalisation in the loader cells.
organism = 'VSIV'

In [None]:
# Load the gene panel sheet for this run; the first spreadsheet column becomes the index.
gene_info = pd.read_excel(
    os.path.join(workdir, "list_in_situ_imaging_2024.6.24.xlsx"),
    index_col=0,
)

# Case-normalise gene names: capitalised for mouse, upper-case for human.
# Any other organism keeps the names exactly as they are in the sheet.
if organism == 'mouse':
    gene_info['name'] = gene_info['name'].str.capitalize()
elif organism == 'human':
    gene_info['name'] = gene_info['name'].str.upper()

# Unique names in order of first appearance, dropping entries equal to 0 and
# trimming surrounding whitespace.
# NOTE(review): a NaN name would pass the `!= 0` test and fail on .strip() - confirm the sheet has none.
gene_list = [name.strip() for name in gene_info['name'].unique() if name != 0]
print(len(gene_list))
gene_info.head()

# Binding sites

In [None]:
# Load the candidate binding sites for this run; the first spreadsheet column becomes the index.
binding_df = pd.read_excel(
    DATASET_DIR.joinpath(RUNID, "gene_binding_site.xlsx"),
    index_col=0,
)

# Apply the same per-organism casing used for the gene list so names compare equal.
if organism == 'mouse':
    binding_df['gene_name'] = binding_df['gene_name'].str.capitalize()
elif organism == 'human':
    binding_df['gene_name'] = binding_df['gene_name'].str.upper()

# Keep only binding sites whose gene is part of the panel's gene list.
binding_df = binding_df.loc[binding_df['gene_name'].isin(gene_list)]
print(binding_df.shape[0])
binding_df.head()

In [20]:
# Load the designed binding sites for this run; the first spreadsheet column becomes the index.
binding_df = pd.read_excel(workdir/'binding_site_info.xlsx', index_col=0)
# Rename only the exact 'gene' column. A substring replace over every column name
# would also rewrite any other column that merely contains "gene", and would fail
# on non-string column labels.
binding_df = binding_df.rename(columns={'gene': 'gene_name'})
# Full binding site = left arm followed by right arm.
binding_df['binding'] = binding_df['binding_left'] + binding_df['binding_right']
binding_df

Unnamed: 0_level_0,gene_name,5'gap_pos,binding_left,binding_right,Tm_left,Tm_right,Tm,specificity,perform_minus,strand,seq,binding
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,VSIV_N_target_gnm,125,GGTATTCCACTGGATCCTCA,TTTGCAGGAAGTTTTGGAAC,53.5,53.0,66.0,,,,,GGTATTCCACTGGATCCTCATTTGCAGGAAGTTTTGGAAC
2,VSIV_N_target_gnm,611,TACTGTCATTTCCCCACACA,TCAAAAATGTCACGACCTTC,54.0,52.5,66.0,,,,,TACTGTCATTTCCCCACACATCAAAAATGTCACGACCTTC
3,VSIV_N_target_gnm,1194,TCTTCAAACCATCCGAGCCA,TTCGACCACATCTCTGCCTT,57.0,57.5,70.0,,,,,TCTTCAAACCATCCGAGCCATTCGACCACATCTCTGCCTT
4,VSIV_N_target_mRNA,383,TACAATCGGAATATTTGACC,TTGTATCCTTGAAAGCCCTG,48.5,52.5,64.0,,,,,TACAATCGGAATATTTGACCTTGTATCCTTGAAAGCCCTG
5,VSIV_N_target_mRNA,929,ATTGTCTTCTAAGTCTCCAT,ATTCTTCCGTCAAAAACCCT,50.0,54.5,64.5,,,,,ATTGTCTTCTAAGTCTCCATATTCTTCCGTCAAAAACCCT
6,VSIV_N_target_mRNA,1298,AGGCCTAAGAGAGAAGACAA,TTGGCAAGTATGCTAAGTCA,53.5,51.0,66.0,,,,,AGGCCTAAGAGAGAAGACAATTGGCAAGTATGCTAAGTCA
7,VSIV_NS-P_target_gnm,1466,GTTGTGCTTCGATCTCATCT,ATCTCTCCTACCGCCTGATC,54.5,56.5,67.0,,,,,GTTGTGCTTCGATCTCATCTATCTCTCCTACCGCCTGATC
8,VSIV_NS-P_target_gnm,1869,CCTTCTCCCGATGCTTCAAA,TGTGCACTCTGCCAGATTCC,55.0,58.0,70.0,,,,,CCTTCTCCCGATGCTTCAAATGTGCACTCTGCCAGATTCC
9,VSIV_NS-P_target_gnm,2073,GAGATGAACTCTCCTCTAGA,TGAGAACAATTCATCCAAGG,50.5,52.0,64.0,,,,,GAGATGAACTCTCCTCTAGATGAGAACAATTCATCCAAGG
10,VSIV_NS-P_target_mRNA,1704,GATGAGGAAGTGGATGTTGT,ATTTACTTCGGACTGGAAAC,54.0,52.0,65.0,,,,,GATGAGGAAGTGGATGTTGTATTTACTTCGGACTGGAAAC


# Barcode_df

In [7]:
# Start from an empty probe table whichever panel is selected, so later cells
# never depend on which branch ran here.
probe_df = pd.DataFrame()

# Load the barcode table of the selected panel; the first spreadsheet column becomes the index.
if PANEL == 'PRISM':
    barcode_df = pd.read_excel(DATASET_DIR / "PRISM_30plex_barcode.xlsx", index_col=0)[['Barcode (82bp)']]
elif PANEL == 'SPRINTseq':
    barcode_df = pd.read_excel(DATASET_DIR / "SPRINTSEQ_369_barcode.xlsx", index_col=0)[['Barcode sequence']]
    primer_l = 'TCCCTACACGACGCTCTTCCGATCT'
    primer_r = 'CATTCCTGCTGAACCGCTCTTCCGA'
    # Full barcode = left primer + barcode + right primer + barcode (the barcode appears twice).
    # NOTE(review): the "70bp" in the column name is not checked here - confirm the resulting length.
    barcode_df['Barcode(70bp)'] = primer_l + barcode_df['Barcode sequence'] + primer_r + barcode_df['Barcode sequence']
else:
    # Fail with a clear message instead of a NameError on `barcode_df` below.
    raise ValueError(f"Unsupported PANEL {PANEL!r}; expected 'PRISM' or 'SPRINTseq'.")
barcode_df.head()

Unnamed: 0,Barcode (82bp)
Prism_1,TCACACGACGCTCTTCCGATCTCGCAGCAGATAAATGAGCCATCTC...
Prism_2,TCACACGACGCTCTTCCGATCTCCTCAATGCTGCTGCTGTACTCTC...
Prism_3,TCACACGACGCTCTTCCGATCTGATTGCACATGTATGGCACCTCTC...
Prism_4,TCACACGACGCTCTTCCGATCTAGTAGCCGTGACTATCGACTTCTC...
Prism_5,TCACACGACGCTCTTCCGATCTTGCGTCTATTTAGTGGAGCCTCTC...


# Probe stitching

## Directly combine all binding sites

In [None]:
# Build one PRISM probe per binding site: the i-th row of binding_df is paired
# with barcode Prism_{prism_list[i]}. zip() stops at the shorter of the two, so
# at most len(prism_list) binding sites receive a probe.
# Alternative that skips some barcodes:
# prism_list = [_ for _ in range(1, 31) if _ not in [1,5,9,]]
prism_list = list(range(1, 31))

probe_records = []
# The loader cells name the gene column 'gene_name' (there is no 'gene' column).
for num, (prism, gene) in enumerate(zip(prism_list, binding_df["gene_name"].tolist())):
    # Positional access: binding_df keeps the spreadsheet's own index labels
    # (starting at 1 in the loaded table), so label lookup with the 0-based
    # loop counter would miss or misalign rows.
    binding = binding_df["binding"].iloc[num]
    binding_l = binding_df["binding_left"].iloc[num]
    binding_r = binding_df["binding_right"].iloc[num]

    barcode = barcode_df.loc[f"Prism_{prism}", "Barcode (82bp)"]
    # Probe layout: right arm (lower case) + barcode (upper case) + left arm (lower case).
    probe = binding_r.lower() + barcode.upper() + binding_l.lower()

    probe_records.append({
        "PRISM": f"PRISM_{prism}",
        "gene": f'{gene}',
        "probe": probe,
        "barcode": barcode,
        "binding": binding,
    })

# Build the table once from the collected rows; explicit columns keep
# set_index() working even when there are no binding sites.
probe_df = pd.DataFrame(
    probe_records,
    columns=["PRISM", "gene", "probe", "barcode", "binding"],
).set_index('PRISM')
probe_df.head()

## Select the middle binding site for genes with multiple binding sites

In [None]:
# For every gene keep a single binding site: the middle row of that gene's group.
# Groups are taken in order of first appearance (sort=False); for a group of
# size n the row at position n // 2 is kept.
middle_parts = [
    # Double brackets keep a one-row DataFrame (with its original index label).
    group.iloc[[len(group) // 2]]
    for _, group in binding_df.groupby('gene_name', sort=False)
]

# Concatenate once instead of growing a frame inside the loop; with no groups,
# fall back to an empty slice so the expected columns are still present.
middle_rows = pd.concat(middle_parts) if middle_parts else binding_df.iloc[0:0]

# NOTE: this replaces binding_df with the reduced table. Reload the binding
# sites before running a cell that needs every site of each gene.
binding_df = middle_rows.copy()
print(len(binding_df))
binding_df.head()

In [None]:
# Build one probe per row of binding_df (one row per gene after the middle-site
# selection), pairing the i-th row with barcode number i + 1.

# Column layout and probe-name prefix differ between the two panels.
if PANEL == 'PRISM':
    id_col, name_prefix = 'PRISM', 'PRISM'
    seq_cols = ['probe_seq', 'barcode_seq', 'binding_seq']
elif PANEL == 'SPRINTseq':
    id_col, name_prefix = 'SPRINTseq', 'Seq'
    seq_cols = ['probe', 'barcode', 'binding']
else:
    # Fail with a clear message instead of a NameError inside the loop.
    raise ValueError(f"Unsupported PANEL {PANEL!r}; expected 'PRISM' or 'SPRINTseq'.")

# Barcode numbers 1..30. zip() below stops at the shorter input, so at most
# 30 rows of binding_df receive a probe.
seq_list = list(range(1, 31))

probe_records = []
for num, (barcode_id, gene) in enumerate(zip(seq_list, binding_df["gene_name"].tolist())):
    binding = binding_df["binding"].iloc[num]
    assert len(binding) == 40, f"binding site length is not 40bp: {binding}"
    # Split the 40 bp site into its two 20 bp arms.
    binding_l = binding[:20]
    binding_r = binding[20:]

    if PANEL == 'PRISM':
        barcode = barcode_df.loc[f'Prism_{barcode_id}', "Barcode (82bp)"]
    else:
        barcode = barcode_df.loc[barcode_id, "Barcode(70bp)"]
    # Probe layout: right arm (lower case) + barcode (upper case) + left arm (lower case).
    probe = binding_r.lower() + barcode.upper() + binding_l.lower()

    gene_label = gene.upper()
    probe_records.append({
        id_col: f"{id_col}_{barcode_id}",
        "gene": gene_label,
        "probe_name": f'{name_prefix}_{barcode_id}_{gene_label}',
        seq_cols[0]: probe,
        seq_cols[1]: barcode,
        seq_cols[2]: binding,
    })

# Build the table once from the collected rows; explicit columns keep
# set_index() working even when there are no binding sites.
probe_df = pd.DataFrame(
    probe_records,
    columns=[id_col, "gene", "probe_name", *seq_cols],
).set_index(id_col)
probe_df.head()

## Select up to 3 binding sites for each gene

In [25]:
# Build up to `max_cont` probes per gene. Consecutive rows with the same gene
# share one barcode; each time the gene name changes, the next barcode number
# from `prism_pos_list` is used.
max_cont = 3
prism_pos_list = [2,3,4,6,7,8,10,11,12,13,14,15]

# Column layout and probe-name prefix differ between the two panels.
if PANEL == 'PRISM':
    id_col, name_prefix = 'PRISM', 'PRISM'
    seq_cols = ['probe_seq', 'barcode_seq', 'binding_seq']
elif PANEL == 'SPRINTseq':
    id_col, name_prefix = 'SPRINTseq', 'Seq'
    seq_cols = ['probe', 'barcode', 'binding']
else:
    # Fail with a clear message instead of a NameError inside the loop.
    raise ValueError(f"Unsupported PANEL {PANEL!r}; expected 'PRISM' or 'SPRINTseq'.")

gene_names = binding_df["gene_name"].tolist()

probe_records = []
cont = 0        # probes already emitted for the current gene
prism_pos = 0   # position of the current gene's barcode in prism_pos_list
# Guarded so an empty binding_df produces an empty table instead of an IndexError.
pre_gene_name = gene_names[0] if gene_names else None
for num, gene in enumerate(gene_names):
    if pre_gene_name != gene:
        # A new gene starts: reset the per-gene counter and move to the next barcode.
        pre_gene_name = gene
        cont = 0
        prism_pos += 1
    elif cont == max_cont:
        # This gene already has its full quota of probes; skip the extra sites.
        continue

    if prism_pos >= len(prism_pos_list):
        raise IndexError(
            f"binding_df has more genes than the {len(prism_pos_list)} barcodes in prism_pos_list "
            f"(ran out at gene {gene!r})."
        )
    prism = prism_pos_list[prism_pos]

    cont += 1
    binding = binding_df["binding"].iloc[num]
    assert len(binding) == 40, f"binding site at pos {num} length is not 40bp: {binding}, {len(binding)} instead."

    # Split the 40 bp site into its two 20 bp arms, lower-cased for the probe.
    binding_l = binding[:20].lower()
    binding_r = binding[20:].lower()

    # Look the barcode up with the key and column that belong to the selected
    # panel, consistent with the single-site cell.
    # NOTE(review): assumes the SPRINTseq barcode table is indexed by integer id - confirm.
    if PANEL == 'PRISM':
        barcode = barcode_df.loc[f"Prism_{prism}", "Barcode (82bp)"]
    else:
        barcode = barcode_df.loc[prism, "Barcode(70bp)"]
    # Probe layout: right arm + barcode + left arm.
    probe = binding_r + barcode + binding_l

    probe_records.append({
        id_col: f"{id_col}_{prism}",
        "gene": f'{gene}',
        "probe_name": f'{name_prefix}_{prism}_{gene}_{cont}',
        seq_cols[0]: probe,
        seq_cols[1]: barcode,
        seq_cols[2]: binding,
    })

# Build the table once; it gets a fresh 0..n-1 index and keeps its columns
# even when no probe was produced.
probe_df = pd.DataFrame(probe_records, columns=[id_col, "gene", "probe_name", *seq_cols])
probe_df

Unnamed: 0,PRISM,gene,probe_name,probe_seq,barcode_seq,binding_seq
0,PRISM_2,VSIV_N_target_gnm,PRISM_2_VSIV_N_target_gnm_1,tttgcaggaagttttggaacTCACACGACGCTCTTCCGATCTCCTC...,TCACACGACGCTCTTCCGATCTCCTCAATGCTGCTGCTGTACTCTC...,GGTATTCCACTGGATCCTCATTTGCAGGAAGTTTTGGAAC
1,PRISM_2,VSIV_N_target_gnm,PRISM_2_VSIV_N_target_gnm_2,tcaaaaatgtcacgaccttcTCACACGACGCTCTTCCGATCTCCTC...,TCACACGACGCTCTTCCGATCTCCTCAATGCTGCTGCTGTACTCTC...,TACTGTCATTTCCCCACACATCAAAAATGTCACGACCTTC
2,PRISM_2,VSIV_N_target_gnm,PRISM_2_VSIV_N_target_gnm_3,ttcgaccacatctctgccttTCACACGACGCTCTTCCGATCTCCTC...,TCACACGACGCTCTTCCGATCTCCTCAATGCTGCTGCTGTACTCTC...,TCTTCAAACCATCCGAGCCATTCGACCACATCTCTGCCTT
3,PRISM_3,VSIV_N_target_mRNA,PRISM_3_VSIV_N_target_mRNA_1,ttgtatccttgaaagccctgTCACACGACGCTCTTCCGATCTGATT...,TCACACGACGCTCTTCCGATCTGATTGCACATGTATGGCACCTCTC...,TACAATCGGAATATTTGACCTTGTATCCTTGAAAGCCCTG
4,PRISM_3,VSIV_N_target_mRNA,PRISM_3_VSIV_N_target_mRNA_2,attcttccgtcaaaaaccctTCACACGACGCTCTTCCGATCTGATT...,TCACACGACGCTCTTCCGATCTGATTGCACATGTATGGCACCTCTC...,ATTGTCTTCTAAGTCTCCATATTCTTCCGTCAAAAACCCT
5,PRISM_3,VSIV_N_target_mRNA,PRISM_3_VSIV_N_target_mRNA_3,ttggcaagtatgctaagtcaTCACACGACGCTCTTCCGATCTGATT...,TCACACGACGCTCTTCCGATCTGATTGCACATGTATGGCACCTCTC...,AGGCCTAAGAGAGAAGACAATTGGCAAGTATGCTAAGTCA
6,PRISM_4,VSIV_NS-P_target_gnm,PRISM_4_VSIV_NS-P_target_gnm_1,atctctcctaccgcctgatcTCACACGACGCTCTTCCGATCTAGTA...,TCACACGACGCTCTTCCGATCTAGTAGCCGTGACTATCGACTTCTC...,GTTGTGCTTCGATCTCATCTATCTCTCCTACCGCCTGATC
7,PRISM_4,VSIV_NS-P_target_gnm,PRISM_4_VSIV_NS-P_target_gnm_2,tgtgcactctgccagattccTCACACGACGCTCTTCCGATCTAGTA...,TCACACGACGCTCTTCCGATCTAGTAGCCGTGACTATCGACTTCTC...,CCTTCTCCCGATGCTTCAAATGTGCACTCTGCCAGATTCC
8,PRISM_4,VSIV_NS-P_target_gnm,PRISM_4_VSIV_NS-P_target_gnm_3,tgagaacaattcatccaaggTCACACGACGCTCTTCCGATCTAGTA...,TCACACGACGCTCTTCCGATCTAGTAGCCGTGACTATCGACTTCTC...,GAGATGAACTCTCCTCTAGATGAGAACAATTCATCCAAGG
9,PRISM_6,VSIV_NS-P_target_mRNA,PRISM_6_VSIV_NS-P_target_mRNA_1,atttacttcggactggaaacTCACACGACGCTCTTCCGATCTTGCG...,TCACACGACGCTCTTCCGATCTTGCGTCTATTTAGTGGAGCCTCCG...,GATGAGGAAGTGGATGTTGTATTTACTTCGGACTGGAAAC


# Save probe

In [26]:
# Write the assembled probe table into this run's folder as "<PANEL>_probe.xlsx",
# then report how many probes were saved and preview the first rows.
probe_df.to_excel(DATASET_DIR.joinpath(RUNID, f'{PANEL}_probe.xlsx'))
print(probe_df.shape[0])
probe_df.head()

30


Unnamed: 0,PRISM,gene,probe_name,probe_seq,barcode_seq,binding_seq
0,PRISM_2,VSIV_N_target_gnm,PRISM_2_VSIV_N_target_gnm_1,tttgcaggaagttttggaacTCACACGACGCTCTTCCGATCTCCTC...,TCACACGACGCTCTTCCGATCTCCTCAATGCTGCTGCTGTACTCTC...,GGTATTCCACTGGATCCTCATTTGCAGGAAGTTTTGGAAC
1,PRISM_2,VSIV_N_target_gnm,PRISM_2_VSIV_N_target_gnm_2,tcaaaaatgtcacgaccttcTCACACGACGCTCTTCCGATCTCCTC...,TCACACGACGCTCTTCCGATCTCCTCAATGCTGCTGCTGTACTCTC...,TACTGTCATTTCCCCACACATCAAAAATGTCACGACCTTC
2,PRISM_2,VSIV_N_target_gnm,PRISM_2_VSIV_N_target_gnm_3,ttcgaccacatctctgccttTCACACGACGCTCTTCCGATCTCCTC...,TCACACGACGCTCTTCCGATCTCCTCAATGCTGCTGCTGTACTCTC...,TCTTCAAACCATCCGAGCCATTCGACCACATCTCTGCCTT
3,PRISM_3,VSIV_N_target_mRNA,PRISM_3_VSIV_N_target_mRNA_1,ttgtatccttgaaagccctgTCACACGACGCTCTTCCGATCTGATT...,TCACACGACGCTCTTCCGATCTGATTGCACATGTATGGCACCTCTC...,TACAATCGGAATATTTGACCTTGTATCCTTGAAAGCCCTG
4,PRISM_3,VSIV_N_target_mRNA,PRISM_3_VSIV_N_target_mRNA_2,attcttccgtcaaaaaccctTCACACGACGCTCTTCCGATCTGATT...,TCACACGACGCTCTTCCGATCTGATTGCACATGTATGGCACCTCTC...,ATTGTCTTCTAAGTCTCCATATTCTTCCGTCAAAAACCCT
