In [5]:
# Env
import os
from pathlib import Path
import pandas as pd
import numpy as np
import re


# Run configuration: everything a reader might need to change lives here.
# The path is a raw string: '\s' and '\d' are not valid escape sequences, and
# a plain literal only works because Python currently keeps unknown escapes
# verbatim (SyntaxWarning on 3.12+, planned to become an error).
DATASET_DIR = Path(r'F:\spatial_data\dataset_probe')
RUNID = '20240808_Z.Y_Mouse'
workdir = DATASET_DIR / RUNID
# Kept as a plain str (os.path.join result) so later cells see the same type.
resultsdir = os.path.join(workdir, 'results')
# Selects the gene-symbol case normalisation applied to the marker list
# ('mouse' -> Capitalized, 'human' -> UPPER).
organism = 'mouse'

In [6]:
# Load the marker-gene sheet and build the list of wanted gene symbols.
gene_info = pd.read_excel(os.path.join(workdir, "Marker_genes.xlsx"))

# Strip whitespace BEFORE normalising case: " acta2".capitalize() leaves the
# name lower-case because the first character is a space.
# Non-string cells (blank rows, stray numbers) become NaN via the .str accessor.
gene_symbols = gene_info['gene'].str.strip()
if organism == 'mouse':
    # NOTE(review): capitalize() lower-cases everything after the first letter,
    # so symbols such as "H2-K1" become "H2-k1" -- confirm this matches the
    # naming used in the probe tables.
    gene_symbols = gene_symbols.str.capitalize()
elif organism == 'human':
    gene_symbols = gene_symbols.str.upper()
gene_info['gene'] = gene_symbols

# Drop missing/empty entries, then de-duplicate while keeping sheet order.
gene_list = [symbol for symbol in gene_symbols.dropna().unique() if symbol]
print(len(gene_list))
gene_info.head()

29


Unnamed: 0,gene,Remark
0,Tnni3,心肌细胞
1,Actn2,心肌细胞
2,Myl2,心肌细胞
3,Nppa,心肌细胞
4,Ankrd1,心肌细胞


In [7]:
def adjust_gene_name(gene_name, gene_list):
    """Map a probe gene name onto the wanted-gene list, ignoring case.

    Resolution order:
      1. If ``gene_name`` itself is in ``gene_list``, it is returned unchanged,
         so symbols that legitimately end in ``-<digits>`` (e.g. ``Nkx2-5``)
         are never truncated.
      2. Otherwise, if ``gene_name`` ends in ``-<digits>`` and the part before
         that suffix is in ``gene_list``, that base part is returned (in the
         casing it had in ``gene_name``).
      3. Otherwise ``gene_name`` is returned unchanged.

    Args:
        gene_name: Gene name as found in a probe table. Non-string values
            (e.g. NaN from an empty cell) are returned as-is.
        gene_list: Iterable of wanted gene symbols (strings).

    Returns:
        The adjusted gene name, or the input unchanged when no rule applies.
    """
    if not isinstance(gene_name, str):
        return gene_name

    # Set gives O(1) membership tests; comparison is case-insensitive.
    wanted_upper = {x.upper() for x in gene_list}

    # An exact hit wins over suffix stripping.
    if gene_name.upper() in wanted_upper:
        return gene_name

    match = re.search(r'(.+)-(\d+)$', gene_name)
    if match and match.group(1).upper() in wanted_upper:
        return match.group(1)
    return gene_name


# Gather every per-entry probe table found under the results directory.
# Frames are collected in a list and concatenated once (growing a DataFrame
# with concat inside the loop is quadratic).
probe_tables = []
# sorted(): os.listdir order is arbitrary; a fixed order keeps the
# keep="first" de-duplication below reproducible between runs.
for entry in sorted(os.listdir(resultsdir)):
    probes_path = os.path.join(resultsdir, entry, "probes_wanted.xlsx")
    if not os.path.isfile(probes_path):
        # Entry has no probe table (or is not a directory): skip.
        continue
    try:
        probe_tables.append(pd.read_excel(probes_path, index_col=0))
    except Exception as exc:
        # Best effort: keep going, but report the skipped file instead of
        # silently swallowing the error.
        print(f"Skipping unreadable probe table {probes_path}: {exc}")

if not probe_tables:
    raise FileNotFoundError(f"No probes_wanted.xlsx found under {resultsdir}")

result = pd.concat(probe_tables)
# Collapse '-<n>' suffixed names onto the wanted marker genes.
result["gene_name"] = [adjust_gene_name(name, gene_list) for name in result["gene_name"]]
# Order by gene and position, then keep one row per distinct binding sequence.
result = (
    result
    .sort_values(["gene_name", "pos_on_seq"])
    .drop_duplicates(subset=["binding"], keep="first")
)
result.head()

Unnamed: 0,accession,gene_name,mol_type,organism,pos_on_seq,binding,Tm_l,Tm_r,wanted,align_num,align_accession,align_descrip,plus/minus
393,ENSMUST00000238147,Acta2,protein_coding,mouse,65,TCATGAGGTAGTCGGTGAGATCTCGGCCAGCCAAGTCCAG,52.16,58.21,True,2,XM_006526606.2|NM_007392.3,"PREDICTED: Mus musculus actin alpha 2, smooth...","-1,-1"
432,ENSMUST00000238147,Acta2,protein_coding,mouse,299,TAGGGTTCAGTGGTGCCTCTGTCAGCAGTGTCGGATGCTC,54.34,53.65,True,2,XM_006526606.2|NM_007392.3,"PREDICTED: Mus musculus actin alpha 2, smooth...","-1,-1"
454,ENSMUST00000238147,Acta2,protein_coding,mouse,535,ATGGATGGGAAAACAGCCCTGGGAGCATCATCACCAGCGA,59.25,58.23,True,2,XM_006526606.2|NM_007392.3,"PREDICTED: Mus musculus actin alpha 2, smooth...","-1,-1"
23,ENSMUST00000221162,Actn2,protein_coding,mouse,68,TTGTCAGGATCTGGGTCTCCACCTCATTGATGGTTCTGGC,52.51,50.11,True,2,XM_036157837.1|NM_033268.4,PREDICTED: Mus musculus actinin alpha 2 (Actn...,"-1,-1"
45,ENSMUST00000221162,Actn2,protein_coding,mouse,273,GGAGCTGATCCGGATGCTGTAGCTCTGGATCACCTTCTCC,58.29,51.96,True,2,XM_036157837.1|NM_033268.4,PREDICTED: Mus musculus actinin alpha 2 (Actn...,"-1,-1"


In [8]:
# Write the collected probe table into the run's working directory.
binding_site_path = os.path.join(workdir, 'gene_binding_site.xlsx')
result.to_excel(binding_site_path)

# Marker genes that have no row at all in the probe table are listed,
# one per line, in to_search.txt.
genes_with_probes = set(result["gene_name"])
to_search = [gene for gene in gene_list if gene not in genes_with_probes]

to_search_path = os.path.join(workdir, "to_search.txt")
with open(to_search_path, "w") as handle:
    handle.writelines(gene + "\n" for gene in to_search)