In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import anndata
from wmb import mm10

In [9]:
# Run identifier of the form "<region>-<gene_group>". It is split on '-' into
# `region` and `gene_group`, and is also embedded verbatim in the names of the
# result files written at the end of the notebook.
group_name = "MOp-IC"

In [10]:
# Break the run identifier into its two components. The name must contain
# exactly one '-'; otherwise the two-name unpacking raises ValueError.
name_parts = group_name.split('-')
region, gene_group = name_parts


In [11]:
# Full, ordered list of Target labels considered by this notebook. A later cell
# keeps only the labels present in adata.obs["Target"], preserving this order.
tarall = [
    "PFC", "MOp", "SSp", "ACA", "AI", "AUDp", "RSP", "PTLp", "VISp",
    "MOB", "ENT", "HPF", "PIR", "AMY", "STR", "PAL",
    "TH", "HY", "SC", "VTA", "P", "MY", "CBN", "CBX",
]
print(len(tarall))


24


In [12]:
# Load the AnnData file for the selected region; the file name is derived from
# `region`. The bare `adata` on the last line displays the object summary.
h5ad_path = f'/home/jzhou_salk_edu/sky_workdir/230129_targetpair_roc/data_new/{region}_RS2_geneCH.h5ad'
adata = anndata.read_h5ad(h5ad_path)
adata


AnnData object with n_obs × n_vars = 2568 × 19012
    obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'Plate', 'PlateNormCov', 'Sample', 'SubRegion', 'DissectionRegion', 'Target', 'Sex', 'L1', 'L2', 'L3', 'L4', 'L1_annot', 'PassTargetFilter'
    var: 'chrom', 'end', 'start'
    obsm: 'X_pca'

In [13]:
# Restrict tarall to the labels that actually occur in the cells' "Target"
# annotation, keeping tarall's order. `nt` is the number of retained targets.
observed_targets = adata.obs["Target"].values
target_list = list(filter(lambda name: name in observed_targets, tarall))
nt = len(target_list)
target_list


['SSp', 'ACA', 'AI', 'AUDp', 'STR', 'TH', 'SC', 'VTA', 'P', 'MY']

In [14]:
# Gene-name list files backing each name-based gene group. 'TF' is not listed
# because it is selected by gene ID via mm10.get_tf_gene_ids() instead.
GENE_LIST_DIR = '/rs2/genelist'
GENE_LIST_FILES = {
    'NT': ['NTReceptor_Tasic.txt'],
    'IC': ['IC.txt', 'IC_Tasic.txt'],
    'NP': ['NPP-Receptor_Smith.txt', 'NPP-Receptor_Tasic.txt'],
    'GO': ['GO0050808_SynapseOrganization.txt', 'GO0031175_NeuronProjectionDevelopment.txt'],
}

# Build `selg`, a boolean mask over adata.var selecting the genes of `gene_group`.
if gene_group == 'TF':
    selg = adata.var.index.isin(mm10.get_tf_gene_ids())
elif gene_group in GENE_LIST_FILES:
    # Gene names are read into the frame index (index_col=0), so the union of
    # the index values is what defines the group. Collecting them in a set also
    # merges names shared between files; DataFrame.drop_duplicates() would not,
    # because it compares column values rather than the index.
    gene_names = set()
    for list_file in GENE_LIST_FILES[gene_group]:
        gene_table = pd.read_csv(f'{GENE_LIST_DIR}/{list_file}', header=None, index_col=0)
        gene_names.update(gene_table.index)
    # var index holds gene IDs; translate to names before matching the lists.
    selg = adata.var.index.map(mm10.gene_id_to_name).isin(gene_names)
else:
    # Fail loudly: without this branch an unknown group either raises NameError
    # or silently reuses a `selg` left over from an earlier run of this cell.
    raise ValueError(
        f"Unknown gene_group {gene_group!r}; expected 'TF' or one of {sorted(GENE_LIST_FILES)}"
    )

# Keep only the selected genes (copy so the result is not a view of the full object).
adata = adata[:, selg].copy()
print(adata.shape)

(2568, 267)


In [8]:
# Number of random replicate splits; also used as the per-split random seed range.
N_SPLITS = 50

clf = LogisticRegression()
# result[i, j, k, t] is the ROC AUC for the target pair (target_list[i],
# target_list[j]) with i < j, in split t, when the classifier is trained on
# replicate k (0 = 'rep1', 1 = 'rep2') and evaluated on the other replicate.
# Entries with i >= j are never written and remain 0.
result = np.zeros((nt, nt, 2, N_SPLITS))
# Plain NumPy copy of the Target labels, so all masks below are ndarray masks.
targets = adata.obs['Target'].to_numpy()
for t in range(N_SPLITS):
    # Re-seed per split so each split's assignment is reproducible on its own.
    np.random.seed(t)
    # Within every target, a random half of the cells (rounded down) becomes
    # 'rep1'; all remaining cells are 'rep2'.
    replicate = np.full(len(targets), 'rep2', dtype=object)
    for tar in target_list:
        cells = np.where(targets == tar)[0]
        replicate[np.random.choice(cells, len(cells) // 2, False)] = 'rep1'
    # Stored by column name, so the write never depends on where the
    # 'Replicate' column sits in adata.obs.
    adata.obs['Replicate'] = replicate
    for i in range(nt - 1):
        for j in range(i + 1, nt):
            tar1, tar2 = target_list[i], target_list[j]
            # Cells belonging to either target of the pair; label 1 marks tar1.
            in_pair = (targets == tar1) | (targets == tar2)
            data = adata.X[in_pair]
            label = (targets[in_pair] == tar1).astype(int)
            pair_replicate = replicate[in_pair]
            # Train on one replicate and score on the other, in both directions.
            for k, s in enumerate(['rep1', 'rep2']):
                train = pair_replicate == s
                pred = clf.fit(data[train], label[train]).predict_proba(data[~train])[:, 1]
                result[i, j, k, t] = roc_auc_score(label[~train], pred)
    # Progress indicator: index of the split that just finished.
    print(t)
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [9]:
# Collapse the last two axes of `result` (training replicate and split) into a
# mean and a standard deviation per target pair; rows and columns are both
# labelled with target_list.
pair_labels = {'index': target_list, 'columns': target_list}
result_mean = pd.DataFrame(result.mean(axis=(2, 3)), **pair_labels)
result_std = pd.DataFrame(result.std(axis=(2, 3)), **pair_labels)


In [10]:
# Write the two summary tables to HDF5, one file each, under the key 'data'.
# File names embed `group_name`.
hdf_outputs = {
    f'/home/jzhou_salk_edu/sky_workdir/230322_targetpair_roc_GO/result_new/{group_name}-comprep_mean.hdf': result_mean,
    f'/home/jzhou_salk_edu/sky_workdir/230322_targetpair_roc_GO/result_new/{group_name}-comprep_std.hdf': result_std,
}
for hdf_path, frame in hdf_outputs.items():
    frame.to_hdf(hdf_path, key='data')
