In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import anndata


In [2]:
# Name of the group analysed by this notebook; it is copied into `region` in the next cell.
# NOTE(review): looks like a per-run parameter (e.g. injected by a notebook runner) — confirm.
group_name = "MOp"

In [3]:
# Alias of `group_name`; `region` is the name interpolated into the input and output file paths below.
region = group_name


In [4]:
# Catalogue of target labels, kept in the original order.
# NOTE(review): presumably the possible values of adata.obs['Target'] — confirm.
tarall = [
    "PFC", "MOp", "SSp", "ACA", "AI", "AUDp", "RSP", "PTLp",
    "VISp", "MOB", "ENT", "HPF", "PIR", "AMY", "STR", "PAL",
    "TH", "HY", "SC", "VTA", "P", "MY", "CBN", "CBX",
]
# Sanity check: report how many labels are listed.
print(len(tarall))


24


In [8]:
# Target labels bucketed into three named groups ("IT", "ET", "CB").
targroup = {
    "IT": [
        "PFC", "MOp", "SSp", "ACA", "AI", "AUDp", "RSP", "PTLp",
        "VISp", "ENT", "HPF", "PIR", "MOB", "AMY", "STR", "PAL",
    ],
    "ET": ["TH", "HY", "SC", "VTA", "P", "MY"],
    "CB": ["CBN", "CBX"],
}
# Inverse lookup: target label -> name of the group that lists it.
tarmap = {
    target: group
    for group, members in targroup.items()
    for target in members
}


In [6]:
# Load the AnnData file for this region; `region` is interpolated into the file name.
h5ad_path = f'/home/jzhou_salk_edu/sky_workdir/230129_targetpair_roc/data/{region}_RS2_geneCH.h5ad'
adata = anndata.read_h5ad(h5ad_path)
# Bare last expression so the notebook renders the object's summary.
adata


AnnData object with n_obs × n_vars = 2568 × 9906
    obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'Plate', 'PlateNormCov', 'Sample', 'SubRegion', 'DissectionRegion', 'Target', 'Sex', 'L1', 'L2', 'L3', 'L4', 'L1_annot', 'PassTargetFilter'
    var: 'chrom', 'end', 'start'
    obsm: 'X_pca'

In [9]:
# Annotate every observation with the group of its 'Target' label via `tarmap`.
obs_target = adata.obs['Target']
adata.obs['TargetGroup'] = obs_target.map(tarmap)
# Read the column back from `adata.obs` (keeps the 'TargetGroup' name) and
# tally it as strings, so unmapped entries would be counted as 'nan'.
group_as_text = adata.obs['TargetGroup'].astype(str)
group_as_text.value_counts()


IT    1409
ET    1159
Name: TargetGroup, dtype: int64

In [11]:
# Keep, in fixed order, the candidate groups that actually occur in this dataset.
observed_groups = adata.obs["TargetGroup"].values
target_list = []
for candidate in ['IT', 'ET']:
    if candidate in observed_groups:
        target_list.append(candidate)
# Number of groups; sizes the pairwise result matrices.
nt = len(target_list)
target_list


['IT', 'ET']

In [13]:
# Random-split replicate benchmark. For each of N_SPLITS random splits, the
# cells of every 'Target' are divided in half into 'rep1' / 'rep2'; a logistic
# regression is trained on one half and scored (ROC AUC) on the other half,
# in both directions, for every pair of groups in `target_list`.
N_SPLITS = 50  # number of random splits; also sizes the last axis of `result`
clf = LogisticRegression()
# result[i, j, k, t]: AUC for target_list[i] vs target_list[j] (only i < j is
# filled, other entries stay 0), trained on replicate k (0 = 'rep1',
# 1 = 'rep2') in split t.
result = np.zeros((nt, nt, 2, N_SPLITS))
for t in range(N_SPLITS):
    np.random.seed(t)  # the split index doubles as the seed, so split t is reproducible
    adata.obs['Replicate'] = 'rep2'
    # Resolve the column position by name instead of assuming 'Replicate' is
    # the last column of adata.obs (the previous `iloc[..., -1]` would write
    # into the wrong column if any column was ever added after it).
    rep_col = adata.obs.columns.get_loc('Replicate')
    for tar in adata.obs['Target'].unique():
        tar_rows = np.where(adata.obs['Target']==tar)[0]
        # Draw half of this target's cells, without replacement, for 'rep1'.
        rep1_rows = np.random.choice(tar_rows, len(tar_rows)//2, False)
        adata.obs.iloc[rep1_rows, rep_col] = 'rep1'
    for i in range(nt-1):
        for j in range(i+1, nt):
            tar1, tar2 = target_list[i], target_list[j]
            pair_mask = adata.obs['TargetGroup'].isin([tar1, tar2])
            data = adata.X[pair_mask]
            # Positive class (1) is tar1; every other selected cell is class 0.
            label = (adata.obs.loc[pair_mask, 'TargetGroup']==tar1).values.astype(int)
            replicate = adata.obs.loc[pair_mask, 'Replicate']
            for k, s in enumerate(['rep1', 'rep2']):
                # Train on replicate `s`, evaluate on all remaining selected cells.
                trainfilter = (replicate==s).values
                pred = clf.fit(data[trainfilter], label[trainfilter]).predict_proba(data[~trainfilter])[:,1]
                result[i,j,k,t] = roc_auc_score(label[~trainfilter], pred)
    # Progress line: this split's AUC matrix averaged over the two training directions.
    print(t, result[:,:,:,t].mean(axis=2))
    

0 [[0.         0.94954945]
 [0.         0.        ]]
1 [[0.         0.95033107]
 [0.         0.        ]]
2 [[0.        0.9491833]
 [0.        0.       ]]
3 [[0.        0.9513782]
 [0.        0.       ]]
4 [[0.         0.94598816]
 [0.         0.        ]]
5 [[0.         0.94945033]
 [0.         0.        ]]
6 [[0.         0.94923583]
 [0.         0.        ]]
7 [[0.         0.95108201]
 [0.         0.        ]]
8 [[0.         0.94798988]
 [0.         0.        ]]
9 [[0.         0.94593347]
 [0.         0.        ]]
10 [[0.         0.95009523]
 [0.         0.        ]]
11 [[0.         0.95007239]
 [0.         0.        ]]
12 [[0.         0.95151188]
 [0.         0.        ]]
13 [[0.         0.94688574]
 [0.         0.        ]]
14 [[0.         0.95122715]
 [0.         0.        ]]
15 [[0.         0.95360613]
 [0.         0.        ]]
16 [[0.        0.9458847]
 [0.        0.       ]]
17 [[0.         0.94830883]
 [0.         0.        ]]
18 [[0.         0.95099661]
 [0.         0.       

In [14]:
# Collapse the training-replicate axis (2) and the split axis (3) of `result`,
# then label rows and columns with the group names.
auc_mean = result.mean(axis=(2, 3))
auc_std = result.std(axis=(2, 3))
result_mean = pd.DataFrame(auc_mean, index=target_list, columns=target_list)
result_std = pd.DataFrame(auc_std, index=target_list, columns=target_list)


In [16]:
# Persist the mean and standard-deviation tables for this region, each under the HDF key 'data'.
for frame, out_path in (
    (result_mean, f'/home/jzhou_salk_edu/sky_workdir/230203_ITvsET/result/{region}-comprep_mean.hdf'),
    (result_std, f'/home/jzhou_salk_edu/sky_workdir/230203_ITvsET/result/{region}-comprep_std.hdf'),
):
    frame.to_hdf(out_path, key='data')


In [17]:
# Count, per Target, how many distinct 'Sex' values it occurs with.
target_sex_pairs = adata.obs[['Target','Sex']].drop_duplicates()
count = target_sex_pairs['Target'].value_counts()
# Keep only targets observed with more than one Sex value.
selt = count.index[count>1]
keep_cells = adata.obs['Target'].isin(selt)
# Note: this rebinds `adata` to the filtered copy.
adata = adata[keep_cells].copy()
adata


AnnData object with n_obs × n_vars = 2472 × 9906
    obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'Plate', 'PlateNormCov', 'Sample', 'SubRegion', 'DissectionRegion', 'Target', 'Sex', 'L1', 'L2', 'L3', 'L4', 'L1_annot', 'PassTargetFilter', 'TargetGroup', 'Replicate'
    var: 'chrom', 'end', 'start'
    obsm: 'X_pca'

In [18]:
# Recompute which groups are still present after `adata` was filtered by sex.
# The candidates are restricted to ['IT', 'ET'], the same list the
# replicate-split analysis earlier in this notebook uses, instead of every
# key of `targroup` (which also contains "CB"). Both analyses then produce
# matrices over the same groups in the same order.
target_list = [xx for xx in ['IT', 'ET'] if xx in adata.obs["TargetGroup"].values]
# Number of groups; sizes the pairwise result matrix.
nt = len(target_list)
target_list


['IT', 'ET']

In [19]:
# Sex hold-out benchmark: for every pair of groups, fit a logistic regression
# on the cells of one sex and score it (ROC AUC) on all remaining cells of
# the pair, once per sex.
clf = LogisticRegression()
# result[i, j, k]: AUC for target_list[i] vs target_list[j] (only i < j is
# filled), trained on sex k (0 = 'male', 1 = 'female').
result = np.zeros((nt, nt, 2))
train_sexes = ['male', 'female']
for i in range(nt - 1):
    tar1 = target_list[i]
    for j in range(i + 1, nt):
        tar2 = target_list[j]
        pair_mask = adata.obs['TargetGroup'].isin([tar1, tar2])
        data = adata.X[pair_mask]
        pair_obs = adata.obs.loc[pair_mask]
        # Positive class (1) is tar1; every other selected cell is class 0.
        label = (pair_obs['TargetGroup'] == tar1).values.astype(int)
        for k, sex in enumerate(train_sexes):
            is_train = pair_obs['Sex'] == sex
            is_test = ~is_train
            clf.fit(data[is_train], label[is_train])
            pred = clf.predict_proba(data[is_test])[:, 1]
            result[i, j, k] = roc_auc_score(label[is_test], pred)


In [20]:
# Collapse the training-sex axis (2) of `result`, then label rows and columns
# with the group names.
auc_mean = result.mean(axis=2)
auc_std = result.std(axis=2)
result_mean = pd.DataFrame(auc_mean, index=target_list, columns=target_list)
result_std = pd.DataFrame(auc_std, index=target_list, columns=target_list)


In [21]:
# Persist the mean and standard-deviation tables for this region, each under the HDF key 'data'.
for frame, out_path in (
    (result_mean, f'/home/jzhou_salk_edu/sky_workdir/230203_ITvsET/result/{region}-biorep_mean.hdf'),
    (result_std, f'/home/jzhou_salk_edu/sky_workdir/230203_ITvsET/result/{region}-biorep_std.hdf'),
):
    frame.to_hdf(out_path, key='data')
