In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import anndata


In [1]:
# Identifier of the analysis group. It is split on '-' into (region, var_dim, key)
# and also embedded in the names of the result files written by this notebook.
group_name: str = "MOp-binCH-X_pca"

In [3]:
# group_name is "<region>-<var_dim>-<key>"; the unpacking below requires exactly
# three dash-separated fields and raises ValueError otherwise.
(region, var_dim, key) = group_name.split(sep='-')


In [4]:
# All candidate `Target` labels. The order of this list is the order in which
# targets present in the data are later arranged in `target_list`.
tarall = [
    "PFC", "MOp", "SSp", "ACA", "AI", "AUDp",
    "RSP", "PTLp", "VISp", "MOB", "ENT", "HPF",
    "PIR", "AMY", "STR", "PAL", "TH", "HY",
    "SC", "VTA", "P", "MY", "CBN", "CBX",
]
print(len(tarall))


24


In [5]:
# Read the AnnData file for the region / feature set parsed from `group_name`.
h5ad_path = f'/home/jzhou_salk_edu/sky_workdir/230129_targetpair_roc/data/{region}_RS2_{var_dim}.h5ad'
adata = anndata.read_h5ad(h5ad_path)
adata


AnnData object with n_obs × n_vars = 2568 × 23730
    obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'Plate', 'PlateNormCov', 'Sample', 'SubRegion', 'DissectionRegion', 'Target', 'Sex', 'L1', 'L2', 'L3', 'L4', 'L1_annot', 'PassTargetFilter'
    var: 'chrom', 'end', 'start'
    obsm: 'X_pca'

In [6]:
# Targets that actually occur in adata.obs["Target"], kept in `tarall` order.
observed_targets = adata.obs["Target"].values
target_list = [name for name in tarall if name in observed_targets]
nt = len(target_list)  # side length of the pairwise result arrays
target_list


['SSp', 'ACA', 'AI', 'AUDp', 'STR', 'TH', 'SC', 'VTA', 'P', 'MY']

In [None]:
# Computational-replicate benchmark.
#
# For each of N_SPLITS seeded random splits, the cells of every target are divided
# into two halves ('rep1' / 'rep2'). For every pair of targets a logistic regression
# is trained on one half to separate the two targets and scored with ROC AUC on the
# other half, once in each direction.
N_SPLITS = 50                  # number of random splits; split t is seeded with t
REPLICATES = ['rep1', 'rep2']  # k-th entry is the replicate used for training

clf = LogisticRegression()
# result[i, j, k, t]: AUC for the pair (target_list[i], target_list[j]) with i < j,
# trained on REPLICATES[k] and evaluated on the remaining cells of the pair, split t.
# Entries with i >= j are never written and stay 0.
result = np.zeros((nt, nt, len(REPLICATES), N_SPLITS))

# Loop-invariant inputs, pulled out as plain arrays so that every mask below is a
# NumPy boolean array rather than a pandas Series.
features = adata.X if key == 'X' else adata.obsm[key]
targets = np.asarray(adata.obs['Target'])

for t in range(N_SPLITS):
    np.random.seed(t)
    # Build the replicate labels in a standalone array and assign the column by
    # name. The previous positional write (`.iloc[selc, -1]`) silently hit the wrong
    # column whenever 'Replicate' was not the last column of adata.obs.
    replicate = np.full(adata.n_obs, 'rep2', dtype=object)
    for tar in target_list:
        selc = np.where(targets == tar)[0]
        # Half of this target's cells (rounded down), drawn without replacement.
        selc = np.random.choice(selc, len(selc)//2, False)
        replicate[selc] = 'rep1'
    adata.obs['Replicate'] = replicate

    for i in range(nt-1):
        for j in range(i+1, nt):
            tar1, tar2 = target_list[i], target_list[j]
            pair_mask = np.isin(targets, [tar1, tar2])
            data = features[pair_mask]
            # Positive class (1) is tar1, negative class (0) is tar2.
            label = (targets[pair_mask] == tar1).astype(int)
            pair_replicate = replicate[pair_mask]
            for k, s in enumerate(REPLICATES):
                trainfilter = pair_replicate == s
                testfilter = ~trainfilter
                clf.fit(data[trainfilter], label[trainfilter])
                # Column 1 of predict_proba is the probability of label 1 (tar1).
                pred = clf.predict_proba(data[testfilter])[:, 1]
                result[i, j, k, t] = roc_auc_score(label[testfilter], pred)
    # Progress report: AUC matrix of this split, averaged over the two directions.
    print(t, result[:, :, :, t].mean(axis=2))
    

0 [[0.         0.65660951 0.89739027 0.99133913 0.65559583 0.95323552
  0.96257928 0.96779752 0.96788982 0.98441647]
 [0.         0.         0.87287726 0.98669503 0.7067258  0.94219775
  0.97172403 0.97543046 0.96244317 0.97879311]
 [0.         0.         0.         0.91991559 0.88221575 0.97032544
  0.97943518 0.93604806 0.98729098 0.96427874]
 [0.         0.         0.         0.         0.99079363 0.99835697
  0.99946294 0.98238078 0.99775153 0.96905792]
 [0.         0.         0.         0.         0.         0.91461925
  0.93891389 0.97043192 0.93179838 0.96471535]
 [0.         0.         0.         0.         0.         0.
  0.85710303 0.97338453 0.72414665 0.92514124]
 [0.         0.         0.         0.         0.         0.
  0.         0.91372841 0.87516962 0.90878564]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.96258311 0.9292806 ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.90588

In [20]:
# Collapse the trained-replicate axis (2) and the split axis (3) of `result` into
# one mean and one standard deviation per target pair, labelled by target name.
pair_labels = dict(index=target_list, columns=target_list)
result_mean = pd.DataFrame(result.mean(axis=(2, 3)), **pair_labels)
result_std = pd.DataFrame(result.std(axis=(2, 3)), **pair_labels)


In [25]:
# Write the computational-replicate summaries to HDF files named after `group_name`.
for frame, path in (
    (result_mean, f'/home/jzhou_salk_edu/sky_workdir/230129_targetpair_roc/result/{group_name}-comprep_mean.hdf'),
    (result_std, f'/home/jzhou_salk_edu/sky_workdir/230129_targetpair_roc/result/{group_name}-comprep_std.hdf'),
):
    frame.to_hdf(path, key='data')


In [26]:
# Keep only cells whose target appears with more than one distinct 'Sex' value.
# NOTE: this rebinds `adata` to the filtered copy, so later cells see the subset.
target_sex_pairs = adata.obs[['Target', 'Sex']].drop_duplicates()
count = target_sex_pairs['Target'].value_counts()  # distinct sexes per target
selt = count.index[count > 1]
keep_cells = adata.obs['Target'].isin(selt)
adata = adata[keep_cells].copy()
adata


AnnData object with n_obs × n_vars = 2472 × 9906
    obs: 'mCCCFrac', 'mCGFrac', 'mCHFrac', 'FinalmCReads', 'Plate', 'PlateNormCov', 'Sample', 'SubRegion', 'DissectionRegion', 'Target', 'Sex', 'L1', 'L2', 'L3', 'L4', 'L1_annot', 'PassTargetFilter', 'Replicate'
    var: 'chrom', 'end', 'start'
    obsm: 'X_pca'

In [30]:
# Recompute the target list on the filtered `adata`, still in `tarall` order.
target_list = list(filter(lambda name: name in adata.obs["Target"].values, tarall))
nt = len(target_list)  # side length of the pairwise result arrays
target_list


['SSp', 'ACA', 'AI', 'AUDp', 'STR', 'TH', 'SC', 'VTA', 'P']

In [35]:
# Biological-replicate benchmark: for every pair of targets, train a logistic
# regression on the cells of one sex and score it with ROC AUC on all other cells
# of the pair, once per sex.
clf = LogisticRegression()
# result[i, j, k]: AUC for the pair (target_list[i], target_list[j]) with i < j,
# trained on 'male' (k=0) or 'female' (k=1). Entries with i >= j stay 0.
result = np.zeros((nt, nt, 2))
for i in range(nt - 1):
    tar1 = target_list[i]
    for j in range(i + 1, nt):
        tar2 = target_list[j]
        selc = adata.obs['Target'].isin([tar1, tar2])
        data = adata.X[selc] if key == 'X' else adata.obsm[key][selc]
        pair_obs = adata.obs.loc[selc]
        # Positive class (1) is tar1, negative class (0) is tar2.
        label = (pair_obs['Target'] == tar1).values.astype(int)
        for k, s in enumerate(['male', 'female']):
            trainfilter = pair_obs['Sex'] == s
            testfilter = ~trainfilter
            clf.fit(data[trainfilter], label[trainfilter])
            # Column 1 of predict_proba is the probability of label 1 (tar1).
            pred = clf.predict_proba(data[testfilter])[:, 1]
            result[i, j, k] = roc_auc_score(label[testfilter], pred)


In [36]:
# Mean and standard deviation over axis 2 of `result` (the two training sexes),
# one value per target pair, labelled by target name.
auc_mean = result.mean(axis=2)
auc_std = result.std(axis=2)
result_mean = pd.DataFrame(auc_mean, index=target_list, columns=target_list)
result_std = pd.DataFrame(auc_std, index=target_list, columns=target_list)


In [46]:
# Write the biological-replicate summaries to HDF files named after `group_name`.
biorep_mean_path = f'/home/jzhou_salk_edu/sky_workdir/230129_targetpair_roc/result/{group_name}-biorep_mean.hdf'
biorep_std_path = f'/home/jzhou_salk_edu/sky_workdir/230129_targetpair_roc/result/{group_name}-biorep_std.hdf'
result_mean.to_hdf(biorep_mean_path, key='data')
result_std.to_hdf(biorep_std_path, key='data')
