In [1]:
import warnings
warnings.filterwarnings('ignore')
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
sys.path.append('./../src/')
import dill
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MultiLabelBinarizer
import utils
from utils import id_to_name, name_to_id
import pickle
import random
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seed both the stdlib and the NumPy global RNGs so stochastic steps are repeatable.
random.seed(9)
np.random.seed(9)

# The ontology file unpacks into two objects; `subtree` is later traversed as a
# graph starting from the node 'root'.
with open('./../annotation/ontologies.dill', 'rb') as f:
    zero_tree, subtree = dill.load(f)

# Reuse the binarizer object exposed by the project `utils` module.
mlb = utils.mlb

# Each preprocessed file unpacks into a (matrix, metadata) pair. The files are
# opened in `with` blocks so the handles are closed as soon as loading finishes.
# NOTE(review): dill, like pickle, can execute code on load -- only load trusted files.
with open('./../data/GEO/preprocessed/training.dill', 'rb') as f:
    Mv, meta = dill.load(f)
with open('./../data/GEO/preprocessed/zeroshot.dill', 'rb') as f:
    zeroshot_Mv, zeroshot_meta = dill.load(f)

# Propagate labels through `subtree` using the 'training.ID' column, then encode
# the propagated label collections with `mlb`.
# NOTE(review): presumably each sample gains its ancestor labels here -- confirm
# against utils.propagate_parent.
meta_multi = utils.propagate_parent(subtree, meta, tissue_col='training.ID', outdict=False)
meta_mlb = mlb.transform(meta_multi['training.ID'].values)

propagating with training.ID


In [2]:
# Report the number of distinct values in the training-label column and in the
# dataset column of the metadata, one count per line.
for column in ('training.ID', 'Dataset'):
    print(meta[column].nunique())

55
210


In [3]:
import pandas as pd

# Read the tab-separated display-mapping table and build an ID -> color lookup.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# under a configurable data directory.
df = pd.read_csv(
    '/grain/rad4/quick/methyl-classification/id_display_mappings.txt',
    sep='\t',
)
id_to_color = df.set_index('ID')['color'].to_dict()

In [4]:
# Walk every direct successor of 'root' (a "system") and collect, without
# duplicates and in first-seen order, the system node followed by everything
# below it. Despite its name, `descendants_set` is an ordered list; `seen` is the
# set that performs the de-duplication.
# NOTE(review): nx.descendants returns a set, so the order of nodes *within* one
# system follows set iteration order and may differ between interpreter sessions.
descendants_set = []
system_to_children = dict()  # system node -> its descendants plus the system itself
seen = set()
for x in subtree.successors('root'):
    # Guard the system node as well, so a node reachable both directly from
    # 'root' and through an earlier system is not listed twice.
    if x not in seen:
        descendants_set.append(x)
        seen.add(x)
    # Compute the descendants once and reuse them for the mapping and the scan.
    system_descendants = list(nx.descendants(subtree, x))
    system_to_children[x] = system_descendants + [x]
    for descendant in system_descendants:
        if descendant not in seen:
            descendants_set.append(descendant)
            seen.add(descendant)

print(descendants_set)

['UBERON:0002204', 'UBERON:0002481', 'UBERON:0002371', 'UBERON:0002418', 'CL:0000056', 'UBERON:0004535', 'UBERON:8450002', 'UBERON:0002113', 'UBERON:0000483', 'UBERON:0001043', 'UBERON:0002424', 'UBERON:0005384', 'UBERON:0004802', 'UBERON:0001004', 'UBERON:0001005', 'UBERON:0002048', 'UBERON:0002050', 'UBERON:0001987', 'UBERON:0012168', 'UBERON:0002331', 'CL:0002322;BTO:0001086', 'UBERON:0002390', 'CL:0000576', 'CL:0000094', 'CL:0000542', 'CL:0000084', 'UBERON:0000178', 'CL:0000775', 'CL:2000001', 'CL:0000623', 'CL:0000738', 'CL:0000910;BTO:0000289', 'CL:0000236', 'UBERON:0000990', 'UBERON:0000079', 'UBERON:0002367', 'UBERON:0001295', 'UBERON:0003134', 'UBERON:0000002', 'CL:0000019', 'UBERON:0000992', 'UBERON:0003889', 'UBERON:0001032', 'UBERON:0000949', 'UBERON:0000006', 'UBERON:0002046', 'UBERON:0001264', 'UBERON:0002107', 'UBERON:0002330', 'UBERON:0000310', 'UBERON:0001836', 'UBERON:0001016', 'UBERON:0000955', 'UBERON:0001870', 'UBERON:0000956', 'UBERON:0000451', 'UBERON:0001871', '

In [5]:
# Sanity-check sizes, one per line: shape of the feature matrix, number of
# distinct training labels, and number of entries in `subtree`.
print(
    Mv.shape,
    len(meta['training.ID'].unique()),
    len(subtree),
    sep='\n',
)

(10351, 297598)
55
73


In [6]:
# Load a previously fitted minipatch selector and keep only the columns of `Mv`
# whose selection frequency meets the threshold.
filename = "_whole"
minipatch_location = f"./../data/GEO/minipatch/minipatch{filename}_selector"
print(f"loading minipatch selector from {minipatch_location}")
# Open inside a `with` block so the handle is closed once the selector is loaded.
# NOTE(review): dill, like pickle, can execute code on load -- only load trusted files.
with open(minipatch_location, 'rb') as selector_file:
    fitted_selector = dill.load(selector_file)

selection_frequency_threshold = 0.65
# One row per column of `Mv`; the values land in a single column labelled 0.
# NOTE(review): Pi_hat_last_k_ is presumably the per-feature selection frequency
# reported by the selector -- confirm against the selector's documentation.
selection_freq = pd.DataFrame(fitted_selector.Pi_hat_last_k_, index=Mv.columns)
minipatch_probes = list(
    selection_freq[selection_freq[0] >= selection_frequency_threshold].index
)
# `Mv45` is kept as a second name for the same object because other cells may
# refer to it; its meaning is not evident from this cell.
Mv_minipatch = Mv45 = Mv[minipatch_probes]
print(Mv_minipatch.shape)

loading minipatch selector from ./../data/GEO/minipatch/minipatch_whole_selector
(10351, 190)


In [13]:
# Count the samples whose 'UBERON.ID' value equals their 'training.ID' value.
meta['UBERON.ID'].eq(meta['training.ID']).sum()

9046

In [8]:
# Display the metadata without the three listed columns; `meta` itself is not
# modified because the result of drop() is only shown, not assigned.
meta.drop(['Unnamed: 0', 'File', 'FileSeries'], axis='columns')

Unnamed: 0_level_0,Dataset,Annotated.tissue,UBERON.ID,UBERON.Name,Display.Name,merged.ID,training.ID,File,FileSeries
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GSM2859937,GSE107038,liver,UBERON:0002107,liver,liver,UBERON:0002107,UBERON:0002107,raw/GEO/GSE107038/GSM2859937_10003886118_R03C02,GSE107038
GSM2859938,GSE107038,liver,UBERON:0002107,liver,liver,UBERON:0002107,UBERON:0002107,raw/GEO/GSE107038/GSM2859938_10003886118_R04C02,GSE107038
GSM2859939,GSE107038,liver,UBERON:0002107,liver,liver,UBERON:0002107,UBERON:0002107,raw/GEO/GSE107038/GSM2859939_10003886118_R05C02,GSE107038
GSM2859940,GSE107038,liver,UBERON:0002107,liver,liver,UBERON:0002107,UBERON:0002107,raw/GEO/GSE107038/GSM2859940_10003886118_R06C02,GSE107038
GSM2859941,GSE107038,liver,UBERON:0002107,liver,liver,UBERON:0002107,UBERON:0002107,raw/GEO/GSE107038/GSM2859941_10003886150_R01C01,GSE107038
...,...,...,...,...,...,...,...,...,...
GSM3080996,GSE112696,t cell,CL:0000084,T cell,T cell,CL:0000084,CL:0000084,raw/GEO/GSE112696/GSM3080996_3998920058_R01C02,GSE112696
GSM3080997,GSE112696,t cell,CL:0000084,T cell,T cell,CL:0000084,CL:0000084,raw/GEO/GSE112696/GSM3080997_3998920058_R03C02,GSE112696
GSM3080998,GSE112696,t cell,CL:0000084,T cell,T cell,CL:0000084,CL:0000084,raw/GEO/GSE112696/GSM3080998_3998920058_R05C02,GSE112696
GSM3081000,GSE112696,t cell,CL:0000084,T cell,T cell,CL:0000084,CL:0000084,raw/GEO/GSE112696/GSM3081000_200400320036_R03C01,GSE112696
