In [5]:
# import out preprocessing code
import sys
sys.path.insert(1, '../../')
from sc_preprocessing import sc_preprocess
from tensorflow.keras.utils import to_categorical, normalize

# general imports
import warnings
import numpy as np
import os
import pandas as pd
import scipy as sp
from scipy.sparse import coo_matrix
from argparse import ArgumentParser
import keras as K
from scipy.stats import spearmanr, pearsonr

# Images, plots, display, and visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale

import pickle
import gzip
from pathlib import Path

# set seeds
from numpy.random import seed
seed(1)


In [2]:
Label_full = np.concatenate([np.full(1000, i) for i in range(28)], axis=0)
label_full = to_categorical(Label_full)


In [2]:
count_file = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/gbm28/expression/IDHwtGBM.processed.SS2.logTPM.txt.gz"
meta_file = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/gbm28/metadata/IDHwt.GBM.Metadata.SS2.tsv"
curr_sample = 'MGH101'
num_cells = 500

In [3]:

def read_gbm28_input(count_file, meta_file):

    # read in the counts
    count_ptr = gzip.open(count_file, "r")
    count_matr = pd.read_table(count_ptr)

    # read cell_type labels 
    meta_info = pd.read_table(meta_file, skiprows=[1])
    meta_info.rename(columns = {'NAME':'Name', 'CellAssignment':'CellType'}, inplace = True)

    # transpose the count matr
    count_df = count_matr.transpose()
    count_df.columns = count_df.iloc[0]
    count_df = count_df.drop(count_df.index[0])
    expr_col = count_df.columns

    # merge
    count_df['Name'] = count_df.index
    count_meta_df = count_df.merge(meta_info, left_on=["Name"], right_on=["Name"])


    return (count_meta_df, expr_col)




In [6]:

count_meta_df, expr_col = read_gbm28_input(count_file, meta_file)
count_meta_df_regular_cells = count_meta_df[count_meta_df['Sample'] != 'MGH101' and count_meta_df['CellType'] != 'Malignant']
count_meta_df_tumor_cells = count_meta_df[count_meta_df['Sample'] == 'MGH101' and count_meta_df['CellType'] == 'Malignant']
count_meta_df_samp = count_meta_df_regular_cells.append(count_meta_df_tumor_cells)

In [15]:
count_meta_df_regular_cells = count_meta_df[count_meta_df['Sample'] != 'MGH101']
count_meta_df_regular_cells = count_meta_df_regular_cells[count_meta_df['CellType'] != 'Malignant']

count_meta_df_tumor_cells = count_meta_df[count_meta_df['Sample'] == 'MGH101']
count_meta_df_tumor_cells = count_meta_df_tumor_cells[count_meta_df['CellType'] == 'Malignant']


count_meta_df_samp = count_meta_df_regular_cells.append(count_meta_df_tumor_cells)

count_meta_df_samp

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [12]:
aug_data_path = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/augmented_gbm28_data/"
X_train, Y_train, gene_df, sig_df = sc_preprocess.read_diva_files(aug_data_path, None, 'MGH100')
gene_df = gene_df.to_frame(index=False, name='gene_ids')
Y_train

Unnamed: 0,Macrophage,Malignant,Oligodendrocyte,T-cell
0,0.119617,0.019139,0.334928,0.526316
0,0.449761,0.004785,0.033493,0.511962
0,0.110048,0.004785,0.004785,0.880383
0,0.057416,0.167464,0.062201,0.712919
0,0.009569,0.311005,0.205742,0.473684
...,...,...,...,...
0,0.095694,0.162679,0.229665,0.511962
0,0.258373,0.148325,0.550239,0.043062
0,0.009569,0.0,0.0,0.990431
0,0.0,0.717703,0.0,0.282297


In [17]:
aug_data_path = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/augmented_gbm28_data/"
res_data_path = "/beevol/home/davidson/checkouts/sc_bulk_ood/results/single_cell_data/diva_gbm28/"

reference_Y_id = "MGH122"
test_id = "MGH122"
n_train = 1000
n_tot_samples = 27

patient_ids = ["MGH122", "MGH66", "MGH100", "MGH101", "MGH102", 
                "MGH104", "MGH105", "MGH106", "MGH110", 
                "MGH113", "MGH115", "MGH121", 
                "MGH124", "MGH125", "MGH128", "MGH129", 
                "MGH136", "MGH143", "MGH151", "MGH152", 
                "BT749", "BT771", "BT786", "BT830", 
                "BT920", "BT1160", "BT1187"]


def read_gene_file(res_path):
    gene_file = os.path.join(res_path, f"train-gbm28-DIVA_features.pkl")
    gene_path = Path(gene_file)
    gene_df_train = pickle.load( open( gene_path, "rb" ) )
    return gene_df_train

# read in all the expression data used
X_train_all = []
Y_train_all = []
for curr_id in patient_ids:
    X_train, Y_train, _, _ = sc_preprocess.read_diva_files(aug_data_path, None, curr_id)
    Y_train.reindex(columns=['Malignant', 'Macrophage', 'T-cell', 'Oligodendrocyte'], fill_value=0)
    X_train_all.append(X_train)
    Y_train_all.append(Y_train)

X_train = pd.concat(X_train_all, ignore_index=True)
Y_train = pd.concat(Y_train_all, ignore_index=True)
X_full = X_train.to_numpy()
Y_full = Y_train.to_numpy()


In [18]:
gbm28_train = read_gene_file(res_data_path)
tmp_X = X_train[gbm28_train["gene_ids"]]


In [20]:


count_file = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/gbm28/expression/IDHwtGBM.processed.SS2.logTPM.txt.gz"
meta_file = '/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/gbm28/metadata/IDHwt.GBM.Metadata.SS2.tsv'

# read in the counts
count_ptr = gzip.open(count_file, "r")
count_matr = pd.read_table(count_ptr)



TypeError: loop of ufunc does not support argument 0 of type str which has no callable exp method

In [47]:

# read cell_type labels 
meta_info = pd.read_table(meta_file, skiprows=[1])
meta_info.rename(columns = {'NAME':'Name', 'CellAssignment':'CellType'}, inplace = True)

# transpose the count matr
count_df = count_matr.transpose()
count_df.columns = count_df.iloc[0]
count_df = count_df.drop(count_df.index[0])
expr_col = count_df.columns



In [49]:
col_ids = count_df.columns
col_ids


Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2MP1',
       'A4GALT', 'A4GNT', 'AA06',
       ...
       'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX',
       'ZZEF1', 'ZZZ3'],
      dtype='object', name='GENE', length=23686)

In [44]:
name_idx = count_df.index
col_ids = count_df.columns

count_df = np.array(count_df,dtype=np.float32)
count_df = np.ceil(np.exp(count_df)-1)

# merge
count_df = pd.DataFrame(count_df)
count_df.columns = col_ids
count_df['Name'] = name_idx
count_meta_df = count_df.merge(meta_info, left_on=["Name"], right_on=["Name"])



In [46]:
count_meta_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,GenesExpressed,GeneticSubclone,MESlike2,MESlike1,AClike,OPClike,NPClike1,NPClike2,G1S,G2M
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3369,,,,,,,,,
1,0.0,0.0,0.0,205.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3135,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3442,,,,,,,,,
3,0.0,0.0,0.0,5992.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3299,,,,,,,,,
4,0.0,0.0,0.0,3659.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3702,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7432,,0.231617,0.751092,1.281128,-0.423515,-1.436833,-1.538469,-0.157276,-0.552874
7926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5164,,-0.272180,0.507133,2.432837,0.404859,-0.683064,-1.003373,-0.514522,-0.567344
7927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6954,,-0.561094,-0.226724,0.222048,-0.036195,0.309032,-0.270425,1.717503,0.573917
7928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5014,,-0.425092,1.079559,1.432765,0.609435,-0.914624,-0.361749,-0.559702,-0.901167


In [35]:
X_colmean = X_full.mean(axis=0)
X_full = X_full[:,np.where(X_colmean > 0)[0]]
gene_df = gene_df.iloc[np.where(X_colmean > 0)[0]]

## get the top variable genes
X_colmean = X_full.mean(axis=0)
X_colvar = X_full.var(axis=0)
X_CoV = np.array(np.divide(X_colvar, X_colmean))
idx_top = np.argpartition(X_CoV, -1000)[-1000:]
X_full = X_full[:,idx_top]
gene_df = gene_df.iloc[idx_top]


In [36]:
X_full = scale(X_full, axis=1)


NameError: name 'scale' is not defined

In [59]:
# simulate pseudobulks for each sample
num_samples = 10

# make the pseudobulks
prop_df, pseudobulks_df = sc_preprocess.make_prop_and_sum(count_meta_df_samp, 
                                            expr_col, 
                                            num_samples, 
                                            num_cells,
                                            use_true_prop=False)

# make the proportions instead of cell counts
prop_df = prop_df.div(prop_df.sum(axis=1), axis=0)


if not np.all(np.isclose(prop_df.sum(axis=1), 1.)):
    assert False, "Proportions do not sum to 1"



0


In [61]:
pseudobulks_df

GENE,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,A4GNT,AA06,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,0.0,2.71286,2.492156,2438.61743,1.249446,20.878872,0.0,0.0,0.0,0.0,...,14.493981,107.898187,4.344769,0.0,184.199017,182.512403,274.44521,1120.054475,73.267304,98.006856
0,9.944277,3.864587,1.876624,1851.884402,1.831677,25.668789,0.0,0.0,0.0,0.0,...,56.01639,127.979513,0.0,12.819932,227.024323,178.228726,236.45637,1317.365098,98.163356,159.458281
0,21.117486,2.439088,2.382811,1188.726114,2.470753,30.261124,0.0,0.0,0.0,0.0,...,83.410605,116.388531,13.034308,6.474459,212.462665,184.923607,322.162495,1409.15525,155.536241,172.753336
0,4.990941,1.923081,2.410077,1616.744501,1.093265,35.131279,0.0,0.0,0.0,0.0,...,70.212127,156.720554,8.689539,8.891421,170.101354,183.176833,267.378517,1393.496391,81.173343,172.736303
0,18.659621,2.809681,2.253566,541.991451,1.419979,39.001901,0.0,0.0,0.0,0.0,...,140.555552,171.916205,13.034308,38.949204,193.773658,190.798725,296.554473,1587.026502,117.907943,251.356499
0,0.0,2.491235,1.732111,2614.737009,1.561808,21.880204,0.0,0.0,0.0,0.0,...,0.559695,70.919796,0.0,0.0,243.898298,187.567612,278.783063,1073.011175,72.06874,82.593681
0,0.0,3.472163,2.254746,2152.742564,1.405627,21.522234,0.0,0.0,0.0,0.0,...,49.020146,115.074396,0.0,14.764537,232.620431,188.782325,292.960483,1261.52816,48.379877,157.855107
0,24.917101,1.956205,2.541215,243.336793,0.42605,59.772321,0.0,0.0,0.0,0.0,...,219.793333,181.09062,26.068616,40.912423,192.193885,187.746729,290.733914,1804.904176,171.951484,247.146143
0,8.715344,5.016448,2.475328,256.476567,0.213025,29.69375,0.0,0.0,0.0,0.0,...,178.527457,220.491186,21.723847,54.92545,191.166304,188.33394,260.613049,1713.788351,110.010602,254.263091
0,4.990941,2.883751,2.834394,2100.825997,0.937085,34.874556,0.0,0.0,0.0,0.0,...,12.938701,67.845464,0.0,10.83112,224.35854,180.529992,271.744657,1114.069629,79.306688,134.151783


In [15]:
a = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/cybersort_pbmc/pbmc_rep1_sm2_0_cybersort_mix.tsv.gz"
df = pd.read_csv(a, sep='\t')
df

Unnamed: 0,gene_ids,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
0,ENSG00000000003_TSPAN6,13.108058,0.000000,2.014272,12.554541,3.021408,7.049952,0.000000,2.014272,7.049952,...,0.000000,4.582061,0.000000,1.191642,4.766567,7.603469,1.007136,9.986753,4.582061,6.042816
1,ENSG00000000005_TNMD,0.000000,0.000000,0.000000,372.269269,0.000000,0.000000,372.269269,0.000000,0.000000,...,0.000000,1116.807806,0.000000,0.000000,0.000000,744.538537,372.269269,0.000000,372.269269,0.000000
2,ENSG00000000419_DPM1,1945.273199,335.461869,3117.107279,1882.401219,2538.077204,2372.485360,3292.891236,2254.139754,1602.549538,...,2441.354769,2937.409882,4403.165241,788.774612,2845.678400,2453.273231,1895.328979,1699.391460,1606.161715,2547.406236
3,ENSG00000000457_SCYL3,3.466014,1.763206,0.000000,82.810271,5.289617,7.052823,16.750454,4.392915,96.094712,...,0.000000,1.733007,0.000000,0.000000,1.733007,20.246667,105.792343,84.633875,1.748106,128.714018
4,ENSG00000000460_C1orf112,279.365553,6815.738286,1.157697,33.073686,3841.462469,5.325009,1702.971565,3113.887013,3687.585516,...,829.165317,1159.264503,136.700228,7819.705525,413.462165,146.777145,13.497024,1734.020584,300.437773,777.807573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22649,ENSG00000283095_ABC11-4932300O16.1,2.777296,0.877302,5.442107,3.703061,0.000000,1.754604,1.784320,1.814036,1.754604,...,0.000000,3.665566,7.256143,0.000000,0.000000,1.803067,1.851530,2.777296,0.877302,5.323243
22650,ENSG00000283103_LLNLR-245B6.1,367.670236,0.000000,136.891219,222.378170,183.550482,30.591747,245.917138,148.912631,183.307946,...,100.977482,217.103499,121.852560,30.591747,213.659079,381.326682,194.578600,30.591747,123.709715,432.038914
22651,ENSG00000283117_CTD-2060L22.1,572.940620,88.309435,288.849110,568.872679,491.152428,363.029240,310.136199,357.622956,266.794542,...,487.633154,347.139528,252.843675,295.415242,445.572253,585.378156,470.168014,354.099127,520.565668,606.566959
22652,ENSG00000283118_RP11-107E5.4,154.034401,0.799605,772.062378,66.965998,91.387183,623.832443,209.225542,386.889457,160.267794,...,624.237378,250.867783,398.203157,266.412473,285.567950,168.293463,541.339204,19.680130,712.739842,168.664930


In [18]:
a = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/augmented_pbmc_data/pbmc_rep2_10xV2_pseudo_11.pkl"
b = pickle.load( open( a, "rb" ) )
b

gene_ids,ENSG00000000003_TSPAN6,ENSG00000000419_DPM1,ENSG00000000457_SCYL3,ENSG00000000460_C1orf112,ENSG00000000938_FGR,ENSG00000000971_CFH,ENSG00000001036_FUCA2,ENSG00000001084_GCLC,ENSG00000001167_NFYA,ENSG00000001460_STPG1,...,ENSG00000283031_RP11-557N21.1,ENSG00000283033_RP11-219A15.5,ENSG00000283063_TRBV6-2,ENSG00000283064_RP1-221C16.8,ENSG00000283072_RP11-359G22.4,ENSG00000283073_RP11-834C11.15,ENSG00000283078_RP11-11M20.4,ENSG00000283103_LLNLR-245B6.1,ENSG00000283117_CTD-2060L22.1,ENSG00000283125_RP11-299P2.2
0,79.525143,398.221848,113.478553,38.846200,3370.571361,6.806771,192.070502,371.081266,41.254518,72.339413,...,14.797661,3.230755,4.441847,6.279886,0.000000,25.537944,0.933827,221.139040,31.859117,3.309420
0,33.006476,81.964536,26.220345,5.072592,533.786865,4.877299,27.427874,52.599885,18.544340,20.269513,...,1.899358,2.108259,3.165083,3.812445,0.000000,3.015295,0.000000,35.925021,6.225257,1.103140
0,140.178898,222.619038,45.371634,24.208375,458.334278,0.000000,48.433196,126.817961,69.344958,53.154300,...,9.217499,1.122496,4.109243,20.823210,3.078166,7.016956,0.000000,88.872423,16.273553,0.000000
0,90.925150,394.544734,101.473289,19.258006,5842.287721,1.873601,261.821881,366.490153,40.726155,62.490065,...,8.528850,1.045059,0.000000,5.039327,0.000000,29.206329,2.801480,197.348709,36.280103,0.000000
0,60.809705,248.697465,70.348577,14.255176,3626.791646,4.855851,169.154269,228.029709,44.695345,41.733107,...,3.774367,4.218765,2.220923,8.754395,0.000000,17.695964,3.735307,141.720011,23.073146,1.793999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,70.278127,353.917878,82.102286,32.260658,3589.289248,4.778533,204.323327,295.494426,52.897843,51.028045,...,11.347450,3.135177,1.110462,7.570739,0.000000,21.554778,2.801480,217.303029,40.696719,0.000000
0,241.914459,381.545529,74.984185,69.770780,601.332996,0.000000,23.210617,275.183737,108.566154,116.005275,...,0.949679,1.122496,1.027311,1.301451,0.000000,5.745530,0.000000,368.571370,9.600721,0.000000
0,199.841955,398.689717,94.215155,63.077674,2071.395657,1.873601,109.688671,335.404719,85.679474,87.441496,...,4.822883,0.000000,0.000000,2.602901,0.000000,6.762549,0.000000,330.658874,12.478243,0.000000
0,83.928023,505.819871,129.198475,46.897950,4001.119978,2.904932,243.582392,447.795243,33.331302,63.711067,...,21.904932,1.054129,3.141301,5.063152,0.000000,34.681001,0.933827,237.918425,38.469780,0.897000


In [7]:
aug_dp = "/beevol/home/davidson/checkouts/sc_bulk_ood/data/single_cell_data/augmented_pbmc_data/"
X_train, Y_train, gene_df, sig_df = sc_preprocess.read_diva_files(aug_dp, 0, "pbmc_rep1_sm2")

In [10]:
a = sig_df.transpose()
new_header = a.iloc[0] #grab the first row for the header
a = a[1:] #take the data less the header row
a.columns = new_header #set the header row as the df header
a

CellType,Cytotoxic T cell,Cytotoxic T cell.1,Cytotoxic T cell.2,Cytotoxic T cell.3,Cytotoxic T cell.4,Cytotoxic T cell.5,Cytotoxic T cell.6,Cytotoxic T cell.7,Cytotoxic T cell.8,Cytotoxic T cell.9,...,Megakaryocyte,Megakaryocyte.1,CD16+ monocyte,CD16+ monocyte.1,CD16+ monocyte.2,CD16+ monocyte.3,CD16+ monocyte.4,CD16+ monocyte.5,CD16+ monocyte.6,CD16+ monocyte.7
ENSG00000000003_TSPAN6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000005_TNMD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000419_DPM1,366.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,24.0,0.0,5.0,0.0,0.0,0.0,0.0
ENSG00000000457_SCYL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000460_C1orf112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000283095_ABC11-4932300O16.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000283103_LLNLR-245B6.1,0.0,0.0,0.0,0.0,0.0,0.0,82.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000283117_CTD-2060L22.1,0.0,0.0,4.0,1.0,10.0,4.0,3.0,3.0,11.0,1.0,...,0.0,0.0,6.0,0.0,8.0,6.0,4.0,2.0,1.0,1.0
ENSG00000283118_RP11-107E5.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,0.0,...,0.0,0.0,8.0,0.0,0.0,0.0,12.0,23.0,0.0,0.0
