In [1]:
## Create Toy Dataset for testing purposes

In [2]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append('../functions_classes')
from l1000_classes import L1000_dataset

In [3]:
# filepaths for data
data_dir = '../data/L1000_phase_1/'
# gctx file
L1000_fname = 'GSE92742_Broad_LINCS_Level3_INF_mlr12k_n1319138x12328.gctx'
# gene metadata
gene_info_file = 'GSE92742_Broad_LINCS_gene_info.txt'
# sample metadata
inst_info_file = 'GSE92742_Broad_LINCS_inst_info.txt'

# initialize object

phase_1 = L1000_dataset(gctx_path = os.path.join(data_dir, L1000_fname), 
                        inst_info_path = os.path.join(data_dir, inst_info_file), 
                        gene_info_path = os.path.join(data_dir, gene_info_file))

In [4]:
inst_info = phase_1.get(data_name = 'inst_info')

In [5]:
inst_info.dtypes

inst_id            object
rna_plate          object
rna_well           object
pert_id            object
pert_iname         object
pert_type          object
pert_dose         float64
pert_dose_unit     object
pert_time           int64
pert_time_unit     object
cell_id            object
dtype: object

In [6]:
inst_info.head

<bound method NDFrame.head of                                        inst_id            rna_plate rna_well  \
0        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:F13   ASG001_MCF7_24H_X1      F13   
1        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:G13   ASG001_MCF7_24H_X1      G13   
2        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:I13   ASG001_MCF7_24H_X1      I13   
3        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:K13   ASG001_MCF7_24H_X1      K13   
4        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:N13   ASG001_MCF7_24H_X1      N13   
5        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P08   ASG001_MCF7_24H_X1      P08   
6        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P12   ASG001_MCF7_24H_X1      P12   
7        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P17   ASG001_MCF7_24H_X1      P17   
8        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P18   ASG001_MCF7_24H_X1      P18   
9        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P19   ASG001_MCF7_24H_X1      P19   
10       ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P21   ASG001_MCF7_24H_X1      P21   
11       A

In [7]:
gene_info = phase_1.get(data_name = 'gene_info')

In [8]:
gene_info.dtypes

pr_gene_id         int64
pr_gene_symbol    object
pr_gene_title     object
pr_is_lm           int64
pr_is_bing         int64
dtype: object

In [9]:
gene_info.head

<bound method NDFrame.head of        pr_gene_id pr_gene_symbol  \
0             780           DDR1   
1            7849           PAX8   
2            2978         GUCA1A   
3            2049          EPHB3   
4            2101          ESRRA   
5            8717          TRADD   
6           10594          PRPF8   
7             826         CAPNS1   
8           11224          RPL35   
9            6158          RPL28   
10           1982         EIF4G2   
11           8664          EIF3D   
12          11315          PARK7   
13           6727          SRP14   
14           2665           GDI2   
15           6135          RPL11   
16           6144          RPL21   
17           6152          RPL24   
18           4735          SEPT2   
19           6233         RPS27A   
20           6207          RPS13   
21           2197            FAU   
22           1072           CFL1   
23           6141          RPL18   
24           8665          EIF3F   
25           6193           RPS5  

In [10]:
# get 100 of each cell type
np.random.seed(42)
cell_ids = np.ndarray([], dtype = 'str')
unique_cell_types = np.unique(inst_info['cell_id'].to_numpy().astype('str'))
print(unique_cell_types)
for i in range(0, len(unique_cell_types)):
    print('cell type ' + str(i + 1) + ' of ' + str(len(unique_cell_types)))
    cell_type_i = unique_cell_types[i]
    cell_index = np.argwhere(inst_info['cell_id'].to_numpy().astype('str') == cell_type_i)
    cell_ids_type = inst_info['inst_id'].iloc[cell_index.flatten()].to_numpy().astype('str')
    # print(len(cell_ids_type))
    cell_ids_type = np.random.choice(cell_ids_type.flatten(), 
                                     np.min([100, len(cell_ids_type.flatten())]), 
                                     replace = False)
    if (len(cell_ids_type) > 0):
        cell_ids = np.append(cell_ids, cell_ids_type)

['A375' 'A549' 'A673' 'AGS' 'ASC' 'BT20' 'CD34' 'CL34' 'CORL23' 'COV644'
 'DV90' 'EFO27' 'FIBRNPC' 'H1299' 'HA1E' 'HCC15' 'HCC515' 'HCT116'
 'HEC108' 'HEK293T' 'HEKTE' 'HEPG2' 'HL60' 'HS27A' 'HS578T' 'HT115' 'HT29'
 'HUH7' 'JHUEM2' 'JURKAT' 'LOVO' 'MCF10A' 'MCF7' 'MCH58' 'MDAMB231'
 'MDST8' 'NCIH1694' 'NCIH1836' 'NCIH2073' 'NCIH508' 'NCIH596' 'NCIH716'
 'NEU' 'NKDBA' 'NOMO1' 'NPC' 'OV7' 'PC3' 'PHH' 'PL21' 'RKO' 'RMGI' 'RMUGS'
 'SHSY5Y' 'SKB' 'SKBR3' 'SKL' 'SKLU1' 'SKM1' 'SKMEL1' 'SKMEL28' 'SNGM'
 'SNU1040' 'SNUC4' 'SNUC5' 'SW480' 'SW620' 'SW948' 'T3M10' 'THP1' 'TYKNU'
 'U266' 'U2OS' 'U937' 'VCAP' 'WSUDLCL2']
cell type 1 of 76
cell type 2 of 76
cell type 3 of 76
cell type 4 of 76
cell type 5 of 76
cell type 6 of 76
cell type 7 of 76
cell type 8 of 76
cell type 9 of 76
cell type 10 of 76
cell type 11 of 76
cell type 12 of 76
cell type 13 of 76
cell type 14 of 76
cell type 15 of 76
cell type 16 of 76
cell type 17 of 76
cell type 18 of 76
cell type 19 of 76
cell type 20 of 76
cell type 21 

In [11]:
print(len(cell_ids))

64040


In [12]:
# select landmark genes
lm_gene_ind = np.argwhere(gene_info['pr_is_lm'].to_numpy().astype('int') == 1).flatten()
lm_genes = gene_info['pr_gene_id'].iloc[lm_gene_ind].to_numpy().astype('str')
print(len(lm_genes))

978


In [13]:
# save data
output_dir = '../data/phase_1_toy_data'
if os.path.exists(output_dir):
    os.system('rm -rf ' + output_dir)
prefix = 'phase_1_toy'
phase_1.save_data(row_ids = lm_genes, 
                  col_ids = cell_ids, 
                  output_dir = output_dir, 
                  prefix = prefix)

using data and metadata for selected samples and genes
keeping 978 of 12328 genes
keeping 64039 of 1319138 samples


  ' col ids were not in data')
