In [1]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append('../functions_classes')
from l1000_classes import L1000_dataset

In [2]:
# filepaths for data
data_dir = '../data/L1000_phase_1/'
# gctx file
L1000_fname = 'GSE92742_Broad_LINCS_Level3_INF_mlr12k_n1319138x12328.gctx'
# gene metadata
gene_info_file = 'GSE92742_Broad_LINCS_gene_info.txt'
# sample metadata
inst_info_file = 'GSE92742_Broad_LINCS_inst_info.txt'

# initialize object

phase_1 = L1000_dataset(gctx_path = os.path.join(data_dir, L1000_fname), 
                        inst_info_path = os.path.join(data_dir, inst_info_file), 
                        gene_info_path = os.path.join(data_dir, gene_info_file))

In [3]:
inst_info = phase_1.get(data_name = 'inst_info')

In [4]:
inst_info.dtypes

inst_id            object
rna_plate          object
rna_well           object
pert_id            object
pert_iname         object
pert_type          object
pert_dose         float64
pert_dose_unit     object
pert_time           int64
pert_time_unit     object
cell_id            object
dtype: object

In [5]:
inst_info.head

<bound method NDFrame.head of                                        inst_id            rna_plate rna_well  \
0        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:F13   ASG001_MCF7_24H_X1      F13   
1        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:G13   ASG001_MCF7_24H_X1      G13   
2        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:I13   ASG001_MCF7_24H_X1      I13   
3        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:K13   ASG001_MCF7_24H_X1      K13   
4        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:N13   ASG001_MCF7_24H_X1      N13   
5        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P08   ASG001_MCF7_24H_X1      P08   
6        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P12   ASG001_MCF7_24H_X1      P12   
7        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P17   ASG001_MCF7_24H_X1      P17   
8        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P18   ASG001_MCF7_24H_X1      P18   
9        ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P19   ASG001_MCF7_24H_X1      P19   
10       ASG001_MCF7_24H_X1_B7_DUO52HI53LO:P21   ASG001_MCF7_24H_X1      P21   
11       A

In [6]:
gene_info = phase_1.get(data_name = 'gene_info')

In [7]:
gene_info.dtypes

pr_gene_id         int64
pr_gene_symbol    object
pr_gene_title     object
pr_is_lm           int64
pr_is_bing         int64
dtype: object

In [8]:
gene_info.head

<bound method NDFrame.head of        pr_gene_id pr_gene_symbol  \
0             780           DDR1   
1            7849           PAX8   
2            2978         GUCA1A   
3            2049          EPHB3   
4            2101          ESRRA   
5            8717          TRADD   
6           10594          PRPF8   
7             826         CAPNS1   
8           11224          RPL35   
9            6158          RPL28   
10           1982         EIF4G2   
11           8664          EIF3D   
12          11315          PARK7   
13           6727          SRP14   
14           2665           GDI2   
15           6135          RPL11   
16           6144          RPL21   
17           6152          RPL24   
18           4735          SEPT2   
19           6233         RPS27A   
20           6207          RPS13   
21           2197            FAU   
22           1072           CFL1   
23           6141          RPL18   
24           8665          EIF3F   
25           6193           RPS5  

In [9]:
# only use landmark genes
gene_ids = gene_info['pr_gene_id'][gene_info['pr_is_lm'] == 1].to_numpy().astype('str')
sample_ids = inst_info['inst_id'].to_numpy().astype('str')
# randomly sample 10,000 samples
sample_ids = np.random.choice(sample_ids, size = np.power(10, 3), replace = False)

In [10]:
len(gene_ids)

978

In [11]:
len(sample_ids)

1000

In [12]:
phase_1.load_data(row_ids = gene_ids, col_ids = sample_ids)

In [16]:
data_mat = phase_1.get('data')
# note that returned data matrix is supposed to have samples in rows, features in columns.
# why they decided to put genes under ROW metadata and samples under COL metadata
# and then use a sample x feature matrix for storage is beyond me.
data_mat.shape

(1000, 978)

In [17]:
current_genes = phase_1.get('current_genes')
current_samples = phase_1.get('current_inst')

In [18]:
len(current_genes)

978

In [19]:
len(current_samples)

1000