In [1]:
import os
# Restrict this process to one GPU; set here, before cudf is imported in the next cell.
GPU_id = 7
os.environ.update(CUDA_VISIBLE_DEVICES=str(GPU_id))

In [2]:
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

from pathlib import Path
# Convenience monkey-patch: Path.ls() returns the entries of a directory as a list.
Path.ls = lambda x: list(x.iterdir())

import cudf as gd
import nvstrings

from tqdm import tqdm
from collections import defaultdict
import pandas as pd
import numpy as np
import time
import pickle

# Record the cudf version this notebook was executed with.
print(gd.__version__)

0.9.0


### Functions

In [3]:
# Columns that are NOT cast to float32 by read_cell_line.
cat_cols = ['treatment','cell_line']
int_cols = ['cellID','fileID']

def read_cell_line(path):
    """Read one CSV with cudf and normalise its column dtypes.

    Columns listed in ``int_cols`` are cast to int32. Every column that is in
    neither ``cat_cols`` nor ``int_cols`` is cast to float32; if such a column
    was parsed with object dtype, its literal 'NA' entries are replaced by
    nulls before the cast. Columns in ``cat_cols`` are left as read.

    Parameters
    ----------
    path : path-like
        Location of the CSV file, passed straight to ``gd.read_csv``.

    Returns
    -------
    The cudf DataFrame produced by ``gd.read_csv`` with the casts applied.
    """
    frame = gd.read_csv(path)
    excluded = cat_cols + int_cols
    float_cols = [name for name in frame.columns if name not in excluded]

    for name in int_cols:
        frame[name] = frame[name].astype('int32')

    for name in float_cols:
        # Object dtype means the column was read as strings; blank out the 'NA'
        # markers so the float cast below can succeed.
        if frame[name].dtype == 'O':
            frame.loc[frame[name]=='NA', name] = None
        frame[name] = frame[name].astype('float32')

    return frame

In [4]:
def read_all_cell_lines(path):
    """Load every single-cell CSV under ``path`` into one cudf DataFrame.

    Files are taken from ``single_cell_phospo/subchallenge_1`` first and then
    from ``single_cell_phospo/complete_cell_lines``. Within each directory the
    files are read in sorted order: directory iteration order is arbitrary,
    so without sorting the row order of the result (and of anything cached
    from it) could differ between machines or runs.

    Parameters
    ----------
    path : pathlib.Path
        Dataset root directory that contains ``single_cell_phospo``.

    Returns
    -------
    The ``gd.concat`` of the frames returned by ``read_cell_line`` for each
    file, in the order described above.
    """
    root = path/'single_cell_phospo'
    # iterdir() is used directly so the function does not depend on the
    # Path.ls monkey-patch being installed.
    paths = [
        csv_path
        for sub_dir in ('subchallenge_1', 'complete_cell_lines')
        for csv_path in sorted((root/sub_dir).iterdir())
    ]
    dfs = [read_cell_line(csv_path) for csv_path in tqdm(paths, total=len(paths))]
    return gd.concat(dfs)

### Load data and encode categorical columns

In [5]:
# Root directory of the challenge data; list its entries to check what is available.
path = Path('/raid/data/ml/dream/single_cell_breast_cancer')
list(path.iterdir())

[PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/prediction_template'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/output'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/transcriptomics_genomics'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/proteomics'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/cache'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/FileID_table.csv'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/Antibody_table.csv'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/backup'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/single_cell_phospo'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/CellLines.csv'),
 PosixPath('/raid/data/ml/dream/single_cell_breast_cancer/median_phospho')]

In [6]:
%%time
# Read every cell-line CSV found under `path` into a single cudf DataFrame.
df = read_all_cell_lines(path)
print(df.shape)
# Convert only the first rows to pandas for display.
df.head().to_pandas()

100%|██████████| 50/50 [00:17<00:00,  3.31it/s]


(19509387, 42)
CPU times: user 11.8 s, sys: 7.22 s, total: 19 s
Wall time: 19.2 s


Unnamed: 0,treatment,cell_line,time,cellID,fileID,b.CATENIN,cleavedCas,CyclinB,GAPDH,IdU,...,p.PDPK1,p.PLCg2,p.RB,p.S6,p.S6K,p.SMAD23,p.SRC,p.STAT1,p.STAT3,p.STAT5
0,EGF,LY2,0.0,1,1600,1.695869,2.479688,2.46698,1.721575,4.2502,...,2.07778,,4.11024,,2.483872,2.419281,2.159551,1.435568,0.652426,1.73338
1,EGF,LY2,0.0,1,1662,0.252953,2.012776,1.20489,1.913279,4.78735,...,1.575423,,6.12789,,2.124639,0.551474,2.361749,2.315083,0.652426,2.34484
2,EGF,LY2,0.0,2,1600,0.224784,2.381034,1.57411,1.282657,5.28419,...,1.087352,,2.94259,,1.676179,0.551474,2.941293,2.896088,0.841045,1.511762
3,EGF,LY2,0.0,2,1662,2.473927,3.579243,2.09122,3.386645,4.97592,...,3.218199,,7.71275,,2.841005,1.807767,2.803607,3.1187,2.300759,2.503745
4,EGF,LY2,0.0,3,1600,0.224784,1.429828,1.20489,1.809322,4.2502,...,0.754511,,4.4104,,1.72779,1.294836,2.339936,2.009363,0.652426,1.875179


In [7]:
%%time
str_map = {}
for col in cat_cols+['time']:
    df[col],cat = df[col].factorize()
    str_map[col] = {i:c for c,i in enumerate(cat)}
print(df.shape)
str_map

(19509387, 42)
CPU times: user 1.26 s, sys: 580 ms, total: 1.84 s
Wall time: 1.84 s


{'treatment': {'EGF': 0,
  'full': 1,
  'iEGFR': 2,
  'iMEK': 3,
  'iPI3K': 4,
  'iPKC': 5},
 'cell_line': {'184A1': 0,
  'AU565': 1,
  'BT20': 2,
  'BT474': 3,
  'BT549': 4,
  'CAL148': 5,
  'CAL51': 6,
  'CAL851': 7,
  'DU4475': 8,
  'EFM19': 9,
  'EFM192A': 10,
  'EVSAT': 11,
  'HBL100': 12,
  'HCC1187': 13,
  'HCC1395': 14,
  'HCC1419': 15,
  'HCC1500': 16,
  'HCC1569': 17,
  'HCC1599': 18,
  'HCC1937': 19,
  'HCC1954': 20,
  'HCC2157': 21,
  'HCC2185': 22,
  'HCC2218': 23,
  'HCC3153': 24,
  'HCC38': 25,
  'HCC70': 26,
  'HDQP1': 27,
  'JIMT1': 28,
  'LY2': 29,
  'MACLS2': 30,
  'MCF10A': 31,
  'MCF10F': 32,
  'MCF7': 33,
  'MDAMB134VI': 34,
  'MDAMB157': 35,
  'MDAMB175VII': 36,
  'MDAMB361': 37,
  'MDAMB415': 38,
  'MDAMB436': 39,
  'MDAMB453': 40,
  'MDAkb2': 41,
  'MFM223': 42,
  'MPE600': 43,
  'MX1': 44,
  'OCUBM': 45,
  'T47D': 46,
  'UACC812': 47,
  'UACC893': 48,
  'ZR7530': 49},
 'time': {0.0: 0,
  5.5: 1,
  7.0: 2,
  9.0: 3,
  12.0: 4,
  13.0: 5,
  14.0: 6,
  15.0: 7,
 

In [8]:
# Show the column order as loaded, before the reordering done in the next section.
df.columns

Index(['treatment', 'cell_line', 'time', 'cellID', 'fileID', 'b.CATENIN',
       'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67', 'p.4EBP1',
       'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK', 'p.CREB', 'p.ERK',
       'p.FAK', 'p.GSK3b', 'p.H3', 'p.HER2', 'p.JNK', 'p.MAP2K3', 'p.MAPKAPK2',
       'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB', 'p.p38', 'p.p53',
       'p.p90RSK', 'p.PDPK1', 'p.PLCg2', 'p.RB', 'p.S6', 'p.S6K', 'p.SMAD23',
       'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'],
      dtype='object')

### Reorder columns

In [9]:
print(df.shape)
# Target layout: identifier columns, then validation markers, then test
# markers, then every remaining column in its existing order.
test_markers = ["p.Akt.Ser473.", "p.ERK",  "p.HER2", "p.PLCg2","p.S6"]
valid_markers = ['p.GSK3b','p.MAPKAPK2','p.BTK']
cols = ['treatment','cell_line','time','cellID','fileID']
already_placed = cols + valid_markers + test_markers
remaining = [name for name in df.columns if name not in already_placed]
markers = valid_markers + test_markers + remaining
df = df[cols + markers]
# The shape must be unchanged: columns were only reordered, none added or dropped.
print(df.shape)
df.columns

(19509387, 42)
(19509387, 42)


Index(['treatment', 'cell_line', 'time', 'cellID', 'fileID', 'p.GSK3b',
       'p.MAPKAPK2', 'p.BTK', 'p.Akt.Ser473.', 'p.ERK', 'p.HER2', 'p.PLCg2',
       'p.S6', 'b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67',
       'p.4EBP1', 'p.AKT.Thr308.', 'p.AMPK', 'p.CREB', 'p.FAK', 'p.H3',
       'p.JNK', 'p.MAP2K3', 'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB',
       'p.p38', 'p.p53', 'p.p90RSK', 'p.PDPK1', 'p.RB', 'p.S6K', 'p.SMAD23',
       'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'],
      dtype='object')

In [10]:
%%time
# Convert the cudf frame to a pandas DataFrame; the following cells work on `pdf`.
pdf = df.to_pandas()
# Drop the cudf frame. `df` is undefined after this cell, so the earlier cells
# that use it cannot be re-run without reloading the data first.
del df

CPU times: user 1.32 s, sys: 2.51 s, total: 3.84 s
Wall time: 3.84 s


### Train, valid, test split

In [11]:
# Test cell lines are those with a file in subchallenge_1 (file name up to the
# first '.'); validation cell lines are a fixed hand-picked list. Both are
# translated to the integer codes stored in str_map['cell_line'].
test_cell_lines = [fname.split('.')[0] for fname in os.listdir(path/'single_cell_phospo/subchallenge_1')]
test_cell_lines_hash = [str_map['cell_line'][name] for name in test_cell_lines]

valid_cell_lines = ['MPE600','BT474','HCC2185','MCF7','184A1','BT549']
valid_cell_lines_hash = [str_map['cell_line'][name] for name in valid_cell_lines]

# Row masks for the three partitions; train is everything that is neither
# validation nor test.
mask_valid = pdf['cell_line'].isin(valid_cell_lines_hash)
mask_test = pdf['cell_line'].isin(test_cell_lines_hash)
mask_train = ~(mask_valid | mask_test)

# is_va is 1 for validation AND test rows, 0 for training rows.
pdf['is_va'] = (mask_valid | mask_test).astype('int64')

# Rows with a missing marker are dropped from train and valid only; test rows
# are kept as they are.
train = pdf.loc[mask_train].dropna(subset=markers)
valid = pdf.loc[mask_valid].dropna(subset=markers)

# Partition sizes, stored so the boundaries in the stacked frame below can be
# recovered. The test count is taken from the mask, i.e. with no rows dropped.
str_map['num'] = {
    'train': len(train),
    'valid': len(valid),
    'test': mask_test.sum(),
}

# Stack the partitions in train, valid, test order with a fresh 0..n-1 index.
pdf = pd.concat([train, valid, pdf.loc[mask_test]], axis=0).reset_index(drop=True)

In [12]:
%%time
path = str(path)
pdf.to_pickle('%s/cache/miao.pickle'%path)
pickle.dump(str_map,open('%s/cache/miao_map.pickle'%path,'wb'))

CPU times: user 964 ms, sys: 3.76 s, total: 4.72 s
Wall time: 5.82 s


In [13]:
# Final size of the stacked train/valid/test frame (includes the added is_va column).
pdf.shape

(13405399, 43)