In [1]:
import os
# Index of the GPU this notebook is restricted to.
GPU_id = 5
# Set here, in the first cell, ahead of the CUDA-backed imports in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(GPU_id)

In [2]:
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
def _list_dir(x):
    """Return every entry directly inside directory ``x`` as a list."""
    return list(x.iterdir())

# Expose the helper as ``Path.ls()`` so directories can be listed in one call.
Path.ls = _list_dir

import cudf as gd
import nvstrings
from numba import cuda

from fastai.tabular import *
from fastai.callbacks import SaveModelCallback
from fastai.basic_data import DatasetType
from fastai.callback import Callback
from fastai.torch_core import add_metrics


from datetime import datetime
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
import numpy as np
import time
import xgboost as xgb
import pickle

print(gd.__version__)

0.10.0


### Load data

In [3]:
%%time
# Root directory of the dataset; the cached pickles live under <path>/cache.
path = Path('/raid/data/ml/dream/single_cell_breast_cancer')
df = pd.read_pickle('%s/cache/miao.pickle'%path)#.fillna(1.0)

# Open the mapping file in a context manager so the handle is closed as soon as
# the pickle has been read (the bare open() call used before leaked it).
# NOTE(review): pickle executes arbitrary code on load -- only use trusted cache files.
with open('%s/cache/miao_map.pickle'%path,'rb') as f:
    str_map = pickle.load(f)

# Invert the stored mapping of each of these columns (swap keys and values).
for col in ['treatment','cell_line','time']:
    str_map[col] = {v:k for k,v in str_map[col].items()}

CPU times: user 416 ms, sys: 2.54 s, total: 2.96 s
Wall time: 2.95 s


In [4]:
# Marker columns, kept as two named groups.
valid_markers = ['p.GSK3b','p.MAPKAPK2','p.BTK']
test_markers = ["p.Akt.Ser473.", "p.ERK",  "p.HER2", "p.PLCg2","p.S6"]

# Combined list: valid markers first, then test markers.
ycols = [*valid_markers, *test_markers]

# Print the smallest observed value of every marker column.
for marker in ycols:
    marker_min = df[marker].min()
    print(marker, marker_min)

p.GSK3b 1.10118
p.MAPKAPK2 0.499066
p.BTK 0.543688
p.Akt.Ser473. 0.550685
p.ERK 0.442259
p.HER2 0.881374
p.PLCg2 0.986227
p.S6 2.39632


### Merge fileID_table

In [4]:
%%time
# Read the per-file metadata table on the GPU.
fileid = gd.read_csv(path/'FileID_table.csv')

# Columns that are left untouched (not integer-encoded, not added to newcols).
not_used = ['Unnamed: 0', 'cell_line', 'treatment','fileID','date','time']

# Every remaining column becomes a feature and is replaced by its factorize() codes.
newcols = [name for name in fileid.columns if name not in not_used]
for name in newcols:
    codes, _ = fileid[name].factorize()
    fileid[name] = codes

print('new cols',len(newcols))

# Hand the encoded table back to pandas for the merge with df.
fileid = fileid.to_pandas()

new cols 31
CPU times: user 4.18 s, sys: 1.9 s, total: 6.07 s
Wall time: 10.5 s


In [5]:
%%time
#df.drop('time',axis=1,inplace=True)
# Keep only the FileID features that df does not already carry.
# NOTE(review): re-running this cell after the merge empties newcols, because the
# merged columns are then present in df.
existing = set(df.columns)
newcols = [name for name in newcols if name not in existing]
print(newcols)

# Attach the remaining features to every row through its fileID.
fileid_features = fileid[[*newcols, 'fileID']]
df = df.merge(fileid_features, how='left', on='fileID')

['time_course', 'plate', 'PAM50', 'Neve', 'Marcotte', 'Gene.cluster', 'Classification', 'AR', 'HER2', 'PGR', 'ERa', 'Age', 'primary.tumor', 'metastatic.site', 'formed.mets', 'Growth.medium', 'Origin', 'Source', 'Lehmann', 'Basal.profile', 'Responce.profile', 'Lehmann.from.Lehmann', 'ER', 'PR', 'HER2.1', 'TP53', 'Source.1', 'Tumor.type', 'Ethnicity', 'Plate.Nr', 'fCluster']
CPU times: user 2.31 s, sys: 2.24 s, total: 4.56 s
Wall time: 4.55 s


In [6]:
# Number of rows counted as train + valid in the stored mapping.
N = str_map['num']['train'] + str_map['num']['valid']

# Work on an explicit copy: writing into the array returned by `.values` would
# modify the frame's own buffer behind pandas' back (and that array is read-only
# when pandas copy-on-write is enabled).
va = df['is_va'].values.copy()
# Every row from position N onward gets the label 2.
# NOTE(review): presumably these are the test rows -- confirm against the cache builder.
va[N:] = 2
df['is_va'] = va

### Get sequence id (seq_id) and step id (step_id)

In [7]:
%%time
# Columns needed on the GPU to derive the sequence ids.
cols = ['treatment','cell_line','cellID','time_course','fileID','time']

# `idx` stores each row's position in df so the original order can be restored
# after the GPU sorts. Using .assign() builds a new frame instead of writing into
# a slice of df (the previous `gdf = df[cols]; gdf['idx'] = ...` was a chained
# assignment whose SettingWithCopyWarning was silenced by the warnings filter).
gdf = df[cols].assign(idx=np.arange(df.shape[0]))
gdf = gd.from_pandas(gdf)
gdf.head()

CPU times: user 1.03 s, sys: 2.06 s, total: 3.1 s
Wall time: 3.29 s


Unnamed: 0,treatment,cell_line,cellID,time_course,fileID,time,idx
0,0,21,1,1,321,0,0
1,0,21,1,0,342,0,1
2,0,21,2,1,321,0,2
3,0,21,2,0,342,0,3
4,0,21,3,1,321,0,4


In [8]:
%%time

# Sort rows by treatment, cell_line, cellID, time_course and fileID so the kernel
# below can compare every row with the row directly before it.
cols = ['treatment','cell_line','cellID','time_course','fileID']
gdf = gdf.sort_values(cols)

# Kernel handed to apply_grouped below. It writes 1 into seq_id for every row
# that starts a new sequence and 0 for every other row.
#   time, time_course -- input columns (listed in `incols` below)
#   seq_id            -- output column (declared in `outcols` below)
# NOTE(review): presumably each invocation only sees the rows of one
# (treatment, cell_line, cellID) group -- confirm against the cuDF documentation.
def get_seq_id(time,seq_id,time_course):
    N = len(time)
    # Each thread starts at its own threadIdx.x and advances by blockDim.x.
    for i in range(cuda.threadIdx.x, N, cuda.blockDim.x):
        seq_id[i] = 0
        if i>0:
            # A time_course different from the previous row starts a new sequence.
            if time_course[i]!=time_course[i-1]:
                seq_id[i] = 1
            # So does a time value that is below the previous row's time by more than 1.
            elif time[i]<time[i-1]-1:# and (time[i] == 0 or time[i] == 1):
                seq_id[i] = 1
            
        else:
            # The first row (i == 0) always starts a sequence.
            seq_id[i] = 1
cols = ['treatment','cell_line','cellID']        
# NOTE(review): tpb is presumably the number of threads per block -- confirm.
gdf = gdf.groupby(cols,method="cudf",as_index=False).apply_grouped(get_seq_id,
                  incols=['time','time_course'],
                  outcols={'seq_id': np.int32},
                  tpb=32)

# Running sum of the 0/1 start flags: every row of a sequence ends up with the
# same integer id.
# NOTE(review): this relies on the row order of the frame returned by
# apply_grouped -- confirm that order is what the kernel saw.
gdf['seq_id'] = gdf['seq_id'].cumsum()
gdf = gdf.sort_values(['seq_id','time'])

# Kernel handed to apply_grouped below. It writes each row's position within the
# array it receives into step_id (0 for the first row, 1 for the second, ...).
#   time    -- input column, used only for its length
#   step_id -- output column (declared in `outcols` below)
def get_step_id(time,step_id):
    # Each thread starts at its own threadIdx.x and advances by blockDim.x.
    for i in range(cuda.threadIdx.x, len(time), cuda.blockDim.x):
        step_id[i] = i
        
# Rows are grouped by seq_id here; the frame was sorted by ['seq_id','time'] just
# before, so step_id presumably numbers the rows of a sequence in time order.
# NOTE(review): confirm that apply_grouped keeps that order within a group.
gdf = gdf.groupby('seq_id',method="cudf",as_index=False).apply_grouped(get_step_id,incols=['time'],
                  outcols={'step_id': np.int32},
                  tpb=32)
# Put the rows back into the original df order using the stored positions.
gdf = gdf.sort_values('idx')

gdf.head(20)

CPU times: user 596 ms, sys: 292 ms, total: 888 ms
Wall time: 1.62 s


Unnamed: 0,treatment,cell_line,cellID,time_course,fileID,time,idx,seq_id,step_id
1255775,0,21,1,1,321,0,0,319474,0
1255766,0,21,1,0,342,0,1,319472,0
1255796,0,21,2,1,321,0,2,319478,0
1255787,0,21,2,0,342,0,3,319476,0
1255817,0,21,3,1,321,0,4,319482,0
1255808,0,21,3,0,342,0,5,319480,0
1255838,0,21,4,1,321,0,6,319486,0
1255829,0,21,4,0,342,0,7,319484,0
1255859,0,21,5,1,321,0,8,319490,0
1255850,0,21,5,0,342,0,9,319488,0


In [9]:
# Count the rows of every sequence; the count ends up in the `time` column.
seq_groups = gdf.groupby('seq_id')
dg = seq_groups.agg({'time':'count'})

# Number of sequences for each length.
dg['time'].value_counts()

1    2989149
2    2386357
3     751314
6     404008
4     116019
5     100294
Name: time, dtype: int32

In [10]:
# The per-sequence row count becomes the `seq_len` feature.
dg.columns = ['seq_len']

# Attach seq_len to every row through seq_id, then put the rows back into the
# original df order using the stored positions in idx.
seq_lengths = dg.reset_index()
gdf = gdf.merge(seq_lengths, how='left', on='seq_id').sort_values('idx')

In [11]:
%%time
# gdf was sorted by idx in the previous cell, so its rows are copied into df
# by position.
for seq_col in ('seq_id', 'step_id', 'seq_len'):
    df[seq_col] = gdf[seq_col].to_pandas().values
#del gdf

CPU times: user 140 ms, sys: 324 ms, total: 464 ms
Wall time: 465 ms


In [12]:
# Number of rows (not sequences) for each sequence length.
seq_len_counts = df['seq_len'].value_counts()
seq_len_counts

2    4772714
1    2989149
6    2424048
3    2253942
5     501470
4     464076
Name: seq_len, dtype: int64

In [13]:
# Peek at a few rows that belong to sequences of length 1.
is_single_step = df['seq_len'] == 1
df.loc[is_single_step].head()

Unnamed: 0,treatment,cell_line,time,cellID,fileID,p.GSK3b,p.MAPKAPK2,p.BTK,p.Akt.Ser473.,p.ERK,...,HER2.1,TP53,Source.1,Tumor.type,Ethnicity,Plate.Nr,fCluster,seq_id,step_id,seq_len
30736,0,21,5,4272,344,3.26654,1.51823,1.907095,4.704089,3.516731,...,1,13,2,2,1,3,0,336556,0,1
30737,0,21,5,4273,344,5.29567,2.827672,2.788101,6.01901,4.997181,...,1,13,2,2,1,3,0,336560,0,1
30738,0,21,5,4274,344,1.10118,1.312255,1.197475,2.317964,2.525699,...,1,13,2,2,1,3,0,336564,0,1
30739,0,21,5,4275,344,4.27202,2.443686,2.648984,5.671833,5.346557,...,1,13,2,2,1,3,0,336568,0,1
30740,0,21,5,4276,344,4.272,2.350869,2.5837,5.593953,4.584862,...,1,13,2,2,1,3,0,336572,0,1


# Inspect every record of one specific cell (treatment 0, cell_line 21, cellID 4272).
inspect_cols = ['treatment','cell_line','cellID','time_course','fileID','time','seq_id','step_id']
mask = df['treatment'].eq(0) & df['cell_line'].eq(21) & df['cellID'].eq(4272)

# Keep only the id/time columns and order the records by time_course.
dt = df.loc[mask, inspect_cols].sort_values('time_course')
dt

In [14]:
# Display every column now present in df (markers, merged FileID features and
# the seq_id / step_id / seq_len columns added above).
df.columns

Index(['treatment', 'cell_line', 'time', 'cellID', 'fileID', 'p.GSK3b',
       'p.MAPKAPK2', 'p.BTK', 'p.Akt.Ser473.', 'p.ERK', 'p.HER2', 'p.PLCg2',
       'p.S6', 'b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67',
       'p.4EBP1', 'p.AKT.Thr308.', 'p.AMPK', 'p.CREB', 'p.FAK', 'p.H3',
       'p.JNK', 'p.MAP2K3', 'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB',
       'p.p38', 'p.p53', 'p.p90RSK', 'p.PDPK1', 'p.RB', 'p.S6K', 'p.SMAD23',
       'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5', 'is_va', 'time_course',
       'plate', 'PAM50', 'Neve', 'Marcotte', 'Gene.cluster', 'Classification',
       'AR', 'HER2', 'PGR', 'ERa', 'Age', 'primary.tumor', 'metastatic.site',
       'formed.mets', 'Growth.medium', 'Origin', 'Source', 'Lehmann',
       'Basal.profile', 'Responce.profile', 'Lehmann.from.Lehmann', 'ER', 'PR',
       'HER2.1', 'TP53', 'Source.1', 'Tumor.type', 'Ethnicity', 'Plate.Nr',
       'fCluster', 'seq_id', 'step_id', 'seq_len'],
      dtype='object')

In [15]:
# Marker columns, kept as two named groups.
valid_markers = [
    'p.GSK3b',
    'p.MAPKAPK2',
    'p.BTK',
]
test_markers = [
    "p.Akt.Ser473.",
    "p.ERK",
    "p.HER2",
    "p.PLCg2",
    "p.S6",
]

# Show the combined list: valid markers first, then test markers.
[*valid_markers, *test_markers]

['p.GSK3b',
 'p.MAPKAPK2',
 'p.BTK',
 'p.Akt.Ser473.',
 'p.ERK',
 'p.HER2',
 'p.PLCg2',
 'p.S6']

In [16]:
%%time
# Index levels: sequence-level keys and FileID features first, step_id last.
cols = ['seq_len', 'treatment', 'cell_line', 'is_va', *newcols, 'cellID', 'seq_id', 'step_id']

# Pivot the last index level (step_id) into columns, giving one row per sequence
# and one column per (original column, step_id) pair.
indexed = df.set_index(cols)
dt = indexed.unstack(level=-1)
print(dt.shape)
dt.head()

(6747141, 234)
CPU times: user 2min 25s, sys: 1min 28s, total: 3min 54s
Wall time: 3min 54s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,time,time,time,time,time,time,fileID,fileID,fileID,fileID,...,p.STAT3,p.STAT3,p.STAT3,p.STAT3,p.STAT5,p.STAT5,p.STAT5,p.STAT5,p.STAT5,p.STAT5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,step_id,0,1,2,3,4,5,0,1,2,3,...,2,3,4,5,0,1,2,3,4,5
seq_len,treatment,cell_line,is_va,time_course,plate,PAM50,Neve,Marcotte,Gene.cluster,Classification,AR,HER2,PGR,ERa,Age,primary.tumor,metastatic.site,formed.mets,Growth.medium,Origin,Source,Lehmann,Basal.profile,Responce.profile,Lehmann.from.Lehmann,ER,PR,HER2.1,TP53,Source.1,Tumor.type,Ethnicity,Plate.Nr,fCluster,cellID,seq_id,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2
1,0,0,1,0,1,5,3,4,0,0,1,1,1,1,0,1,1,1,10,5,0,2,0,0,0,2,0,1,11,0,0,0,3,4,11882,23763,5.0,,,,,,2660.0,,,,...,,,,,2.75527,,,,,
1,0,0,1,0,1,5,3,4,0,0,1,1,1,1,0,1,1,1,10,5,0,2,0,0,0,2,0,1,11,0,0,0,3,4,11883,23765,5.0,,,,,,2660.0,,,,...,,,,,1.95018,,,,,
1,0,0,1,0,1,5,3,4,0,0,1,1,1,1,0,1,1,1,10,5,0,2,0,0,0,2,0,1,11,0,0,0,3,4,11884,23767,5.0,,,,,,2660.0,,,,...,,,,,2.38754,,,,,
1,0,0,1,0,1,5,3,4,0,0,1,1,1,1,0,1,1,1,10,5,0,2,0,0,0,2,0,1,11,0,0,0,3,4,11885,23769,5.0,,,,,,2660.0,,,,...,,,,,3.03145,,,,,
1,0,0,1,0,1,5,3,4,0,0,1,1,1,1,0,1,1,1,10,5,0,2,0,0,0,2,0,1,11,0,0,0,3,4,11886,23771,5.0,,,,,,2660.0,,,,...,,,,,2.06792,,,,,


In [17]:
%%time
# Collapse the two column levels into flat names of the form "<column>_<step_id>".
cols = ['%s_%s' % pair for pair in dt.columns]
dt.columns = cols

# Turn the index levels back into ordinary columns, order the rows by is_va and
# renumber them from 0.
dt = (
    dt.reset_index()
      .sort_values('is_va')
      .reset_index(drop=True)
)
dt.head()

CPU times: user 8.8 s, sys: 13.7 s, total: 22.5 s
Wall time: 22.5 s


Unnamed: 0,seq_len,treatment,cell_line,is_va,time_course,plate,PAM50,Neve,Marcotte,Gene.cluster,...,p.STAT3_2,p.STAT3_3,p.STAT3_4,p.STAT3_5,p.STAT5_0,p.STAT5_1,p.STAT5_2,p.STAT5_3,p.STAT5_4,p.STAT5_5
0,6,0,49,0,1,0,3,4,6,3,...,0.846357,0.733101,2.123893,2.297142,2.720982,2.069571,3.305178,3.082339,3.354231,2.780798
1,2,0,49,0,0,0,3,4,6,3,...,,,,,2.14846,2.529249,,,,
2,2,0,49,0,0,0,3,4,6,3,...,,,,,2.828693,2.568433,,,,
3,2,0,49,0,0,0,3,4,6,3,...,,,,,2.866521,2.896402,,,,
4,2,0,49,0,0,0,3,4,6,3,...,,,,,2.176365,2.768626,,,,


In [18]:
%%time
# NOTE(review): this rebinds `path` from a Path to a str; re-running the FileID
# cell afterwards (which uses `path/'FileID_table.csv'`) would then fail.
path = str(path)

# Persist the wide, one-row-per-sequence table next to the other cached pickles.
out_file = '%s/cache/rnn_data.pickle'%path
dt.to_pickle(out_file)

CPU times: user 3.17 s, sys: 13.9 s, total: 17.1 s
Wall time: 21 s
