# Data

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from rdkit import Chem
from rdkit.Chem import RDKFingerprint
import numpy as np

In [2]:
de_train = pd.read_parquet(f"./train/de_train.parquet")
de_train.head()

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.10472,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.88438,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.70478,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.21355,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.2247,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629


Encode cell types

In [3]:
cell_types = de_train.iloc[:,0:1]
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(cell_types)
cell_types_ohe = ohe.transform(cell_types)
cell_types_ohe = cell_types_ohe.tolist()
de_train['cell_type_ohe'] = cell_types_ohe
de_train.head()

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,cell_type_ohe
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.10472,-0.077524,-1.625596,-0.144545,0.143555,...,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.88438,0.371834,-0.081677,-0.498266,...,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.70478,1.096702,-0.869887,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.21355,0.415768,0.078439,-0.259365,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.704643,0.015468,-0.103868,0.865027,0.189114,0.2247,-0.048233,0.216139,-0.085024,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"


Encode perturbagens:

In [4]:
perts = de_train["SMILES"]
print(len(perts))
perts = perts.dropna()
print(len(perts))
cs_arr = []
for pert in perts:
  try:
    cs = Chem.CanonSmiles(pert)
    cs_arr.append(cs)
  except:
    print('Invalid SMILES: ', pert)
print(len(cs_arr))

614
614
614


In [5]:
mols = [Chem.MolFromSmiles(x) for x in cs_arr]
fps = [np.array(RDKFingerprint(mol)) for mol in mols]
de_train['fingerprint'] = fps
de_train.head()

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,cell_type_ohe,fingerprint
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.10472,-0.077524,-1.625596,-0.144545,0.143555,...,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ..."
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.88438,0.371834,-0.081677,-0.498266,...,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.70478,1.096702,-0.869887,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ..."
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.153123,0.183597,-0.555678,-1.494789,-0.21355,0.415768,0.078439,-0.259365,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ..."
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.015468,-0.103868,0.865027,0.189114,0.2247,-0.048233,0.216139,-0.085024,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ..."
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, ..."


Combine 2 input features (cell type & perturbagen) into 1:

In [6]:
res_arr = []
for i in range(len(de_train)):
  ct_ohe = np.array(de_train.iloc[i,-2:-1][0])
  fingerprint = np.array(de_train.iloc[i,-1:][0])
  res = np.concatenate((ct_ohe, fingerprint), axis=0)
  res_arr.append(res)
de_train['merged'] = res_arr
de_train.head()

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,cell_type_ohe,fingerprint,merged
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.10472,-0.077524,-1.625596,-0.144545,0.143555,...,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.88438,0.371834,-0.081677,-0.498266,...,-0.333905,-0.315516,-0.369626,-0.095079,0.70478,1.096702,-0.869887,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,0.183597,-0.555678,-1.494789,-0.21355,0.415768,0.078439,-0.259365,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,-0.103868,0.865027,0.189114,0.2247,-0.048233,0.216139,-0.085024,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."


# Blend

In [7]:
from numpy import hstack
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import sklearn

In [8]:
def get_models():
    models = []
    models.append(('lr', LinearRegression()))
    models.append(('knn', KNeighborsRegressor()))
    models.append(('cart', DecisionTreeRegressor()))
    models.append(('svm', SVR()))
    return models

In [9]:
def fit_ensemble(models, X_train, X_val, y_train, y_val):
    # fit all models on the training set and predict on hold out set
    meta_X = []
    for name, model in models:
    # fit in training set
         model.fit(X_train, y_train)
         # predict on hold out set
         yhat = model.predict(X_val)
         #print(yhat.shape)
         # reshape predictions into a matrix with one column
         yhat = yhat.reshape(len(yhat), 1)
         # store predictions as input for blending
         meta_X.append(yhat)
         # create 2d array from predictions, each set is an input feature
    meta_X = hstack(meta_X)
    # define blending model
    blender = LinearRegression()
     # fit on predictions from base models
    blender.fit(meta_X, y_val)
    return blender

In [10]:
# make a prediction with the blending ensemble
def predict_ensemble(models, blender, X_test):
    # make predictions with base models
    meta_X = []
    for name, model in models:
         # predict with base model
         yhat = model.predict(X_test)
         # reshape predictions into a matrix with one column
         yhat = yhat.reshape(len(yhat), 1)
         # store prediction
         meta_X.append(yhat)
    # create 2d array from predictions, each set is an input feature
    meta_X = hstack(meta_X)
    # predict
    return blender.predict(meta_X)

In [11]:
b_cells_df = de_train[de_train["cell_type"] == "B cells"]
my_cells_df = de_train[de_train["cell_type"] == "Myeloid cells"]
b_cells_idxes = list(b_cells_df.index)
my_cells_idxes = list(my_cells_df.index)

In [12]:
tmp_x = de_train.iloc[:,-1:]
arr = []
for i in range(len(tmp_x)):
    if i in b_cells_idxes or i in my_cells_idxes:
        continue
    arr.append(tmp_x.iloc[i,0])
X = np.array(arr)
print(X.shape)
tmp_y = de_train.iloc[:,5:-3]
#tmp_y = de_train.iloc[:,5:105]
arr = []
for i in range(len(tmp_y)):
    if i in b_cells_idxes or i in my_cells_idxes:
        continue
    arr.append(tmp_y.iloc[i].values)
Y = np.array(arr)
print(Y.shape)

(580, 2054)
(580, 18211)


In [13]:
tmp_x = de_train.iloc[:,-1:]
arr = []
for i in range(len(tmp_x)):
    if i in b_cells_idxes:
        arr.append(tmp_x.iloc[i,0])
X_b = np.array(arr)
print(X_b.shape)

#tmp_y = de_train.iloc[:,5:105]
tmp_y = de_train.iloc[:,5:-3]
arr = []
for i in range(len(tmp_y)):
    if i in b_cells_idxes:
        arr.append(tmp_y.iloc[i].values)
Y_b = np.array(arr)
print(Y_b.shape)

(17, 2054)
(17, 18211)


In [14]:
tmp_x = de_train.iloc[:,-1:]
arr = []
for i in range(len(tmp_x)):
    if i in my_cells_idxes:
        arr.append(tmp_x.iloc[i,0])
X_my = np.array(arr)
print(X_my.shape)

#tmp_y = de_train.iloc[:,5:105]
tmp_y = de_train.iloc[:,5:-3]
arr = []
for i in range(len(tmp_y)):
    if i in my_cells_idxes:
        arr.append(tmp_y.iloc[i].values)
Y_my = np.array(arr)
print(Y_my.shape)

(17, 2054)
(17, 18211)


In [15]:
train_x_split, test_x_split, train_y_split, test_y_split = train_test_split(X, Y, test_size=0.01, random_state=42) 
train_x, val_x, train_y, val_y = train_test_split(train_x_split, train_y_split, test_size=0.01, random_state=42)
print(len(val_x))
    
train_xb, test_xb, train_yb, test_yb = train_test_split(X_b, Y_b, test_size=0.2, random_state=42)
train_xb, val_xb, train_yb, val_yb = train_test_split(train_xb, train_yb, test_size=0.7, random_state=42)
print(len(val_xb))
print(len(test_xb))

train_xmy, test_xmy, train_ymy, test_ymy = train_test_split(X_my, Y_my, test_size=0.2, random_state=42)
train_xmy, val_xmy, train_ymy, val_ymy = train_test_split(train_xmy, train_ymy, test_size=0.7, random_state=42)
print(len(val_xmy))
print(len(test_xmy))
    
X_train = np.concatenate((train_xb, train_xmy, train_x), axis=0)
y_train = np.concatenate((train_yb, train_ymy, train_y), axis=0)
X_val = np.concatenate((val_xb, val_xmy, val_x), axis=0)
y_val = np.concatenate((val_yb, val_ymy, val_y), axis=0) 
X_test = np.concatenate((test_xb, test_xmy, test_x_split), axis=0)
y_test = np.concatenate((test_yb, test_ymy, test_y_split), axis=0)
# summarize data split
print('Train: %s, Val: %s, Test: %s' % (X_train.shape, X_val.shape, X_test.shape))

6
10
4
10
4
Train: (574, 2054), Val: (26, 2054), Test: (14, 2054)


In [16]:
def mrrmse_np(y_pred, y_true):
    return np.sqrt(np.square(y_true - y_pred).mean(axis=1)).mean()

In [17]:
from tqdm import tqdm
import math

In [18]:
glob_models = []
ctr = 0
for i in tqdm(range(Y.shape[1])):
    best_model = None
    best_score = math.inf
    loc_models = get_models()
    blender = fit_ensemble(loc_models, X_train, X_val, y_train[:, i], y_val[:, i])
    yhat = predict_ensemble(loc_models, blender, X_test)
    score = sklearn.metrics.mean_squared_error(y_test[:, 1], yhat)
    if score < best_score:
        best_score = score
        best_model = (loc_models, blender)
    # print('Blending MAE: %.3f' % score)
    loc_bs_models = get_models()
    is_better = False
    for name, model in loc_bs_models:
        model.fit(X_train, y_train[:, i])
        yhat = model.predict(X_test)
        score = sklearn.metrics.mean_squared_error(y_test[:, i], yhat)
        # print('>%s MAE: %.3f' % (name, bs_score))
        if score < best_score:
            best_model = model
            best_score = score
    glob_models.append(best_model)

 64%|██████▍   | 11659/18211 [5:01:42<2:47:29,  1.53s/it]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|██████████| 18211/18211 [7:49:55<00:00,  1.55s/it]  


On a small experiment, for 36 / 50 times, one of the baseline models performed better than the ensemble one. 

For some genes, a single model is enough; for others, ensembles performs better. 

# Prepare Predictions

In [19]:
id_map = pd.read_csv("./train/id_map.csv", sep=",", dtype=str)
id_map.columns
id_map.head()

Unnamed: 0,id,cell_type,sm_name
0,0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...
1,1,B cells,ABT-199 (GDC-0199)
2,2,B cells,ABT737
3,3,B cells,AMD-070 (hydrochloride)
4,4,B cells,AT 7867


In [20]:
ohe_col = []
for x in id_map.cell_type:
  if x == "B cells":
    ohe_col.append([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
  else:
    ohe_col.append([0.0, 1.0, 0.0, 0.0, 0.0, 0.0])
id_map["ohe_col"] = ohe_col
id_map.head()

Unnamed: 0,id,cell_type,sm_name,ohe_col
0,0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
1,1,B cells,ABT-199 (GDC-0199),"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,2,B cells,ABT737,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,3,B cells,AMD-070 (hydrochloride),"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,4,B cells,AT 7867,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [21]:
cp_df = de_train[["sm_name", "SMILES"]].drop_duplicates()
cp_df.head()

Unnamed: 0,sm_name,SMILES
0,Clotrimazole,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
4,Mometasone Furoate,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
8,Idelalisib,CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...
14,Vandetanib,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1
18,Bosutinib,COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc...


In [22]:
cs_arr = []
sm_arr = cp_df["SMILES"]
for sm in sm_arr:
  try:
    cs = Chem.CanonSmiles(sm)
    cs_arr.append(cs)
  except:
    print('Invalid SMILES: ', sm)
print(len(cs_arr))

146


In [23]:
mols = [Chem.MolFromSmiles(x) for x in cs_arr]
fps = [np.array(RDKFingerprint(mol)) for mol in mols]
cp_df['fingerprint'] = fps
cp_df["fingerprint"] = cp_df["fingerprint"].apply(lambda x: np.array(x))
cp_df.head()

Unnamed: 0,sm_name,SMILES,fingerprint
0,Clotrimazole,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,"[0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ..."
4,Mometasone Furoate,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,"[1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, ..."
8,Idelalisib,CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...,"[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, ..."
14,Vandetanib,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,"[1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, ..."
18,Bosutinib,COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc...,"[1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ..."


In [24]:
id_map_merged = pd.merge(id_map, cp_df, on="sm_name", how='inner')
assert len(id_map) == len(id_map_merged)
id_map_merged.head()

Unnamed: 0,id,cell_type,sm_name,ohe_col,SMILES,fingerprint
0,0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C,"[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, ..."
1,128,Myeloid cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C,"[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, ..."
2,1,B cells,ABT-199 (GDC-0199),"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ..."
3,129,Myeloid cells,ABT-199 (GDC-0199),"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ..."
4,2,B cells,ABT737,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",CN(C)CC[C@H](CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)...,"[0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [25]:
res_arr = []
for i in range(len(id_map_merged)):
  ct_ohe = np.array(id_map_merged.iloc[i,-3:-2][0])
  fingerprint = np.array(id_map_merged.iloc[i,-1:][0])
  res = np.concatenate((ct_ohe, fingerprint), axis=0)
  res_arr.append(res)
id_map_merged['merged'] = res_arr
id_map_merged.head()

Unnamed: 0,id,cell_type,sm_name,ohe_col,SMILES,fingerprint,merged
0,0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C,"[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."
1,128,Myeloid cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C,"[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."
2,1,B cells,ABT-199 (GDC-0199),"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, ..."
3,129,Myeloid cells,ABT-199 (GDC-0199),"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,"[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, ..."
4,2,B cells,ABT737,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",CN(C)CC[C@H](CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)...,"[0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ..."


In [26]:
tmp_id_x = id_map_merged.iloc[:,-1:]
arr = np.empty(len(tmp_id_x), dtype=object)
for i in range(len(tmp_id_x)):
  arr[i] = tmp_id_x.iloc[i,0]
id_X = arr
id_X = np.vstack(id_X)
print(id_X.shape)
genes = np.array(de_train.columns[5:18216])
print(genes[-1])

(255, 2054)
ZZEF1


In [36]:
def predict(model, x):
    try:
        models = model[0]
        blender = model[1]
        return predict_ensemble(models, blender, x)
    except:
        return model.predict(x)

In [37]:
predict(glob_models[1], X_test)

array([ 0.10510787,  0.28035497,  1.69671062,  2.59249249,  0.38109985,
        0.28035497,  1.69671062,  2.59249249, -0.06498003, -0.1682473 ,
       -0.42519547, -0.39448931, -0.25920667, -0.07457715])

Predict on the Kaggle submission set:

In [39]:
id_y_pred = np.array([predict(model, id_X) for model in tqdm(glob_models)]).T
id_y_pred.shape

100%|██████████| 18211/18211 [42:48<00:00,  7.09it/s] 


(255, 18211)

In [40]:
out_fh = open("./submission_ensemble.csv", 'w')
out_fh.write("id")
for g in genes:
  out_fh.write("," + g)
out_fh.write("\n")
ctr = 0
for row in id_y_pred:
    out_fh.write(str(ctr))
    for col in row:
        out_fh.write(',' + str(col))
    out_fh.write('\n')
    ctr += 1
out_fh.close()

In [41]:
!kaggle competitions submit -c open-problems-single-cell-perturbations -f ./submission_ensemble.csv -m "lgbm"

/bin/bash: kaggle: command not found
