In [1]:
# library

import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import lightgbm as lgb
from sklearn.model_selection import train_test_split


In [2]:
# data load

# Directory holding the competition files. Defined once so the notebook can be
# pointed at another location without editing every read call; set the
# SCP_DATA_DIR environment variable to override the default below.
DATA_DIR = os.environ.get(
    "SCP_DATA_DIR",
    "/home/aiuser/taeuk/open-problems-single-cell-perturbations",
)

# Training table (parquet), id map (CSV) and sample submission (CSV).
de_train = pd.read_parquet(os.path.join(DATA_DIR, "de_train.parquet"))
id_map = pd.read_csv(os.path.join(DATA_DIR, "id_map.csv"))
# NOTE: the spelling `submisstion` is kept on purpose so that any other cell
# already referring to this name keeps working.
submisstion = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

In [3]:
# Number of rows for each value of `control`, counted over the columns at
# positions 1 and 4 of de_train (position 4 supplies the grouping key).
(
    de_train
    .iloc[:, [1, 4]]
    .groupby("control")
    .count()
)

Unnamed: 0_level_0,sm_name
control,Unnamed: 1_level_1
False,602
True,12


In [4]:
# Per-`control` mean of the columns at positions 5-19 of de_train
# (position 4, the first column of the slice, is the grouping key).
(
    de_train
    .iloc[:, 4:20]
    .groupby("control")
    .mean()
)

Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,AAR2,AARS,AARS2
control,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
False,0.324189,0.206141,0.213337,0.164029,0.614011,0.876088,-0.006655,0.326709,-0.034642,0.024108,0.148769,0.039187,0.199448,0.182943,0.19142
True,2.502865,2.498949,-3.116513,-0.260439,4.209188,4.467209,0.363602,1.426473,2.220072,1.996812,7.762612,5.392893,0.247453,5.099949,1.393589


In [5]:
# data preprocessing

# Categorical identifiers that get one-hot encoded.
features = ["cell_type", "sm_name"]

# Encode the training rows and the submission rows separately; the two frames
# can therefore end up with different dummy columns.
X0 = pd.get_dummies(de_train[features])
X0_submit = pd.get_dummies(id_map[features])
print("X0 shape :",X0.shape)
print("X0_submit shape :",X0_submit.shape)

# Align the training dummies to X0_submit: keep exactly its columns, in its
# order. Columns that only exist in the training data are dropped, and a
# column that only exists in X0_submit is added as all-False, so both matrices
# always share the same layout.
X0 = X0.reindex(columns=X0_submit.columns, fill_value=False)
print("X0 shape :",X0.shape)

# Put the raw identifier columns back in front (joined on the row index) so
# later cells can merge on "cell_type" / "sm_name".
X0 = de_train[features].join(X0)
X0.head()

X0 shape : (614, 152)
X0_submit shape : (255, 131)
X0 shape : (614, 131)


Unnamed: 0,cell_type,sm_name,cell_type_B cells,cell_type_Myeloid cells,sm_name_5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,sm_name_ABT-199 (GDC-0199),sm_name_ABT737,sm_name_AMD-070 (hydrochloride),sm_name_AT 7867,sm_name_AT13387,...,sm_name_Tivozanib,sm_name_Topotecan,sm_name_Tosedostat,sm_name_Trametinib,sm_name_UNII-BXU45ZH6LI,sm_name_Vandetanib,sm_name_Vanoxerine,sm_name_Vardenafil,sm_name_Vorinostat,sm_name_YK 4-279
0,NK cells,Clotrimazole,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,T cells CD4+,Clotrimazole,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,T cells CD8+,Clotrimazole,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,T regulatory cells,Clotrimazole,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,NK cells,Mometasone Furoate,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# mean by cell & mean by compound
#
# Each table keeps one identifier column (position 0 -> "cell_type",
# position 1 -> "sm_name") plus every column from position 5 onwards, and
# averages those columns within each identifier value.

means_cell = (
    de_train
    .iloc[:, [0, *range(5, de_train.shape[1])]]
    .groupby("cell_type")
    .mean()
    .reset_index()
)
means_comp = (
    de_train
    .iloc[:, [1, *range(5, de_train.shape[1])]]
    .groupby("sm_name")
    .mean()
    .reset_index()
)

# Use the 5 pairs found earlier as features:
#   (B cells, Myeloid cells)
#   (B cells, NK cells)
#   (B cells, T cells CD4+)
#   (Myeloid cells, NK cells)
#   (T cells CD4+, NK cells)   <- the pair(s) in question have a correlation
#                                 coefficient of 0.5 or more for 17 genes
#
# Idea: additionally use
#   B cells       --> NK cells, T cells CD4+
#   Myeloid cells --> NK cells
# as features.
#
# For now, add the feature as follows (cell type : partner cell type):
#   B cells            : T cells CD4+
#   Myeloid cells      : NK cells
#   NK cells           : Myeloid cells
#   T cells CD4+       : B cells
#   T cells CD8+       : T cells CD4+
#   T regulatory cells : T cells CD4+
key_map = {
    "B cells": "T cells CD4+",
    "Myeloid cells": "NK cells",
    "NK cells": "Myeloid cells",
    "T cells CD4+": "B cells",
    "T cells CD8+": "T cells CD4+",
    "T regulatory cells": "T cells CD4+",
}

# mrrmse (heading only: no code follows it in this cell)


In [8]:
# Example of A2M: build the feature matrix for one gene and fit a regressor.

gene_names = de_train.columns[5:]
gene = gene_names[2]

# Attach the per-cell-type mean, then the per-compound mean, of this gene.
# Both lookup tables carry a column named after the gene, so after the second
# merge pandas disambiguates them with its default suffixes (`_x`, `_y`).
# NOTE(review): means_cell / means_comp are averaged over all rows of
# de_train, including rows that land in X_test below - confirm that the
# hold-out rows are meant to contribute to their own features.
X = (
    X0
    .merge(means_cell[["cell_type", gene]], how="left", on="cell_type")
    .merge(means_comp[["sm_name", gene]], how="left", on="sm_name")
)

# Third feature: for every cell type, the gene mean of its partner cell type
# as given by key_map. Values are read from the untouched copy (means_split)
# and written into permute_cell, so earlier writes never affect later reads.
means_split = means_cell[["cell_type", gene]].copy()
permute_cell = means_split.copy()
for cell in means_split["cell_type"]:
    is_target = permute_cell["cell_type"] == cell
    is_partner = means_split["cell_type"] == key_map[cell]
    permute_cell.loc[is_target, gene] = means_split.loc[is_partner, gene].to_numpy()

# Add the partner mean, then drop the raw identifier columns so only the
# one-hot and mean features remain.
X = (
    X
    .merge(permute_cell, how="left", on="cell_type")
    .drop(columns=["cell_type", "sm_name"])
)
y = de_train[gene]

# 70/30 hold-out split with a fixed seed, default LightGBM regressor.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1108
)
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)
pred = model.predict(X_test)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.168196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 429, number of used features: 2
[LightGBM] [Info] Start training from score 0.140582
