In [1]:
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# import lightgbm as lgb

In [2]:
path = "../data/de_train.parquet"
train_df = pd.read_parquet(path,engine="pyarrow")
train_df.head(2)

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.10472,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.88438,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.70478,1.096702,-0.869887


In [3]:
path = '../data/id_map.csv'
df_id_map = pd.read_csv(path)
print(df_id_map.shape)

(255, 3)


In [4]:
df_id_map.head(2)

Unnamed: 0,id,cell_type,sm_name
0,0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...
1,1,B cells,ABT-199 (GDC-0199)


In [5]:
n_components = 35
n_components_for_compound_encoding = 20


# predict_method = 'train_aggregation_by_compounds'
# predict_method = 'train_aggregation_by_compounds_with_denoising_pca'
# predict_method = 'train_aggregation_by_compounds_with_denoising_ICA'
predict_method = 'train_aggregation_by_compounds_with_denoising_TSVD'

if '_pca' in predict_method:
    str_inf_target_dimred = 'PCA' 
    reducer = PCA(n_components=n_components )
elif '_ICA' in predict_method:
    str_inf_target_dimred = 'ICA' 
#     reducer = PCA(n_components=n_components )
    reducer = FastICA(n_components=n_components, random_state=0, whiten='unit-variance')
elif '_TSVD' in predict_method:
    str_inf_target_dimred = 'TSVD' 
#     reducer = PCA(n_components=n_components )
#     reducer = FastICA(n_components=n_components, random_state=0, whiten='unit-variance')
    reducer = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)
else:
    str_inf_target_dimred = ''
    
print(str_inf_target_dimred, reducer)

TSVD TruncatedSVD(n_components=35, n_iter=7, random_state=42)


In [6]:
Y = train_df.iloc[:,5:].values
Yr = reducer.fit_transform(Y)

In [7]:
train_df.shape

(614, 18216)

In [8]:
Yr.shape

(614, 35)

In [9]:
Yr[:, :n_components_for_compound_encoding]

array([[ 2.89373825e+01,  3.25693174e+00, -2.17718795e+00, ...,
         1.71697383e+00,  2.43747294e+00,  1.17374369e+00],
       [ 9.34854576e+00,  8.66256211e+00, -2.06019648e+00, ...,
         9.57226672e-01,  4.68278086e+00, -3.77149246e+00],
       [-3.58110979e+01,  6.78206995e+00, -4.38171568e+00, ...,
         2.03040417e+00, -1.03710292e+00, -3.52523495e+00],
       ...,
       [-5.03158567e+00,  6.43942181e+00, -2.87129303e-04, ...,
         4.14887610e+00,  5.19326099e+00, -3.74356351e-01],
       [-5.28922641e+01,  7.50236438e+00, -2.89089984e+00, ...,
         1.12076210e+00,  1.74396997e+00, -1.35842003e+00],
       [-6.67468860e+01,  8.28862864e+00, -4.76962697e+00, ...,
        -5.42812517e+00,  2.95258313e+00, -2.17275842e+00]])

In [10]:
df_tmp = pd.DataFrame(Yr[:, :n_components_for_compound_encoding], index = train_df.index  )
df_tmp['column for aggregation'] = train_df['sm_name']
df_tmp.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,column for aggregation
0,28.937383,3.256932,-2.177188,0.468545,-5.621313,4.551657,0.579008,-1.745717,-0.093655,-5.538261,...,1.484589,1.648768,-1.687988,-0.827857,0.02023,-3.380221,1.716974,2.437473,1.173744,Clotrimazole
1,9.348546,8.662562,-2.060196,0.569955,-2.757469,5.832881,-3.394118,-3.942866,0.464306,-5.546544,...,-3.477309,0.317478,-1.808493,-0.95153,-0.743172,-4.011046,0.957227,4.682781,-3.771492,Clotrimazole


In [11]:
df_compound_encoded = df_tmp.groupby('column for aggregation').mean() # quantile( quantile )
print('df_compound_encoded.shape', df_compound_encoded.shape )

df_compound_encoded.shape (146, 20)


In [12]:
df_compound_encoded.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
column for aggregation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,63.586319,-1.465497,-0.562572,-1.732934,20.9966,34.568289,-6.739745,-3.576149,0.356735,-25.661156,-2.904752,-4.330203,14.512692,2.759831,-0.440801,-1.129141,-3.808963,12.068773,10.358412,-1.580439
ABT-199 (GDC-0199),21.00697,1.129267,0.303375,-3.547908,-2.140733,-5.49211,2.184463,-0.790178,-0.359174,0.998044,0.166214,-0.462383,-1.396626,2.500967,-1.491096,-1.264126,1.647789,2.790226,1.024438,-0.689964


In [13]:
X = pd.DataFrame(index = train_df['sm_name']) # 
X['IX'] = np.arange(len(train_df))
X = X.join( df_compound_encoded , how = 'left').sort_values('IX')
X = X.iloc[:,1:]
X = X.values 
print( X.shape )
#X.head(3)

(614, 20)


In [14]:
X_submit = pd.DataFrame(index = df_id_map['sm_name']) # 
X_submit['IX'] = np.arange(len(X_submit))
X_submit = X_submit.join( df_compound_encoded , how = 'left').sort_values('IX')
X_submit = X_submit.iloc[:,1:]
X_submit = X_submit.values 
print( X_submit.shape )

(255, 20)


In [15]:
train_df.head(3)

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.10472,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.88438,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.70478,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.21355,0.415768,0.078439,-0.259365


In [27]:
genes = train_df.columns[5:] # 18211 genes
temp = train_df.groupby(['cell_type']).size() > 20
validation_cell_types = temp[temp].index # 4 cell types
validation_cell_types

Index(['NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells'], dtype='object', name='cell_type')

In [34]:
train_sm_names = train_df.query("cell_type == 'B cells'").sm_name.values # 17 compounds including the two control compounds

features = ['cell_type', 'sm_name']

def cross_val_svd(model, label, n_components=5):
    mrrmse_list = []
    for fold, val_cell_type in enumerate(validation_cell_types):
        mask_va = (train_df.cell_type == val_cell_type) & \
            ~train_df.sm_name.isin(train_sm_names)
        mask_tr = ~mask_va # 485 or 487 training rows
        
        train = train_df[mask_tr]
        val = train_df[mask_va]
        y_true = val[genes]
        
        svd = TruncatedSVD(n_components=n_components, random_state=1)
        z_tr = svd.fit_transform(train[genes])
        print(train[features])
        model.fit(train[features], z_tr)
        y_pred = svd.inverse_transform(model.predict(val[features]))
        
        mrrmse = np.sqrt(np.square(y_true - y_pred).mean(axis=1)).mean()
        print(f"# Fold {fold}: {mrrmse:5.3f} val='{val_cell_type}'")
        mrrmse_list.append(mrrmse)
    mrrmse = np.array(mrrmse_list).mean()
    print(f"# Overall {mrrmse:5.3f} {label}")
    return

In [36]:
model =  MultiOutputRegressor( make_pipeline(StandardScaler(), SVR(C=20, epsilon=0.3, kernel = 'linear' ) ) )
#cross_val_svd(model, 'LABEL',n_components=n_components)

In [37]:
n_splits = 2
kf = KFold(n_splits=n_splits, random_state = 42, shuffle = True )
IX = pd.DataFrame(); IX['IX'] = range(len(train_df))

alpha_regularization_for_linear_models = 100000
model = Ridge(alpha=alpha_regularization_for_linear_models)

params = {
    'num_leaves': 5,
    'learning_rate': 0.1,
    'verbose': -1,
    'num_boost_round': 20,    # Number of boosting iterations (trees)
}

# model = MultiOutputRegressor( lgb.LGBMRegressor( **params) )
model =  MultiOutputRegressor( make_pipeline(StandardScaler(), SVR(C=20, epsilon=0.3, kernel = 'linear' ) ) )


Y = train_df.iloc[:,5:].values

Y_reduced_submit = np.zeros( (len(df_id_map) , Yr.shape[1] )   ); 
cnt_blend_submit = 0
Y_submit = np.zeros( (len(df_id_map) , 18211 )   ); 
cnt_blend_submit = 0

print(X.shape)
for i, (train_index, test_index) in enumerate(kf.split(IX)):
    print(f"Fold {i}:", len(test_index) )
    # print(f"  Train: index={train_index}")
    # print(f"  Test:  index={test_index}")
    
    Yr_train = reducer.fit_transform(Y[train_index,:])
    X_train = X[train_index,:]
    print(X_train)
    model.fit(X_train, Yr_train)
    
    Yr_test = reducer.transform(Y[test_index,:])
    X_test = X[test_index,:]
    Yr_pred = model.predict(X_test) # , Yr_train)
    r2 = r2_score(Yr_test,  Yr_pred )
    print('r2 test:', r2)
 
    r2_test = r2    
    
    Yr_pred = model.predict(X_train) # , Yr_train)
    r2 = r2_score(Yr_train,  Yr_pred )
    print('r2 train', r2)
    

    Y_submit =  (Y_submit * cnt_blend_submit + reducer.inverse_transform(model.predict(X_submit))) / \
    ( cnt_blend_submit + 1)
    
    cnt_blend_submit += 1
    
    
s = mean_squared_error(Y,Y_oof, squared = False)
print(s)

(614, 20)
Fold 0: 307
[[ 14.79796212   4.21953585  -1.74883442 ...   1.15631244   1.60635239
   -2.00377306]
 [ 75.953835    12.82768496   4.79327079 ...  95.12559209  35.74021811
  -29.66366746]
 [ 33.08062697   0.32728429  -7.85582797 ...   3.05723302   3.39977004
   -0.1253318 ]
 ...
 [ -6.26769079   0.45515642  -1.67035554 ...   0.78078761   1.86036041
   -1.13810841]
 [-27.4267271    5.62066525  -1.72503153 ...   0.70573366   3.33026952
   -1.01651906]
 [-27.4267271    5.62066525  -1.72503153 ...   0.70573366   3.33026952
   -1.01651906]]
r2 test: 0.04486505380880883
r2 train 0.21193333043478044
Fold 1: 307
[[ 14.79796212   4.21953585  -1.74883442 ...   1.15631244   1.60635239
   -2.00377306]
 [ 14.79796212   4.21953585  -1.74883442 ...   1.15631244   1.60635239
   -2.00377306]
 [ 14.79796212   4.21953585  -1.74883442 ...   1.15631244   1.60635239
   -2.00377306]
 ...
 [ -6.26769079   0.45515642  -1.67035554 ...   0.78078761   1.86036041
   -1.13810841]
 [-27.4267271    5.62066525

NameError: name 'Y_oof' is not defined

In [None]:
Y_submit.shape

In [None]:
df_submit = pd.DataFrame(Y_submit, columns = train_df.columns[5:])
df_submit.index.name = 'id'
print( df_submit.shape )
display(df_submit)
df_submit.to_csv('submission.csv')