In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss, make_scorer

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data
# Load the competition CSVs and build an all-zero submission template.

DATA_DIR = '/kaggle/input/lish-moa/'

train_features = pd.read_csv(DATA_DIR + 'train_features.csv')
train_targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
test_features = pd.read_csv(DATA_DIR + 'test_features.csv')

# Template with one row per test sample and the same columns as the training targets.
# Filled with 0.0 (float) rather than 0 (int): predicted probabilities are written into
# these columns later, and assigning floats into int64 columns relies on silent
# upcasting, which recent pandas versions deprecate.
test_targets = pd.DataFrame(0.0, index=range(len(test_features)), columns=train_targets.columns)
# Replace the placeholder identifier column with the real test identifiers.
test_targets['sig_id'] = test_features['sig_id']

In [None]:
# Preprocess

# Boolean row masks: True where cp_type equals 'trt_cp'.
# They must be computed before cp_type is dropped below.
trt_mask = train_features['cp_type'].eq('trt_cp')
test_trt_mask = test_features['cp_type'].eq('trt_cp')

# For both feature frames: encode cp_dose as a boolean flag (True for 'D1'),
# then remove the identifier and the raw categorical columns.
for frame in (train_features, test_features):
    frame['is_D1'] = frame['cp_dose'].eq('D1')
    frame.drop(columns=['sig_id','cp_type','cp_dose'], inplace=True)

# Remove the identifier column from the targets as well.
train_targets.drop(columns=['sig_id'], inplace=True)

In [None]:
# Features

def _columns_with_prefix(prefix):
    """Return the names of the train_features columns starting with `prefix`, in frame order."""
    return list(filter(lambda name: name.startswith(prefix), train_features.columns))

GENES = _columns_with_prefix('g-')  # 772
CELLS = _columns_with_prefix('c-')  # 100
# Non-PCA columns passed through unchanged
CAT = ['cp_time', 'is_D1']

In [None]:
def pca_apply(N_GENES, N_CELLS, random_state=None):
    """Reduce the dimensionality of the train and test data using PCA.

    The GENES and CELLS column groups are reduced independently. Each PCA is
    fitted on the masked train rows and masked test rows stacked together
    (rows selected by ``trt_mask`` / ``test_trt_mask``), then applied to each
    set separately. The CAT columns are passed through without reduction.

    Parameters
    ----------
    N_GENES : int
        Number of principal components kept for the GENES columns.
    N_CELLS : int
        Number of principal components kept for the CELLS columns.
    random_state : int or None, default None
        Forwarded to ``PCA`` so results can be made reproducible when a
        randomized solver is selected. ``None`` keeps the previous behaviour.

    Returns
    -------
    (df_X, df_test) : tuple of numpy.ndarray
        Float matrices for the masked train and test rows, with columns ordered
        as CAT columns, gene components, cell components.
    """

    def _reduce(columns, n_components):
        # Fit one PCA on train + test rows of `columns`, return both projections.
        train_block = train_features.loc[trt_mask, columns]
        test_block = test_features.loc[test_trt_mask, columns]
        pca = PCA(n_components=n_components, random_state=random_state)
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
        pca.fit(pd.concat([train_block, test_block], ignore_index=True))
        return pca.transform(train_block), pca.transform(test_block)

    X_genes, test_genes = _reduce(GENES, N_GENES)
    X_cells, test_cells = _reduce(CELLS, N_CELLS)

    # CAT mixes an integer and a boolean column; without an explicit dtype
    # to_numpy() would produce an object array and make the whole matrix object-typed.
    X_cat = train_features.loc[trt_mask, CAT].to_numpy(dtype=float)
    test_cat = test_features.loc[test_trt_mask, CAT].to_numpy(dtype=float)

    df_X = np.hstack([X_cat, X_genes, X_cells])
    df_test = np.hstack([test_cat, test_genes, test_cells])

    return df_X, df_test

In [None]:
# Models
# One XGBoost classifier per target column, each tuned with a small grid search.

# 20 gene components and 5 cell components, masked rows only.
X, test = pca_apply(20,5)
Y = train_targets.loc[trt_mask,:].to_numpy()

# Positional indices of the target columns in Y.
COLS = list(range(Y.shape[1]))

parameters = {'n_estimators':[10,15,20],
              'reg_lambda':[5,10]
             }
Y_pred = []   # per-target probability of the positive class on the test rows
loss = []     # per-target cross-validated log loss (positive, lower is better)

# greater_is_better=False makes the scorer return the NEGATED log loss so that
# GridSearchCV can maximise it. labels=[0,1] is forwarded to log_loss so both
# classes are declared explicitly instead of being inferred from each fold.
# NOTE(review): `needs_proba` is deprecated in recent scikit-learn releases in favour
# of `response_method='predict_proba'` - update when the environment is upgraded.
log_loss_function = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0,1])


for col in COLS:

    # Hyperparameters
    search = GridSearchCV(XGBClassifier(objective='binary:logistic', learning_rate=1, gamma=10),
                          parameters, cv=4, scoring=log_loss_function,
                          return_train_score=False)

    search.fit(X, Y[:,col])

    # Predict: column 1 of predict_proba is the probability of class 1.
    Y_pred.append(search.predict_proba(test)[:,1])

    # Loss: best_score_ is the negated log loss, so flip the sign to report
    # an actual (non-negative) loss value.
    col_loss = -search.best_score_
    loss.append(col_loss)

    # Last field is the running mean loss over the targets processed so far.
    print(col, "\t loss: ", col_loss, "\t", search.best_params_, "  \t", np.mean(loss))

In [None]:
# Export results

# Y_pred holds one probability vector per target; transpose to (rows, targets).
predictions = np.asarray(Y_pred).transpose()

# Label slice from the first target column through the last column of the template.
# Rows not selected by the mask keep the template's initial zeros.
first_target = '5-alpha_reductase_inhibitor'
test_targets.loc[test_trt_mask, first_target:] = predictions

test_targets.to_csv('submission.csv', index=False)