In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import os
import gc
import datetime
import time
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm

from category_encoders import CountEncoder
from xgboost import XGBClassifier


# Magic command so that matplotlib figures are rendered inline in the notebook
%matplotlib inline 

In [None]:
# Load the training feature table and preview its first rows.
train_features = pd.read_csv('./input/train_features.csv')
train_features.head()

In [None]:
# Load the scored training targets and preview the first rows.
train_targets = pd.read_csv('./input/train_targets_scored.csv')
train_targets.head()

In [None]:
# Load the test feature table and preview its first rows.
test_features = pd.read_csv('./input/test_features.csv')
test_features.head()

In [None]:
# Stack train and test features into a single frame for the exploratory plots below.
# NOTE: concat keeps each frame's own index here, so index labels repeat in `dataset`.
dataset = pd.concat([train_features, test_features])
dataset.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the combined train+test frame.
dataset.info()

In [None]:
# Row counts per `cp_type` value over train+test (trailing ';' hides the Axes repr).
sns.countplot(x='cp_type', data=dataset);

In [None]:
# Row counts per `cp_time` value over train+test (trailing ';' hides the Axes repr,
# matching the `cp_type` plot cell).
sns.countplot(x='cp_time', data=dataset);

In [None]:
# Row counts per `cp_dose` value over train+test (trailing ';' hides the Axes repr,
# matching the `cp_type` plot cell).
sns.countplot(x='cp_dose', data=dataset);

In [None]:
# Distribution of the 'g-1' column over train+test.
# NOTE(review): seaborn has deprecated `distplot` in favour of `histplot`/`displot`;
# consider migrating once the installed seaborn version is confirmed to provide them.
sns.distplot(dataset['g-1'])

In [None]:
# Sample submission file: provides the submission layout.
ss = pd.read_csv('./input/sample_submission.csv')

# Every column of the sample submission except the row identifier.
cols = []
for column_name in ss.columns.values:
    if column_name != 'sig_id':
        cols.append(column_name)

In [None]:
# def log_loss_metric(y_true, y_pred):
#     y_pred_clip = np.clip(y_pred, 1e-15, 1 - 1e-15)
#     loss = - np.mean(np.mean(y_true * np.log(y_pred_clip) + (1 - y_true) * np.log(1 - y_pred_clip), axis = 1))
#     return loss

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with the treatment columns integer-encoded.

    ``cp_type`` is mapped ``'trt_cp' -> 0`` / ``'ctl_vehicle' -> 1`` and
    ``cp_dose`` is mapped ``'D1' -> 0`` / ``'D2' -> 1``. Values outside these
    mappings become NaN (standard ``Series.map`` behaviour).

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely: encoding an already-encoded frame in place would
    turn every value into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cp_type`` and ``cp_dose`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.

    Raises
    ------
    KeyError
        If ``cp_type`` or ``cp_dose`` is missing from ``df``.
    """
    # `assign` builds a new frame and replaces the columns outright, so the
    # encoded columns get a numeric dtype instead of staying `object`.
    return df.assign(
        cp_type=df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1}),
        cp_dose=df['cp_dose'].map({'D1': 0, 'D2': 1}),
    )

In [None]:
train, test = preprocess(train_features), preprocess(test_features)

# The first column is the id column; everything after it goes into the arrays.
X, X_test = (frame.iloc[:, 1:].to_numpy() for frame in (train, test))
y = train_targets.iloc[:, 1:].to_numpy()

In [None]:
# One XGBoost classifier per target column.
classifier = MultiOutputClassifier(XGBClassifier())

# Pipeline is used instead of make_pipeline because the steps are named explicitly
# (the step names are what the `classify__estimator__*` parameter keys refer to).
clf = Pipeline(
    steps=[
        ('encode', CountEncoder(cols=[0, 2])),
        ('classify', classifier),
    ]
)

In [None]:
# Hyper-parameters for the XGBClassifier wrapped inside the 'classify' step;
# keys follow the `<step>__estimator__<param>` naming used by `set_params`.
params = dict(
    classify__estimator__colsample_bytree=0.6522,
    classify__estimator__gamma=3.6975,
    classify__estimator__learning_rate=0.0503,
    classify__estimator__max_delta_step=2.0706,
    classify__estimator__max_depth=10,
    classify__estimator__min_child_weight=31.5800,
    classify__estimator__n_estimators=166,
    classify__estimator__subsample=0.8639,
)

# Assign the return value so the notebook does not echo the pipeline repr.
_ = clf.set_params(**params)

In [None]:
# Number of cross-validation folds. It drives both the KFold splitter and the
# averaging of the per-fold test predictions, so the two can never disagree.
NFOLDS = 5

# Out-of-fold predictions for the training rows and fold-averaged test predictions.
oof_preds = np.zeros(y.shape)
test_preds = np.zeros((test.shape[0], y.shape[1]))
oof_losses = []
kf = KFold(n_splits=NFOLDS)
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold: ', fn)
    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]

    # drop where cp_type==ctl_vehicle (baseline)
#     ctl_mask = X_train[:,0]=='ctl_vehicle'
#     X_train = X_train[~ctl_mask,:]
#     y_train = y_train[~ctl_mask]

    clf.fit(X_train, y_train)
    val_preds = clf.predict_proba(X_val) # list of preds per class
    # Stack the per-target outputs, keep the positive-class column and transpose
    # so rows are samples and columns are targets (same layout as `y_val`).
    val_preds = np.array(val_preds)[:,:,1].T # take the positive class
    oof_preds[val_idx] = val_preds

    # Flatten the 2-D arrays to 1-D before computing log_loss
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_losses.append(loss)
    preds = clf.predict_proba(X_test)
    preds = np.array(preds)[:,:,1].T # take the positive class
    # Each fold contributes an equal share to the final test prediction.
    test_preds += preds / NFOLDS

print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))

In [None]:
# control_mask = train['cp_type']=='ctl_vehicle'
# oof_preds[control_mask] = 0

# Single log loss over all out-of-fold predictions, flattened to 1-D.
print(
    'OOF log loss: ',
    log_loss(y.ravel(), oof_preds.ravel()),
)

In [None]:
# control_mask = test['cp_type']=='ctl_vehicle'
# test_preds[control_mask] = 0

In [None]:
# Build the submission from the sample-submission frame loaded earlier (`ss`):
# keep the first (id) column and overwrite every other column with the
# fold-averaged test predictions. Working on a copy leaves `ss` unchanged.
sub = ss.copy()
sub.iloc[:, 1:] = test_preds
sub.to_csv('submission.csv', index=False)

In [None]:
# scaler = StandardScaler()
# X = scaler.fit_transform(train.values[:, top_feats])
# x_tt = scaler.transform(test_features.values[:, top_feats])

# kernel = Nystroem(kernel = 'rbf', n_components = 100, random_state = 0)
# X = kernel.fit_transform(X)
# x_tt = kernel.transform(x_tt)

In [None]:
# N_STARTS = 3
# N_SPLITS = 5

# res = train_targets.copy()
# ss.loc[:, train_targets.columns] = 0
# res.loc[:, train_targets.columns] = 0

# for tar in tqdm(range(train_targets.shape[1])):
    
#     start_time = time.time()
#     targets = train_targets.values[:, tar]
    
#     if targets.sum() >= N_SPLITS:
        
#         for seed in range(N_STARTS):

#             skf = StratifiedKFold(n_splits = N_SPLITS, random_state = seed, shuffle = True)

#             for n, (tr, te) in enumerate(skf.split(targets, targets)):
                
#                 x_tr, x_val = X[tr], X[te]
#                 y_tr, y_val = targets[tr], targets[te]
                
#                 model = LogisticRegression(random_state = seed)
#                 model.fit(x_tr, y_tr)
#                 ss.loc[:, train_targets.columns[tar]] += model.predict_proba(x_tt)[:, 1] / (N_SPLITS * N_STARTS)
#                 res.loc[te, train_targets.columns[tar]] += model.predict_proba(x_val)[:, 1] / N_STARTS            
    
#     score = log_loss(train_targets.loc[:, train_targets.columns[tar]].values, res.loc[:, train_targets.columns[tar]].values)
#     print(f'[{str(datetime.timedelta(seconds = time.time() - start_time))[2:7]}] Target {tar}:', score)

In [None]:
# print(f'Model OOF Metric: {log_loss_metric(train_targets.values, res.values)}')
# res.loc[train['cp_type'] == 1, train_targets.columns] = 0
# ss.loc[test['cp_type'] == 1, train_targets.columns] = 0
# print(f'Model OOF Metric with postprocessing: {log_loss_metric(train_targets.values, res.values)}')

In [None]:
# np.save('klr_oof.npy', res[cols].values)
# np.save('klr_sub.npy', ss[cols].values)
# ss.to_csv('submission.csv', index = False)