In [68]:
import cv2
import copy
import time
import pathlib
import random
import warnings
import datetime
import scipy as sp
import pandas as pd
import numpy as np
from time import time
import lightgbm as lgb
from catboost import Pool
from functools import partial
import matplotlib.pyplot as plt
from catboost import CatBoost
from sklearn import preprocessing
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
from scipy.interpolate import UnivariateSpline
from hyperopt import hp, tpe, Trials, fmin, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, average_precision_score
# Notebook-wide display settings.
# pandas: never truncate columns and show up to 1000 rows when rendering a frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000
# numpy: print floating point values with 5 digits of precision.
np.set_printoptions(precision=5)
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [33]:
def pr_auc_metric(y_predicted, y_true):
    """Custom evaluation function reporting average precision under the name 'pr_auc'.

    Parameters
    ----------
    y_predicted : array-like
        Predicted scores, forwarded as the score argument of
        ``average_precision_score``.
    y_true : object exposing ``get_label()``
        Despite its name this is not a label array: the labels are obtained
        by calling ``get_label()`` on it. (When this function is passed as
        ``feval`` to ``lgb.train`` this is presumably the evaluation
        ``lgb.Dataset`` - confirm against the LightGBM version in use.)

    Returns
    -------
    tuple
        ``('pr_auc', score, True)``. The trailing ``True`` is the
        higher-is-better flag of LightGBM's ``feval`` convention.
    """
    labels = y_true.get_label()
    score = average_precision_score(labels, y_predicted)
    return 'pr_auc', score, True

In [93]:
# Directory holding the competition CSVs, relative to the notebook's working directory.
filepath = "../../../../pythonwork/20200529atma/input/"

# train / test: one row per spectrum; fitting: extra per-spectrum columns joined on
# "spectrum_id" during feature engineering.
train, test, fitting = (
    pd.read_csv(filepath + csv_name)
    for csv_name in ("train.csv", "test.csv", "fitting.csv")
)
# Submission template whose "target" column is filled in at the end of the notebook.
sample_submission = pd.read_csv(filepath + "atmacup5__sample_submission.csv")
# Raw spectra in long format (spectrum_filename, wavelength, intensity).
spec_df = pd.read_csv(filepath + "spec.csv")

# Feature engineering (FE)

In [160]:
def transform(df, fitting_df=None, spec=None):
    """Build the model-ready feature frame for a train or test table.

    Steps:
      1. left-join the per-spectrum fitting columns on ``spectrum_id``;
      2. aggregate every raw spectrum into ``intensity_max/min/mean/std`` plus
         ``root_len``, the number of points where an interpolating spline of
         the spectrum crosses half of its maximum intensity;
      3. left-join those aggregates on ``spectrum_filename``;
      4. drop the identifier columns and add ``layout_dis``, the Euclidean
         norm of ``(layout_x, layout_y)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``spectrum_id``, ``spectrum_filename``, ``chip_id``,
        ``layout_x`` and ``layout_y``. It is not modified.
    fitting_df : pandas.DataFrame, optional
        Table keyed by ``spectrum_id``. Defaults to the notebook-level
        ``fitting`` frame.
    spec : pandas.DataFrame, optional
        Long-format spectra with ``spectrum_filename``, ``wavelength`` and
        ``intensity`` columns. Defaults to the notebook-level ``spec_df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per row of ``df``.
    """
    # Fall back to the notebook-level frames so existing ``transform(df)`` calls keep working.
    if fitting_df is None:
        fitting_df = fitting
    if spec is None:
        spec = spec_df

    # merge original csv and fitting data (merge returns a new frame, df is untouched)
    new_df = pd.merge(df, fitting_df, on="spectrum_id", how="left")

    # aggregate spec information
    spectra = spec.groupby("spectrum_filename")
    spec_agg = spectra["intensity"].agg(["max", "min", "mean", "std"])
    spec_agg.columns = ["intensity_" + c for c in spec_agg.columns]

    # Count half-maximum crossings per spectrum. The counts are keyed by filename
    # so they are attached to spec_agg by index label rather than by list position.
    root_counts = {}
    for filename, file_df in spectra:
        x = file_df["wavelength"].values
        y = file_df["intensity"].values
        # s=0 makes the spline interpolate the samples exactly.
        spline = UnivariateSpline(x, y - np.max(y) / 2, s=0)
        root_counts[filename] = len(spline.roots())
    spec_agg["root_len"] = pd.Series(root_counts, dtype="int64")

    # merge original csv and spec information
    new_df = pd.merge(new_df, spec_agg.reset_index(), on="spectrum_filename", how="left")

    # remove unnecessary columns
    new_df = new_df.drop(["spectrum_id", "spectrum_filename", "chip_id"], axis=1)

    # create new variables
    new_df["layout_dis"] = np.sqrt(new_df["layout_x"]**2 + new_df["layout_y"]**2)

    return new_df
# Engineer features for both splits; the train frame keeps its "target" column,
# which is why it ends up one column wider than the test frame.
new_train, new_test = transform(train), transform(test)
print(f'train shape: {new_train.shape}')
print(f'test shape: {new_test.shape}')

train shape: (7436, 21)
test shape: (6952, 20)


In [161]:
# Preview the first rows of the engineered training frame.
new_train.head()

Unnamed: 0,exc_wl,layout_a,layout_x,layout_y,pos_x,target,params0,params1,params2,params3,params4,params5,params6,rms,beta,intensity_max,intensity_min,intensity_mean,intensity_std,root_len,layout_dis
0,850,2,36,140,1313.081,0,30.808589,581.1802,1037.714752,1.531423,22469.651641,1032.317268,8.29561,10.028668,0.02521298,1751.0,-228.0,40.292752,172.206792,1,144.554488
1,780,3,0,168,159.415,0,91.300897,17405.82,1080.510452,4.766233,33257.123175,1077.468855,8.018225,7.948485,0.3435612,4219.0,-263.0,166.958984,463.428363,2,168.0
2,780,1,34,29,-610.7688,0,106.642946,1e-10,1119.464438,2.0,42579.867913,1378.883338,11.687417,10.739859,2.348528e-15,2412.0,-235.0,151.577691,327.857694,2,44.687806
3,780,2,32,139,1214.618,0,306.933674,10994.86,1139.855067,5.198692,39349.741703,1145.212849,9.445029,10.379948,0.2183921,3209.0,-52.0,523.080947,436.48141,2,142.6359
4,780,0,45,85,-257.6161,0,46.133256,22276.22,1120.918337,5.668012,31054.928673,1117.107782,7.65871,8.31655,0.4176962,3998.0,-245.0,138.187717,472.009931,2,96.17692


# modelling

## LightGBM (lgb)

In [162]:
# Feature columns handed to LightGBM as categorical features.
categoricals = ['exc_wl', 'layout_a']

# Booster parameters shared by every CV fold.
lgbm_params = dict(
    objective='binary',
    metric='None',        # no built-in metric: evaluation relies on the custom feval
    boosting_type='gbdt',
    tree_learner='serial',
    learning_rate=0.05,
    random_seed=44,
    max_depth=3,
)

def modelling(new_train, new_test, lgbm_params, n_folds=5, categorical_features=None):
    """Train LightGBM with stratified K-fold CV and average the fold models on the test set.

    Parameters
    ----------
    new_train : pandas.DataFrame
        Feature frame containing a ``target`` column.
    new_test : pandas.DataFrame
        Feature frame with the same feature columns as ``new_train`` (no ``target``).
    lgbm_params : dict
        Parameters forwarded to ``lgb.train``.
    n_folds : int, default 5
        Number of stratified folds.
    categorical_features : list of str, optional
        Columns passed to LightGBM as categorical. Defaults to the
        notebook-level ``categoricals`` list. Names that are not present after
        constant columns are dropped are ignored.

    Returns
    -------
    score : float
        Average precision of the out-of-fold predictions.
    pred_value : numpy.ndarray
        Test predictions averaged over the fold models.
    feature_importance_df : pandas.DataFrame
        Per-fold feature importances plus ``Average``, ``Std`` and ``Cv``
        (``Std / Average``) columns.
    valid_lgb : pandas.DataFrame
        Single-column frame of out-of-fold predictions in the row order of
        ``new_train``.
    """
    if categorical_features is None:
        categorical_features = categoricals

    X_train = new_train.drop(['target'], axis=1)
    y_train = new_train.target.copy()

    # Columns with zero standard deviation carry no information; drop them from both splits.
    remove_features = [col for col in X_train.columns if X_train[col].std() == 0]
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.drop(remove_features, axis=1)

    # Only declare categoricals that survived the constant-column filter.
    used_categoricals = [col for col in categorical_features if col in X_train.columns]

    skf = StratifiedKFold(n_splits=n_folds)
    models = []
    # Out-of-fold predictions, stored at each row's original position.
    oof_pred = np.zeros(X_train.shape[0])
    feature_importance_df = pd.DataFrame(list(X_train.columns), columns=["Feature"])

    for i, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        X_fit = X_train.iloc[train_index, :]
        y_fit = y_train.iloc[train_index]
        X_valid = X_train.iloc[valid_index, :]
        y_valid = y_train.iloc[valid_index]

        lgb_train = lgb.Dataset(X_fit, y_fit)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword arguments were
        # removed from `lgb.train` in LightGBM 4.0 in favour of callbacks; they are kept
        # to match the LightGBM version this notebook was run with - confirm before upgrading.
        clf = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=10000, early_stopping_rounds=10, verbose_eval=1000,
                        feval=pr_auc_metric, categorical_feature=used_categoricals)

        # Predict the validation fold once and reuse the result.
        oof_pred[valid_index] = clf.predict(X_valid, num_iteration=clf.best_iteration)
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()
        models.append(clf)

    # Columns 1..n_folds are the per-fold importances ("Feature" is column 0).
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    real = y_train.values
    score = average_precision_score(real, oof_pred)
    print("average precision score = {}".format(score))
    # Rounding thresholds the predicted probabilities at 0.5.
    print(confusion_matrix(real, np.round(oof_pred)))

    # Average the test predictions of the fold models.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration=model.best_iteration) / len(models)

    valid_lgb = pd.DataFrame(oof_pred)
    return score, pred_value, feature_importance_df, valid_lgb

# Run the CV pipeline; the fourth return value (out-of-fold predictions) is discarded.
metric, pred_value, feature_importance_df, _ = modelling(new_train, new_test, lgbm_params)

Fold 1
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[207]	training's pr_auc: 0.992157	valid_1's pr_auc: 0.812085
Fold 2
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[180]	training's pr_auc: 0.986191	valid_1's pr_auc: 0.811803
Fold 3
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[286]	training's pr_auc: 0.999083	valid_1's pr_auc: 0.840971
Fold 4
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[162]	training's pr_auc: 0.972749	valid_1's pr_auc: 0.870349
Fold 5
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[154]	training's pr_auc: 0.974504	valid_1's pr_auc: 0.834798
average precision score = 0.8265681204839699
[[7183   17]
 [  94  142]]


In [164]:
# Rank features by "Cv" (Std / Average of the per-fold importances), lowest first.
feature_importance_df.sort_values("Cv", ascending=True).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,intensity_min,70,52,76,51,67,63.2,9.987993,0.158038
1,intensity_max,146,159,202,148,121,155.2,26.498302,0.170736
2,root_len,15,16,11,20,13,15.0,3.03315,0.20221
3,params6,151,114,165,103,102,127.0,26.038433,0.205027
4,intensity_std,57,51,97,71,51,65.4,17.408044,0.266178
5,params5,51,63,81,51,35,56.2,15.263027,0.271584
6,params2,100,72,132,64,80,89.6,24.344198,0.271699
7,rms,95,89,95,53,48,76.0,20.995238,0.276253
8,params1,155,118,178,87,94,126.4,35.080479,0.277535
9,layout_dis,48,37,36,20,28,33.8,9.389356,0.277792


# submission

In [165]:
final_pred = pred_value
score = metric
# Fill the submission template with the averaged test predictions.
sample_submission["target"] = final_pred
# Tag the file name with the CV score at a fixed six decimals. Slicing str(score)
# depended on the length of the float's repr and produced an empty tag for short
# values such as 0.85.
# NOTE(review): the prefix says "atmacup3" while the input template is "atmacup5" - confirm the intended name.
score_tag = f"{score:.6f}"
sample_submission.to_csv("../../../20200529atma/result/atmacup3_sample_submission" + score_tag + ".csv", index=False)

In [166]:
# Preview the submission frame after its "target" column has been filled in.
sample_submission.head()

Unnamed: 0,target
0,0.00023
1,0.000207
2,0.000377
3,0.000299
4,0.007845
