In [9]:
import cv2
import copy
import time
import pathlib
import random
import warnings
import datetime
import scipy as sp
import pandas as pd
import numpy as np
from time import time
import lightgbm as lgb
from catboost import Pool
from functools import partial
import matplotlib.pyplot as plt
from scipy.stats import laplace
from catboost import CatBoost
from keras.models import Model
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.optimize import curve_fit
from keras.layers import Dense, Input
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
from scipy.interpolate import UnivariateSpline
from hyperopt import hp, tpe, Trials, fmin, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, average_precision_score, r2_score
# Notebook-wide display configuration: show every DataFrame column and up to
# 1000 rows, print NumPy floats with 5 digits of precision, and silence all
# Python warnings for the rest of the session.
for _option_name, _option_value in (('display.max_columns', None), ("display.max_rows", 1000)):
    pd.set_option(_option_name, _option_value)
np.set_printoptions(precision=5)
warnings.filterwarnings("ignore")

In [10]:
def pr_auc_metric(y_predicted, y_true):
    """Custom evaluation function reporting average precision as ``pr_auc``.

    Passed as ``feval`` to ``lgb.train`` in this notebook.

    Parameters
    ----------
    y_predicted : array-like
        Predicted scores for the evaluated rows.
    y_true : object exposing ``get_label()``
        Evaluation dataset; the ground-truth labels are read via ``get_label()``.

    Returns
    -------
    tuple
        ``('pr_auc', score, True)``. The trailing ``True`` is the
        "higher is better" flag of LightGBM's custom-metric protocol.
    """
    labels = y_true.get_label()
    pr_auc = average_precision_score(labels, y_predicted)
    return 'pr_auc', pr_auc, True

def mixture_laplace(x, loc1, scale1, coef1, loc2, scale2, coef2, loc3, scale3, coef3):
    """Evaluate a three-component Laplace mixture density at ``x``.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which the mixture is evaluated.
    loc1, loc2, loc3 : float
        Location of each Laplace component.
    scale1, scale2, scale3 : float
        Log-scale of each component; the actual scale is ``exp(scale)`` so
        that it is always positive whatever value an optimiser proposes.
    coef1, coef2, coef3 : float
        Unnormalised mixture weights. They are divided by their sum, so the
        effective weights always add up to 1.

    Returns
    -------
    float or numpy.ndarray
        Weighted sum of the three Laplace pdfs evaluated at ``x``.
    """
    # Normalise against the ORIGINAL sum. Normalising one coefficient at a
    # time (overwriting coef1 before it is used in the denominator of coef2
    # and coef3) yields weights that do not sum to 1.
    total = coef1 + coef2 + coef3
    weights = (coef1 / total, coef2 / total, coef3 / total)

    locs = (loc1, loc2, loc3)
    # Make the scales positive by exponentiating the raw parameters.
    scales = (np.exp(scale1), np.exp(scale2), np.exp(scale3))

    components = [
        laplace(loc, scale).pdf(x) * weight
        for loc, scale, weight in zip(locs, scales, weights)
    ]
    return components[0] + components[1] + components[2]

In [11]:
# Directory holding the competition input CSVs (relative to this notebook).
filepath = "../../../../pythonwork/20200529atma/input/"


def _read_input(filename):
    """Read one CSV from the input directory into a DataFrame."""
    return pd.read_csv(filepath + filename)


# Tabular inputs.
train = _read_input("train.csv")
test = _read_input("test.csv")
fitting = _read_input("fitting.csv")
sample_submission = _read_input("atmacup5__sample_submission.csv")
# Spectrum / waveform inputs.
spec_df = _read_input("spec.csv")
wave_df = _read_input("wave_df.csv")
wave_test = _read_input("wave_test.csv")

In [12]:
# Append the training labels to the wave table as a `target` column
# (pd.concat with axis=1 aligns the two objects on their index).
# Guarded so that re-running this cell does not append a duplicate column.
if "target" not in wave_df.columns:
    wave_df = pd.concat([wave_df, train.target], axis=1)

# Feature engineering (FE)

In [121]:
#clf = TSNE(n_components=2)
#X = wave_df.values.T
#z = clf.fit_transform(X)
#projected_df = pd.DataFrame(z, columns=['project_0', 'project_1'])
#projected_df= pd.concat([train, projected_df], axis=1)

In [14]:
def pos_x_modify(row):
    """Return the row's ``layout_x``, shifted by 47 when ``layout_a`` is 2 or 3.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row Series or a dict)
        Must provide the keys ``"layout_a"`` and ``"layout_x"``.

    Returns
    -------
    The (possibly shifted) x position. ``row`` itself is left untouched, so
    calling this never changes the caller's data as a side effect.
    """
    # NOTE(review): 47 presumably is the x extent of one layout block - confirm.
    layout_x = row["layout_x"]
    if row["layout_a"] in (2, 3):
        layout_x = layout_x + 47
    return layout_x

def pos_y_modify(row):
    """Return the row's ``layout_y``, shifted by 191 when ``layout_a`` is 1 or 3.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row Series or a dict)
        Must provide the keys ``"layout_a"`` and ``"layout_y"``.

    Returns
    -------
    The (possibly shifted) y position. ``row`` itself is left untouched, so
    calling this never changes the caller's data as a side effect.
    """
    # NOTE(review): 191 presumably is the y extent of one layout block - confirm.
    layout_y = row["layout_y"]
    if row["layout_a"] in (1, 3):
        layout_y = layout_y + 191
    return layout_y

def _fft_amplitude_q95(intensity):
    """Return the 0.95 quantile of the FFT amplitude of one spectrum's intensities."""
    amplitude = np.abs(np.fft.fft(intensity.to_numpy()))
    return np.quantile(amplitude, 0.95)


def transform(df):
    """Build the model feature table for ``train`` or ``test``.

    Steps:
      1. left-join the notebook-level ``fitting`` table on ``spectrum_id``;
      2. aggregate ``spec_df["intensity"]`` per ``spectrum_filename``
         (max / min / mean / std plus the 0.95 quantile of the FFT amplitude)
         and left-join the result on ``spectrum_filename``;
      3. drop the identifier columns (``chip_id`` is kept only when a
         ``target`` column is present, i.e. for the training frame);
      4. offset ``layout_x`` / ``layout_y`` according to ``layout_a`` and add
         two parameter-ratio features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``spectrum_id``, ``spectrum_filename``, ``chip_id``,
        ``layout_a``, ``layout_x`` and ``layout_y``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features.
    """
    # merge original csv and fitting data (pd.merge returns a new frame)
    new_df = pd.merge(df, fitting, on="spectrum_id", how="left")

    # aggregate spec information
    intensity_by_file = spec_df.groupby("spectrum_filename")["intensity"]
    spec_agg = intensity_by_file.agg(["max", "min", "mean", "std"])
    spec_agg.columns = ["intensity_" + c for c in spec_agg.columns]
    # Attach the FFT feature by index (spectrum_filename) instead of relying on
    # two separate groupby passes iterating in the same positional order.
    spec_agg["amp_0.95"] = intensity_by_file.apply(_fft_amplitude_q95)
    # (A mixture-of-Laplace curve-fit R^2 feature was tried here and is disabled.)

    # merge original csv and spec information
    new_df = pd.merge(new_df, spec_agg.reset_index(), on="spectrum_filename", how="left")

    # remove unnecessary columns; chip_id is kept only when labels are present
    drop_cols = ["spectrum_id", "spectrum_filename"]
    if "target" not in new_df.columns:
        drop_cols.append("chip_id")
    new_df = new_df.drop(drop_cols, axis=1)

    # create new variables
    # Same offsets as pos_x_modify / pos_y_modify, but vectorised: a row-wise
    # apply is slow and upcasts the positions to float in an all-numeric frame.
    shift_x = new_df["layout_a"].isin([2, 3])
    shift_y = new_df["layout_a"].isin([1, 3])
    new_df["layout_x"] = new_df["layout_x"] + np.where(shift_x, 47, 0)
    new_df["layout_y"] = new_df["layout_y"] + np.where(shift_y, 191, 0)
    new_df["ratio2_5"] = new_df["params2"] / new_df["params5"] # I don't know the meaning, but seems effective
    new_df["ratio4_1"] = new_df["params4"] / new_df["params1"] # I don't know the meaning, but seems effective

    return new_df
# Build the engineered training features and report their dimensions.
new_train = transform(train)
print(f'train shape: {new_train.shape}')

# Same for the test set.
new_test = transform(test)
print(f'test shape: {new_test.shape}')

train shape: (7436, 23)
test shape: (6952, 21)


In [117]:
# Preview the first five rows of the engineered training frame.
new_train.head(5)

Unnamed: 0,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,target,params0,params1,params2,params3,params4,params5,params6,rms,beta,intensity_max,intensity_min,intensity_mean,intensity_std,amp_0.95,ratio2_5,ratio4_1
0,79ad4647da6de6425abf,850,2,83,140,1313.081,0,30.808589,581.1802,1037.714752,1.531423,22469.651641,1032.317268,8.29561,10.028668,0.02521298,1751.0,-228.0,40.292752,172.206792,7643.154333,1.005229,38.66211
1,79ad4647da6de6425abf,780,3,47,359,159.415,0,91.300897,17405.82,1080.510452,4.766233,33257.123175,1077.468855,8.018225,7.948485,0.3435612,4219.0,-263.0,166.958984,463.428363,28918.58781,1.002823,1.91069
2,c695a1e61e002b34e556,780,1,34,220,-610.7688,0,106.642946,1e-10,1119.464438,2.0,42579.867913,1378.883338,11.687417,10.739859,2.348528e-15,2412.0,-235.0,151.577691,327.857694,18130.78214,0.811863,425798700000000.0
3,c695a1e61e002b34e556,780,2,79,139,1214.618,0,306.933674,10994.86,1139.855067,5.198692,39349.741703,1145.212849,9.445029,10.379948,0.2183921,3209.0,-52.0,523.080947,436.48141,22149.537147,0.995322,3.57892
4,c695a1e61e002b34e556,780,0,45,85,-257.6161,0,46.133256,22276.22,1120.918337,5.668012,31054.928673,1117.107782,7.65871,8.31655,0.4176962,3998.0,-245.0,138.187717,472.009931,30874.913315,1.003411,1.394084


# modelling

## LightGBM (lgb)

In [118]:
# Columns handed to LightGBM as categorical features.
categoricals = ['layout_a']

# LightGBM training parameters. The built-in metric is set to the string
# 'None'; evaluation is done through the custom `feval` passed to lgb.train.
lgbm_params = {
    'objective': 'binary',
    'metric': 'None',
    'boosting_type': 'gbdt',
    'tree_learner': 'serial',
    'learning_rate': 0.05,
    'random_seed': 44,
    'max_depth': 3,
}

def modelling(new_train, new_test, lgbm_params):
    """Train LightGBM with chip-grouped 6-fold CV and predict the test set.

    Parameters
    ----------
    new_train : pandas.DataFrame
        Training features including the ``target`` and ``chip_id`` columns.
        ``chip_id`` is only used as the grouping key of ``GroupKFold``; it is
        never given to the model.
    new_test : pandas.DataFrame
        Test features; must contain every non-constant training feature.
    lgbm_params : dict
        Parameters forwarded to ``lgb.train``.

    Returns
    -------
    score : float
        Average precision over the concatenated out-of-fold predictions.
    pred_value : numpy.ndarray
        Mean test prediction of the retained fold models.
    feature_importance_df : pandas.DataFrame
        Per-fold feature importances plus ``Average``, ``Std`` and ``Cv``.
    valid_lgb : pandas.DataFrame
        Single-column frame of out-of-fold predictions in training-row order.
    """
    X_train = new_train.drop(['target'], axis=1).copy()
    y_train = new_train.target.copy()

    # Encode chip ids as integers so they can serve as CV groups.
    lbl = preprocessing.LabelEncoder()
    X_train["chip_id"] = lbl.fit_transform(list(X_train["chip_id"]))

    # Drop constant features (zero standard deviation). chip_id is the grouping
    # key, not a feature, so it is never a candidate for removal.
    remove_features = [c for c in X_train.columns if c != "chip_id" and X_train[c].std() == 0]
    X_train = X_train.drop(remove_features, axis=1)

    features_list = [c for c in X_train.columns if c != "chip_id"]
    # Select exactly the training features, in training order, so the columns
    # given to predict() always line up with what the models were fitted on.
    X_test = new_test[features_list].copy()

    n_folds = 6
    skf = GroupKFold(n_splits=n_folds)
    models = []

    valid = np.array([])   # out-of-fold predictions, concatenated in fold order
    real = np.array([])    # matching labels, same order as `valid`
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    groups = X_train["chip_id"]
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train, groups)):
        print("Fold "+str(i+1))
        # Non-inplace drop: dropping in place on an iloc slice is a chained
        # assignment (SettingWithCopyWarning) and may act on a temporary copy.
        X_train2 = X_train.iloc[train_index, :].drop(["chip_id"], axis=1)
        y_train2 = y_train.iloc[train_index]
        X_test2 = X_train.iloc[test_index, :].drop(["chip_id"], axis=1)
        y_test2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000, early_stopping_rounds=10, verbose_eval=1000, feval=pr_auc_metric, categorical_feature=categoricals)

        # Predict the validation fold once and reuse the result.
        valid_predict = clf.predict(X_test2, num_iteration=clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        valid_lgb.iloc[test_index, 0] = valid_predict
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

        # The model of the last fold is left out of the test-time ensemble.
        # NOTE(review): presumably because that fold early-stops after ~10
        # rounds with a very low validation pr_auc in the recorded run -
        # confirm this exclusion is still wanted.
        if i != n_folds - 1:
            models.append(clf)

    fold_columns = feature_importance_df.iloc[:, 1:n_folds+1]
    feature_importance_df["Average"] = np.mean(fold_columns, axis=1)
    feature_importance_df["Std"] = np.std(fold_columns, axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    score = average_precision_score(real, valid)
    print("average precision score = {}".format(score))
    print(confusion_matrix(real, np.round(valid)))

    # Average the retained fold models on the test set.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration=model.best_iteration) / len(models)
    return score, pred_value, feature_importance_df, valid_lgb

# Run the chip-grouped CV pipeline. The fourth return value (the frame of
# out-of-fold predictions) is not needed here and is discarded.
metric, pred_value, feature_importance_df, _ = modelling(new_train, new_test, lgbm_params)

Fold 1
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[211]	training's pr_auc: 0.997629	valid_1's pr_auc: 0.84985
Fold 2
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[172]	training's pr_auc: 0.994174	valid_1's pr_auc: 0.88605
Fold 3
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[86]	training's pr_auc: 0.927829	valid_1's pr_auc: 0.886802
Fold 4
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[138]	training's pr_auc: 0.968529	valid_1's pr_auc: 0.752586
Fold 5
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[110]	training's pr_auc: 0.958056	valid_1's pr_auc: 0.545463
Fold 6
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[10]	training's pr_auc: 0.685267	valid_1's pr_auc: 0.2
average precision s

In [84]:
# Columns handed to LightGBM as categorical features.
categoricals = ['layout_a']

# LightGBM training parameters. The built-in metric is set to the string
# 'None'; evaluation is done through the custom `feval` passed to lgb.train.
lgbm_params = {
    'objective': 'binary',
    'metric': 'None',
    'boosting_type': 'gbdt',
    'tree_learner': 'serial',
    'learning_rate': 0.05,
    'random_seed': 44,
    'max_depth': 3,
}

def modelling_skf(new_train, new_test, lgbm_params):
    """Train LightGBM with stratified 5-fold CV and predict the test set.

    Unlike ``GroupKFold`` on ``chip_id``, the folds here are stratified on the
    target only, so rows of one chip can appear in both the training and the
    validation part of a fold.

    Parameters
    ----------
    new_train : pandas.DataFrame
        Training features including the ``target`` and ``chip_id`` columns.
        ``chip_id`` is label-encoded and then dropped before fitting.
    new_test : pandas.DataFrame
        Test features; must contain every non-constant training feature.
    lgbm_params : dict
        Parameters forwarded to ``lgb.train``.

    Returns
    -------
    score : float
        Average precision over the concatenated out-of-fold predictions.
    pred_value : numpy.ndarray
        Mean test prediction of all fold models.
    feature_importance_df : pandas.DataFrame
        Per-fold feature importances plus ``Average``, ``Std`` and ``Cv``.
    valid_lgb : pandas.DataFrame
        Single-column frame of out-of-fold predictions in training-row order.
    """
    X_train = new_train.drop(['target'], axis=1).copy()
    y_train = new_train.target.copy()

    # Encode chip ids as integers (kept numeric for the constant-feature scan).
    lbl = preprocessing.LabelEncoder()
    X_train["chip_id"] = lbl.fit_transform(list(X_train["chip_id"]))

    # Drop constant features (zero standard deviation). chip_id is an
    # identifier, not a feature, so it is never a candidate for removal here.
    remove_features = [c for c in X_train.columns if c != "chip_id" and X_train[c].std() == 0]
    X_train = X_train.drop(remove_features, axis=1)

    features_list = [c for c in X_train.columns if c != "chip_id"]
    # Select exactly the training features, in training order, so the columns
    # given to predict() always line up with what the models were fitted on.
    X_test = new_test[features_list].copy()

    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds)
    models = []

    valid = np.array([])   # out-of-fold predictions, concatenated in fold order
    real = np.array([])    # matching labels, same order as `valid`
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        # Non-inplace drop: dropping in place on an iloc slice is a chained
        # assignment (SettingWithCopyWarning) and may act on a temporary copy.
        X_train2 = X_train.iloc[train_index, :].drop(["chip_id"], axis=1)
        y_train2 = y_train.iloc[train_index]
        X_test2 = X_train.iloc[test_index, :].drop(["chip_id"], axis=1)
        y_test2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000, early_stopping_rounds=10, verbose_eval=1000, feval=pr_auc_metric, categorical_feature=categoricals)

        # Predict the validation fold once and reuse the result.
        valid_predict = clf.predict(X_test2, num_iteration=clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        valid_lgb.iloc[test_index, 0] = valid_predict
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

        models.append(clf)

    fold_columns = feature_importance_df.iloc[:, 1:n_folds+1]
    feature_importance_df["Average"] = np.mean(fold_columns, axis=1)
    feature_importance_df["Std"] = np.std(fold_columns, axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    score = average_precision_score(real, valid)
    print("average precision score = {}".format(score))
    print(confusion_matrix(real, np.round(valid)))

    # Average all fold models on the test set.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration=model.best_iteration) / len(models)
    return score, pred_value, feature_importance_df, valid_lgb

#metric_slf, pred_value_skf, _, _ = modelling_skf(new_train, new_test, lgbm_params)

Fold 1
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[149]	training's pr_auc: 0.977073	valid_1's pr_auc: 0.80333
Fold 2
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[150]	training's pr_auc: 0.978605	valid_1's pr_auc: 0.817994
Fold 3
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[253]	training's pr_auc: 0.999502	valid_1's pr_auc: 0.818965
Fold 4
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[155]	training's pr_auc: 0.974066	valid_1's pr_auc: 0.917931
Fold 5
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[79]	training's pr_auc: 0.925118	valid_1's pr_auc: 0.822722
average precision score = 0.8293605656745442
[[7183   17]
 [  93  143]]


In [270]:
# Rank features by the coefficient of variation ("Cv" = Std / Average) of
# their per-fold importance, lowest first, and renumber the rows.
feature_importance_df.sort_values(by="Cv").reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Fold_6,Average,Std,Cv
0,params3,148,122,86,100,109,13,96.333333,41.939109,0.435354
1,intensity_std,49,27,38,36,51,6,34.5,15.085865,0.437271
2,params1,107,71,50,73,89,7,66.166667,31.67763,0.478755
3,intensity_max,150,83,66,80,102,12,82.166667,41.200391,0.501425
4,layout_y,18,13,10,11,14,0,11.0,5.537749,0.503432
5,params2,98,58,39,42,62,12,51.833333,26.206975,0.505601
6,ratio2_5,120,73,62,82,97,0,72.333333,37.223051,0.514604
7,beta,43,27,27,34,54,1,31.0,16.401219,0.529072
8,intensity_mean,20,19,9,24,28,1,16.833333,9.154537,0.543834
9,params6,123,68,52,75,68,3,64.833333,35.343159,0.545139


# submission

In [105]:
final_pred = pred_value
score = metric
sample_submission["target"] = final_pred

# Tag the file name with the CV score at a fixed precision. Slicing the repr
# (str(score)[:-10]) depends on how many digits the float happens to print
# and yields an empty or meaningless tag for short reprs such as 0.85.
score_tag = f"{score:.6f}"
# NOTE(review): the output name says "atmacup3" while the input sample file is
# "atmacup5__sample_submission.csv" - confirm the intended file name.
sample_submission.to_csv("../../../20200529atma/result/atmacup3_sample_submission" + score_tag + ".csv", index = False)

In [106]:
# Preview the first five rows of the submission frame.
sample_submission.head(5)

Unnamed: 0,target
0,0.000741
1,0.000676
2,0.00207
3,0.001649
4,0.033053
