In [1]:
import cv2
import copy
import time
import pathlib
import random
import warnings
import datetime
import scipy as sp
import pandas as pd
import numpy as np
from time import time
import lightgbm as lgb
from catboost import Pool
from functools import partial
import matplotlib.pyplot as plt
from scipy.stats import laplace
from catboost import CatBoost
from keras.models import Model
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.optimize import curve_fit
from keras.layers import Dense, Input
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
from scipy.interpolate import UnivariateSpline
from sklearn.preprocessing import MinMaxScaler
from hyperopt import hp, tpe, Trials, fmin, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, average_precision_score, r2_score
# Notebook-wide display configuration: show every column, up to 1000 rows,
# and print numpy arrays with 5 digits of precision.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000
np.set_printoptions(precision=5)
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

Using TensorFlow backend.


In [2]:
def pr_auc_metric(y_predicted, y_true):
    """Custom evaluation function reporting average precision (PR-AUC).

    Args:
        y_predicted: predicted scores for the evaluated rows.
        y_true: object exposing ``get_label()`` that returns the true labels
            (this function is passed as ``feval`` to ``lgb.train`` in this notebook).

    Returns:
        Tuple ``(metric_name, metric_value, is_higher_better)``.
    """
    labels = y_true.get_label()
    pr_auc = average_precision_score(labels, y_predicted)
    return 'pr_auc', pr_auc, True

def mixture_laplace(x, loc1, scale1, coef1, loc2, scale2, coef2, loc3, scale3, coef3):
    """Evaluate a three-component Laplace mixture density at ``x``.

    Args:
        x: point(s) at which the density is evaluated (scalar or array).
        loc1, loc2, loc3: location of each Laplace component.
        scale1, scale2, scale3: log-scale of each component; the value is
            exponentiated before use so the effective scale is always positive.
        coef1, coef2, coef3: unnormalised mixture weights; they are divided by
            their sum so the effective weights add up to 1.

    Returns:
        The weighted sum of the three Laplace pdfs evaluated at ``x``.
    """
    # Normalise every weight with the sum of the ORIGINAL coefficients.
    # (Normalising them one after another would reuse the already-rescaled
    # coef1/coef2 in the later denominators and the weights would not sum to 1.)
    total = coef1 + coef2 + coef3
    weight1 = coef1 / total
    weight2 = coef2 / total
    weight3 = coef3 / total

    # Keep the scales non-negative by working in log space.
    scale1 = np.exp(scale1)
    scale2 = np.exp(scale2)
    scale3 = np.exp(scale3)

    lap1 = laplace(loc1, scale1).pdf(x) * weight1
    lap2 = laplace(loc2, scale2).pdf(x) * weight2
    lap3 = laplace(loc3, scale3).pdf(x) * weight3
    return lap1 + lap2 + lap3

In [3]:
# Input directories.
filepath = "../input/atma2020529/"
filepath2 = "../input/atma2020529-2/"
filepath3 = "../input/atma20205293/"

# Main competition tables, all read from the first directory.
train, test, fitting, sample_submission = (
    pd.read_csv(filepath + csv_name)
    for csv_name in ("train.csv", "test.csv", "fitting.csv", "atmaCup5__sample_submission.csv")
)

# Wave tables from the second directory; only the first 511 columns of the
# training table are kept.
# NOTE(review): wave_test is not truncated the same way — confirm this is intended.
wave_df = pd.DataFrame(pd.read_csv(filepath2 + "wave_df.csv")).iloc[:, :511]
wave_test = pd.DataFrame(pd.read_csv(filepath2 + "wave_test.csv"))

# Long-format spectrum table (grouped by "spectrum_filename" further down).
spec_df = pd.read_csv(filepath3 + "spec.csv")

# Feature engineering

In [4]:
from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_extraction import settings
# Available feature calculators are listed at
# https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#module-tsfresh.feature_extraction.feature_calculators
fc_parameters = dict(
    # "abs_energy": None,  (disabled)
    large_standard_deviation=[dict(r=0.1)],
)
# One row of features per spectrum file, with samples ordered by wavelength.
# settings.EfficientFCParameters() is the alternative parameter set left disabled here.
ts_df = extract_features(
    spec_df,
    column_id="spectrum_filename",
    column_sort="wavelength",
    default_fc_parameters=fc_parameters,
    n_jobs=8,
)

Feature Extraction: 100%|██████████| 40/40 [00:02<00:00, 15.94it/s]


In [5]:
# Move the index into a regular column and rename "id" to "spectrum_filename"
# so the extracted features can be merged on that key.
ts_df = ts_df.reset_index().rename(columns={"id": "spectrum_filename"})

In [6]:
#clf = TSNE(n_components=2)
#X = wave_df.values.T
#z = clf.fit_transform(X)
#projected_df = pd.DataFrame(z, columns=['project_0', 'project_1'])
#projected_df= pd.concat([train, projected_df], axis=1)

In [7]:
def pos_x_modify(row, offset=47):
    """Return the row's ``layout_x``, shifted by ``offset`` for layouts 2 and 3.

    The input ``row`` is left untouched; only the computed value is returned.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with ``layout_a`` and
            ``layout_x`` entries.
        offset: amount added to ``layout_x`` when ``layout_a`` is 2 or 3.

    Returns:
        The (possibly shifted) x position.
    """
    layout_x = row["layout_x"]
    if row["layout_a"] in (2, 3):
        return layout_x + offset
    return layout_x

def pos_y_modify(row, offset=191):
    """Return the row's ``layout_y``, shifted by ``offset`` for layouts 1 and 3.

    The input ``row`` is left untouched; only the computed value is returned.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with ``layout_a`` and
            ``layout_y`` entries.
        offset: amount added to ``layout_y`` when ``layout_a`` is 1 or 3.

    Returns:
        The (possibly shifted) y position.
    """
    layout_y = row["layout_y"]
    if row["layout_a"] in (1, 3):
        return layout_y + offset
    return layout_y

def _spectrum_features(spectra):
    """Per-file intensity statistics plus the 95th percentile of the FFT amplitude.

    Returns a DataFrame indexed by ``spectrum_filename`` with the columns
    ``intensity_max``, ``intensity_min``, ``intensity_mean``, ``intensity_std``
    and ``amp_0.95``.
    """
    intensity = spectra.groupby("spectrum_filename")["intensity"]
    features = intensity.agg(["max", "min", "mean", "std"])
    features.columns = ["intensity_" + c for c in features.columns]
    # The result is a Series keyed by spectrum_filename, so the assignment is
    # aligned on the file name instead of relying on matching iteration order.
    features["amp_0.95"] = intensity.apply(
        lambda y: np.quantile(np.abs(np.fft.fft(y.values)), 0.95)
    )
    return features


def transform(df):
    """Build the model feature table for ``df`` (the train or test table).

    Steps:
      1. left-join the module-level ``fitting`` table on ``spectrum_id``;
      2. left-join per-file spectrum statistics (computed from the module-level
         ``spec_df``) and the tsfresh features in ``ts_df`` on ``spectrum_filename``;
      3. drop the id columns (``chip_id`` is kept only when a ``target`` column
         is present);
      4. shift ``layout_x`` / ``layout_y`` with ``pos_x_modify`` / ``pos_y_modify``
         and add the ``ratio2_5`` feature.

    Args:
        df: DataFrame with at least ``spectrum_id``, ``spectrum_filename``,
            ``chip_id``, ``layout_a``, ``layout_x`` and ``layout_y`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    # Merge the original csv with the fitting data.
    new_df = pd.merge(df, fitting, on="spectrum_id", how="left")

    # Merge the aggregated spectrum information and the tsfresh features.
    spec_agg = _spectrum_features(spec_df)
    new_df = pd.merge(new_df, spec_agg.reset_index(), on="spectrum_filename", how="left")
    new_df = pd.merge(new_df, ts_df, on="spectrum_filename", how="left")

    # Remove unnecessary columns; chip_id is dropped only when there is no target column.
    unused_columns = ["spectrum_id", "spectrum_filename"]
    if "target" not in new_df.columns:
        unused_columns.append("chip_id")
    new_df = new_df.drop(unused_columns, axis=1)

    # Create new variables (position shifts are applied row by row).
    new_df["layout_x"] = new_df.apply(pos_x_modify, axis=1)
    new_df["layout_y"] = new_df.apply(pos_y_modify, axis=1)
    # Original author's note: meaning unknown, but the ratio seems effective.
    # NOTE(review): params5 == 0 would give inf/NaN here — confirm it cannot occur.
    new_df["ratio2_5"] = new_df["params2"] / new_df["params5"]

    return new_df
# Build the feature tables for both tables and report their shapes.
new_train, new_test = transform(train), transform(test)
print(f'train shape: {new_train.shape}', f'test shape: {new_test.shape}', sep="\n")

train shape: (7436, 23)
test shape: (6952, 21)


In [8]:
# Preview the first rows of the engineered training table.
new_train.head()

Unnamed: 0,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,target,params0,params1,params2,params3,params4,params5,params6,rms,beta,intensity_max,intensity_min,intensity_mean,intensity_std,amp_0.95,intensity__large_standard_deviation__r_0.1,ratio2_5
0,79ad4647da6de6425abf,850,2,83,140,1313.081,0,30.808589,581.1802,1037.714752,1.531423,22469.651641,1032.317268,8.29561,10.028668,0.02521298,1751.0,-228.0,40.292752,172.206792,7643.154333,0.0,1.005229
1,79ad4647da6de6425abf,780,3,47,359,159.415,0,91.300897,17405.82,1080.510452,4.766233,33257.123175,1077.468855,8.018225,7.948485,0.3435612,4219.0,-263.0,166.958984,463.428363,28918.58781,1.0,1.002823
2,c695a1e61e002b34e556,780,1,34,220,-610.7688,0,106.642946,1e-10,1119.464438,2.0,42579.867913,1378.883338,11.687417,10.739859,2.348528e-15,2412.0,-235.0,151.577691,327.857694,18130.78214,1.0,0.811863
3,c695a1e61e002b34e556,780,2,79,139,1214.618,0,306.933674,10994.86,1139.855067,5.198692,39349.741703,1145.212849,9.445029,10.379948,0.2183921,3209.0,-52.0,523.080947,436.48141,22149.537147,1.0,0.995322
4,c695a1e61e002b34e556,780,0,45,85,-257.6161,0,46.133256,22276.22,1120.918337,5.668012,31054.928673,1117.107782,7.65871,8.31655,0.4176962,3998.0,-245.0,138.187717,472.009931,30874.913315,1.0,1.003411


# modelling

In [9]:
#scaler = MinMaxScaler(feature_range = (0, 1))
#wave_df = scaler.fit_transform(wave_df)
#wave_test = scaler.transform(wave_test)
#wave_df = pd.DataFrame(wave_df)
#wave_test = pd.DataFrame(wave_test)
#wave_df = pd.concat([wave_df, new_train[["chip_id", "target"]]], axis=1)
#wave_df = pd.concat([wave_df, new_train], axis=1)
#wave_test = pd.concat([wave_test, new_test], axis=1)
#wave_df.head()

In [10]:
import tensorflow as tf
from tensorflow.keras.metrics import AUC

def sk_pr_auc(y_true, y_pred):
    """Wrap ``average_precision_score`` with ``tf.py_function``.

    Args:
        y_true: true labels.
        y_pred: predicted scores.

    Returns:
        The result of the wrapped call, declared as ``tf.float64``.
    """
    return tf.py_function(
        func=average_precision_score,
        inp=(y_true, y_pred),
        Tout=tf.float64,
    )

# AUC metric configured for the precision-recall curve with 1000 thresholds;
# passed to `compile(metrics=...)` inside modelling_nn.
pr_metric = AUC(curve='PR', num_thresholds=1000) 

def modelling_nn(new_train, new_test):
    """Train a small dense network with chip-grouped 6-fold CV and predict the test set.

    Args:
        new_train: DataFrame holding the features plus ``target`` and ``chip_id``
            columns. ``chip_id`` is used only to build the CV groups.
        new_test: DataFrame holding the test features (assumed to have the same
            feature columns as training once ``chip_id`` is removed — TODO confirm).

    Returns:
        ``(score, pred_value)``: the average precision over the concatenated
        out-of-fold predictions, and the mean test prediction of the kept fold models.
    """
    X_train = new_train.drop(['target'], axis=1).copy()
    y_train = new_train.target.copy()

    # Integer-encode chip ids so they can be used as CV groups.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["chip_id"]))
    X_train["chip_id"] = lbl.transform(list(X_train["chip_id"]))

    # Drop columns that are constant in the training data, in train and test alike.
    remove_features = [col for col in X_train.columns if X_train[col].std() == 0]
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.drop(remove_features, axis=1)

    n_folds = 6
    # NOTE(review): the original code leaves the model of fold index 5 (the sixth
    # fold) out of the test ensemble without explanation; kept as is — confirm.
    skipped_fold = 5
    checkpoint_path = './nn_model.w8'
    skf = GroupKFold(n_splits=n_folds)
    models = []

    valid = np.array([])
    real = np.array([])
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["chip_id"])):
        print("Fold "+str(i+1))
        # drop() returns new frames, so the iloc slices are never modified in place.
        X_train2 = X_train.iloc[train_index, :].drop(["chip_id"], axis=1)
        y_train2 = y_train.iloc[train_index]

        X_test2 = X_train.iloc[test_index, :].drop(["chip_id"], axis=1)
        y_test2 = y_train.iloc[test_index]

        clf = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(X_train2.shape[1], )),
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=[pr_metric])
        save_best = tf.keras.callbacks.ModelCheckpoint(
            checkpoint_path, save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=10)

        clf.fit(X_train2, y_train2, validation_data=(X_test2, y_test2),
                epochs=100, callbacks=[save_best, early_stop])

        # Restore the best checkpointed weights. Without this the model would
        # predict with the weights of the last epoch run (after the early-stopping
        # patience), and the checkpoint written above would never be used.
        clf.load_weights(checkpoint_path)

        valid_predict = clf.predict(X_test2).reshape(X_test2.shape[0],)
        valid = np.concatenate([valid, valid_predict])
        real = np.concatenate([real, y_test2])

        if i != skipped_fold:
            models.append(clf)

    score = average_precision_score(real, valid)
    print("average precision score = {}".format(score))
    print(confusion_matrix(real, np.round(valid)))

    # Average the test predictions of the kept fold models.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test).reshape(X_test.shape[0],) / len(models)
    return score, pred_value

#metric, pred_value = modelling_nn(wave_df, wave_test)

In [11]:
# Columns handed to LightGBM as categorical features.
categoricals = ['layout_a']

# LightGBM training parameters. 'metric' is the string 'None'; the score that is
# tracked during training comes from the `feval` passed to lgb.train in modelling().
lgbm_params = {
    'objective': 'binary',
    'metric': 'None',
    'boosting_type': 'gbdt',
    'tree_learner': 'serial',
    'learning_rate': 0.05,
    'random_seed': 44,
    'max_depth': 4,
}

def modelling(new_train, new_test, lgbm_params):
    """Train LightGBM with chip-grouped 6-fold CV and predict the test set.

    Args:
        new_train: DataFrame holding the features plus ``target`` and ``chip_id``
            columns. ``chip_id`` is used only to build the CV groups.
        new_test: DataFrame holding the test features (without ``chip_id``).
        lgbm_params: parameter dict passed unchanged to ``lgb.train``.

    Returns:
        ``(score, pred_value, feature_importance_df, valid_lgb)``:
          * score: average precision over the concatenated out-of-fold predictions;
          * pred_value: mean test prediction of the fold models;
          * feature_importance_df: per-fold importances with ``Average``, ``Std``
            and ``Cv`` (= Std / Average) columns;
          * valid_lgb: one-column DataFrame of out-of-fold predictions in the
            row order of ``new_train``.
    """
    X_train = new_train.drop(['target'], axis=1).copy()
    y_train = new_train.target.copy()

    # Integer-encode chip ids so they can be used as CV groups.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(X_train["chip_id"]))
    X_train["chip_id"] = lbl.transform(list(X_train["chip_id"]))

    # Drop columns that are constant in the training data, in train and test alike.
    remove_features = [col for col in X_train.columns if X_train[col].std() == 0]
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.drop(remove_features, axis=1)

    # Replace every non-alphanumeric character of the column names with "_"
    # (same mapping for train and test).
    X_train.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X_train.columns]
    X_test.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X_test.columns]

    n_folds = 6
    skf = GroupKFold(n_splits=n_folds)
    models = []

    valid = np.array([])
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    real = np.array([])
    features_list = [i for i in X_train.columns if i != "chip_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["chip_id"])):
        print("Fold "+str(i+1))
        # drop() returns new frames, so the iloc slices are never modified in place.
        X_train2 = X_train.iloc[train_index, :].drop(["chip_id"], axis=1)
        y_train2 = y_train.iloc[train_index]

        X_test2 = X_train.iloc[test_index, :].drop(["chip_id"], axis=1)
        y_test2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=10000, early_stopping_rounds=30, verbose_eval=1000,
                        feval=pr_auc_metric, categorical_feature=categoricals)

        # Predict the held-out fold once and reuse the result for both outputs.
        valid_predict = clf.predict(X_test2, num_iteration=clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        valid_lgb.iloc[test_index] = valid_predict.reshape(X_test2.shape[0], 1)
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

        models.append(clf)

    # Columns 1..n_folds hold the per-fold importances.
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:, 1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:, 1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    score = average_precision_score(real, valid)
    print("average precision score = {}".format(score))
    print(confusion_matrix(real, np.round(valid)))

    # Average the test predictions of all fold models.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration=model.best_iteration) / len(models)
    return score, pred_value, feature_importance_df, valid_lgb

# Run the grouped-CV LightGBM pipeline; the fourth return value (the
# out-of-fold prediction frame) is discarded here.
metric, pred_value, feature_importance_df, _ = modelling(new_train, new_test, lgbm_params)

Fold 1
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[195]	training's pr_auc: 1	valid_1's pr_auc: 0.853398
Fold 2
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[163]	training's pr_auc: 0.999964	valid_1's pr_auc: 0.887736
Fold 3
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[133]	training's pr_auc: 0.990122	valid_1's pr_auc: 0.962526
Fold 4
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[141]	training's pr_auc: 0.99573	valid_1's pr_auc: 0.786265
Fold 5
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[214]	training's pr_auc: 1	valid_1's pr_auc: 0.673492
Fold 6
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[25]	training's pr_auc: 0.829694	valid_1's pr_auc: 1
average precision score = 0.78557735545

In [12]:
# Show features ordered by "Cv" (Std / Average of the per-fold importances),
# smallest relative variation first.
feature_importance_df.sort_values("Cv", ascending=True).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Fold_6,Average,Std,Cv
0,intensity__large_standard_deviation__r_0_1,74,63,72,44,84,24,60.166667,20.317617,0.337689
1,params2,169,145,94,131,133,39,118.5,41.911613,0.353685
2,params3,199,222,181,227,297,47,195.5,75.577223,0.386584
3,intensity_max,200,172,141,159,254,44,161.666667,63.746024,0.394305
4,params6,161,177,132,114,177,22,130.5,53.699007,0.411487
5,beta,114,103,86,100,123,10,89.333333,37.299091,0.417527
6,params1,155,173,129,170,203,17,141.166667,59.756218,0.423303
7,layout_y,82,72,67,74,103,8,67.666667,29.078438,0.429731
8,amp_0_95,98,69,84,74,142,19,81.0,36.642416,0.452376
9,rms,114,132,104,98,178,16,107.0,48.452726,0.452829


# submission

In [13]:
# Write the averaged test predictions into the submission template.
score = metric
final_pred = pred_value
sample_submission["target"] = final_pred

# File name = fixed prefix + str(score) with its last 10 characters removed + ".csv".
# NOTE(review): the part of the score that survives depends on how many characters
# str(score) has, and the prefix says "atmacup3" while the sample file read earlier
# is named "atmaCup5..." — confirm both are intended.
sample_submission.to_csv(
    "".join(["atmacup3_sample_submission", str(score)[:-10], ".csv"]),
    index=False,
)
sample_submission.head()

Unnamed: 0,target
0,0.001991
1,0.001885
2,0.002051
3,0.002148
4,0.021453
