In [1]:
import cv2
import copy
import time
import tqdm
import pathlib
import random
import warnings
import datetime
import scipy as sp
import pandas as pd
import numpy as np
import xgboost as xgb
from time import time
import lightgbm as lgb
from catboost import Pool
from functools import partial
import matplotlib.pyplot as plt
from scipy.stats import laplace
from catboost import CatBoost
from keras.models import Model
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.optimize import curve_fit
from keras.layers import Dense, Input
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
from scipy.interpolate import UnivariateSpline
from sklearn.preprocessing import MinMaxScaler
from hyperopt import hp, tpe, Trials, fmin, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, average_precision_score, r2_score
# Show every column and up to 1000 rows when DataFrames are rendered.
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
# Print NumPy floats with 5 digits of precision.
np.set_printoptions(precision=5)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
def pr_auc_metric(y_predicted, y_true):
    """Custom evaluation callback that reports average precision as ``pr_auc``.

    ``y_predicted`` holds the predicted scores and ``y_true`` is an object
    exposing ``get_label()`` (this notebook passes the function as ``feval``
    to ``lgb.train``).

    Returns a ``(metric_name, value, higher_is_better)`` tuple; the last
    element is always ``True``.
    """
    labels = y_true.get_label()
    pr_auc = average_precision_score(labels, y_predicted)
    return 'pr_auc', pr_auc, True

In [3]:
# Input directories. `filepath2` is only referenced by the commented-out
# wave_* loads at the bottom of this cell.
filepath, filepath2, filepath3 = (
    "../input/atma2020529/",
    "../input/atma2020529-2/",
    "../input/atma20205293/",
)

# Tables that live in the main input directory, read in a fixed order.
train, test, fitting, sample_submission = (
    pd.read_csv(filepath + csv_name)
    for csv_name in (
        "train.csv",
        "test.csv",
        "fitting.csv",
        "atmaCup5__sample_submission.csv",
    )
)

# Spectra table; later cells use its `spectrum_filename`, `wavelength` and
# `intensity` columns.
spec_df = pd.read_csv(filepath3 + "spec.csv")

#wave_df = pd.read_csv(filepath2 + "wave_df.csv")
#wave_test = pd.read_csv(filepath2 + "wave_test.csv")
#wave_df = pd.DataFrame(wave_df)
#wave_test = pd.DataFrame(wave_test)
#wave_df = wave_df.iloc[:,:511]

# Feature engineering (FE)

In [4]:
from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_extraction import settings

# Feature calculators to run, keyed by tsfresh calculator name. Reference:
# https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#module-tsfresh.feature_extraction.feature_calculators
fc_parameters = dict(
    large_standard_deviation=[dict(r=0.1)],
    ar_coefficient=[dict(coeff=3, k=10)],
    cid_ce=[dict(normalize=True)],
)

# Extract the configured features per `spectrum_filename`, ordering each
# series by `wavelength`, then expose the id as a regular column so the
# result can be merged on `spectrum_filename`.
# (settings.EfficientFCParameters() was the alternative to `fc_parameters`.)
ts_df = (
    extract_features(
        spec_df,
        column_id="spectrum_filename",
        column_sort="wavelength",
        n_jobs=8,
        default_fc_parameters=fc_parameters,
    )
    .reset_index()
    .rename(columns={"id": "spectrum_filename"})
)

Feature Extraction: 100%|██████████| 40/40 [00:13<00:00,  2.92it/s]


- "change_quantiles": [{"ql":0.8, "qh":1.0, "isabs": True, "f_agg": "var"}],
- "change_quantiles": [{"ql":0.6, "qh":1.0, "isabs": True, "f_agg": "var"}],
- "change_quantiles": [{"ql":0.4, "qh":1.0, "isabs": True, "f_agg": "var"}],
- "change_quantiles": [{"ql":0.0, "qh":1.0, "isabs": True, "f_agg": "var"}],
- "fft_coefficient": [{"coeff": 71, "attr": "abs"}]
- "number_cwt_peaks": [{"n":10}],
- "c3": [{"lag": 40}],

In [5]:
def autocorr(x):
    """Return the second half of the raw autocorrelation of ``x``.

    The full correlation of ``x`` with itself is computed and everything from
    its midpoint (index ``size // 2``) onward is returned. No normalisation
    is applied.
    """
    full = np.correlate(x, x, mode='full')
    midpoint = full.size // 2
    return full[midpoint:]

# Aggregate spectrum information: per-file summary statistics of `intensity`,
# with columns named intensity_max / intensity_min / intensity_mean / intensity_std.
spec_agg = (
    spec_df.groupby("spectrum_filename")["intensity"]
    .agg(["max", "min", "mean", "std"])
    .add_prefix("intensity_")
)

# For each spectrum, take the 0.95 quantile of the FFT amplitude of its
# intensity values. Values are collected in groupby iteration order and
# assigned positionally to `spec_agg`, which comes from the same grouping.
fft = []
for _, spectrum in tqdm(spec_df.groupby("spectrum_filename")):
    amplitude = np.abs(np.fft.fft(spectrum["intensity"].values))
    fft.append(np.quantile(amplitude, 0.95))

spec_agg["amp_0.95"] = fft

HBox(children=(FloatProgress(value=0.0, max=14388.0), HTML(value='')))




In [6]:
def pos_x_modify(row):
    """Return the row's ``layout_x``, shifted by 47 when ``layout_a`` is 2 or 3.

    ``row`` is any mapping-like object indexable by ``"layout_a"`` and
    ``"layout_x"`` (e.g. a DataFrame row handed over by ``DataFrame.apply``).
    The input is never modified: pandas does not support callbacks that
    mutate the object passed to ``apply``, so the shift is applied to a local
    value only.
    """
    layout_x = row["layout_x"]
    if row["layout_a"] in (2, 3):
        layout_x = layout_x + 47
    return layout_x

def pos_y_modify(row):
    """Return the row's ``layout_y``, shifted by 191 when ``layout_a`` is 1 or 3.

    ``row`` is any mapping-like object indexable by ``"layout_a"`` and
    ``"layout_y"`` (e.g. a DataFrame row handed over by ``DataFrame.apply``).
    The input is never modified: pandas does not support callbacks that
    mutate the object passed to ``apply``, so the shift is applied to a local
    value only.
    """
    layout_y = row["layout_y"]
    if row["layout_a"] in (1, 3):
        layout_y = layout_y + 191
    return layout_y

def transform(df):
    """Build the model-ready feature table for ``train`` or ``test``.

    Steps:
      1. Left-join the module-level ``fitting`` table on ``spectrum_id`` and
         the per-spectrum tables ``spec_agg`` and ``ts_df`` on
         ``spectrum_filename``.
      2. Drop the identifier columns. ``chip_id`` is kept only when a
         ``target`` column is present (``modelling`` uses it as the CV group).
      3. Shift ``layout_x`` by 47 where ``layout_a`` is 2 or 3 and
         ``layout_y`` by 191 where ``layout_a`` is 1 or 3 (the same rule as
         ``pos_x_modify`` / ``pos_y_modify``, applied column-wise instead of
         row by row).
      4. Add the ratio features ``ratio2_5`` and ``ratio3_1``.

    ``df`` itself is not modified; a new DataFrame is returned.
    """
    # `merge` already returns a new frame, so no explicit copy of `df` is needed.
    new_df = (
        df.merge(fitting, on="spectrum_id", how="left")
        .merge(spec_agg.reset_index(), on="spectrum_filename", how="left")
        .merge(ts_df, on="spectrum_filename", how="left")
    )

    # remove unnecessary columns
    unused_cols = ["spectrum_id", "spectrum_filename"]
    if "target" not in new_df.columns:
        unused_cols.append("chip_id")
    new_df = new_df.drop(unused_cols, axis=1)

    # create new variables
    # Masked in-place offsets: one vectorized pass per coordinate, and an
    # empty frame is handled without special-casing.
    layout_a = new_df["layout_a"]
    new_df.loc[layout_a.isin([2, 3]), "layout_x"] += 47
    new_df.loc[layout_a.isin([1, 3]), "layout_y"] += 191

    # Ratios of fitted parameters; their meaning is unknown, but they seemed
    # effective (original author's note).
    new_df["ratio2_5"] = new_df["params2"] / new_df["params5"]
    new_df["ratio3_1"] = new_df["params3"] / new_df["params1"]

    return new_df
# Build the engineered train/test tables and report their shapes.
new_train, new_test = transform(train), transform(test)
print(f'train shape: {new_train.shape}')
print(f'test shape: {new_test.shape}')

train shape: (7436, 26)
test shape: (6952, 24)


In [7]:
# Preview the first rows of the engineered training table.
new_train.head()

Unnamed: 0,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,target,params0,params1,params2,params3,params4,params5,params6,rms,beta,intensity_max,intensity_min,intensity_mean,intensity_std,amp_0.95,intensity__ar_coefficient__k_10__coeff_3,intensity__cid_ce__normalize_True,intensity__large_standard_deviation__r_0.1,ratio2_5,ratio3_1
0,79ad4647da6de6425abf,850,2,83,140,1313.081,0,30.808589,581.1802,1037.714752,1.531423,22469.651641,1032.317268,8.29561,10.028668,0.02521298,1751.0,-228.0,40.292752,172.206792,7643.154333,0.103792,18.584115,0.0,1.005229,0.002635022
1,79ad4647da6de6425abf,780,3,47,359,159.415,0,91.300897,17405.82,1080.510452,4.766233,33257.123175,1077.468855,8.018225,7.948485,0.3435612,4219.0,-263.0,166.958984,463.428363,28918.58781,0.053777,7.889624,1.0,1.002823,0.0002738298
2,c695a1e61e002b34e556,780,1,34,220,-610.7688,0,106.642946,1e-10,1119.464438,2.0,42579.867913,1378.883338,11.687417,10.739859,2.348528e-15,2412.0,-235.0,151.577691,327.857694,18130.78214,0.233933,10.395798,1.0,0.811863,20000000000.0
3,c695a1e61e002b34e556,780,2,79,139,1214.618,0,306.933674,10994.86,1139.855067,5.198692,39349.741703,1145.212849,9.445029,10.379948,0.2183921,3209.0,-52.0,523.080947,436.48141,22149.537147,0.107048,7.399576,1.0,0.995322,0.0004728291
4,c695a1e61e002b34e556,780,0,45,85,-257.6161,0,46.133256,22276.22,1120.918337,5.668012,31054.928673,1117.107782,7.65871,8.31655,0.4176962,3998.0,-245.0,138.187717,472.009931,30874.913315,0.134784,7.886848,1.0,1.003411,0.0002544423


# modelling

In [8]:
# Columns handed to LightGBM as categorical features.
categoricals = ['layout_a']


def _lgbm_config(learning_rate, max_depth):
    """Return a LightGBM binary-classification parameter dict.

    Only the learning rate and the tree depth differ between the parameter
    sets in this notebook; every other entry is shared.
    """
    return {
        'objective': 'binary',
        'metric': 'None',
        'boosting_type': 'gbdt',
        'tree_learner': 'serial',
        'learning_rate': learning_rate,
        'random_seed': 44,
        'max_depth': max_depth,
    }


# `lgbm_params` is the default set; `lgbm_params3` .. `lgbm_params6` are the
# sets `modelling` uses for folds 3 to 6 respectively.
lgbm_params = _lgbm_config(0.04, 3)
lgbm_params3 = _lgbm_config(0.035, 4)
lgbm_params4 = _lgbm_config(0.055, 3)
lgbm_params5 = _lgbm_config(0.045, 3)
lgbm_params6 = _lgbm_config(0.03, 3)

def modelling(new_train, new_test, lgbm_params):
    """Train LightGBM models with 6-fold GroupKFold CV and predict the test set.

    Parameters
    ----------
    new_train : DataFrame
        Training features plus a ``target`` column and a ``chip_id`` column.
        ``chip_id`` is used only as the CV group key and is never a feature.
    new_test : DataFrame
        Test features; must contain the columns listed in ``remove_features``
        and the features the models are trained on.
    lgbm_params : dict
        LightGBM parameters for every fold that has no dedicated parameter
        set. Folds 3, 4, 5 and 6 use the module-level ``lgbm_params3`` ..
        ``lgbm_params6`` instead.

    Returns
    -------
    score : float
        Average precision of the out-of-fold predictions.
    pred_value : ndarray
        Test predictions averaged over the retained fold models.
    feature_importance_df : DataFrame
        Per-fold feature importances with ``Average``, ``Std`` and ``Cv``.
    valid_lgb : DataFrame
        Out-of-fold predictions in the row order of ``new_train``.
    """
    X_train = new_train.drop(['target'], axis=1).copy()
    y_train = new_train.target.copy()

    # Integer-encode chip ids so the column can serve as the group key.
    lbl = preprocessing.LabelEncoder()
    X_train["chip_id"] = lbl.fit_transform(list(X_train["chip_id"]))

    # Always drop the position columns, plus any column that is constant in
    # the training data.
    remove_features = ["layout_x", "layout_y", "pos_x"]
    remove_features += [
        col for col in X_train.columns
        if col not in remove_features and X_train[col].std() == 0
    ]
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.copy()
    X_test = X_test.drop(remove_features, axis=1)

    # Replace every non-alphanumeric character in the column names by "_".
    X_train.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X_train.columns]
    X_test.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X_test.columns]

    n_folds = 6
    skf = GroupKFold(n_splits=n_folds)
    # Folds with a dedicated parameter set; all others use `lgbm_params`.
    fold_params = {3: lgbm_params3, 4: lgbm_params4, 5: lgbm_params5, 6: lgbm_params6}
    models = []

    valid = np.array([])
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    real = np.array([])
    features_list = [col for col in X_train.columns if col != "chip_id"]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])

    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train, X_train["chip_id"])):
        fold_no = i + 1
        print("Fold "+str(fold_no))

        # Non-inplace drops: the fold frames are fresh objects, so nothing is
        # modified through an `iloc` slice of X_train.
        X_train2 = X_train.iloc[train_index, :].drop(["chip_id"], axis=1)
        y_train2 = y_train.iloc[train_index]
        X_test2 = X_train.iloc[test_index, :].drop(["chip_id"], axis=1)
        y_test2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect
        # callbacks instead - confirm before upgrading the library.
        clf = lgb.train(fold_params.get(fold_no, lgbm_params), lgb_train,
                        valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=10000, early_stopping_rounds=30,
                        verbose_eval=1000, feval=pr_auc_metric,
                        categorical_feature=categoricals)

        # Predict the held-out fold once and reuse the result for both the
        # concatenated OOF vector and the row-aligned OOF frame.
        valid_predict = clf.predict(X_test2, num_iteration=clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        valid_lgb.iloc[test_index] = valid_predict.reshape(X_test2.shape[0], 1)
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(fold_no)] = clf.feature_importance()

        # The fold-6 model is deliberately left out of the test ensemble.
        # NOTE(review): presumably because it validates at pr_auc 0.5 in the
        # recorded run - confirm this hard-coded exclusion is still wanted.
        if fold_no != 6:
            models.append(clf)

    # np.mean / np.std are applied to the fold columns only (np.std keeps its
    # population, ddof=0, definition here).
    fold_columns = feature_importance_df.iloc[:, 1:n_folds+1]
    feature_importance_df["Average"] = np.mean(fold_columns, axis=1)
    feature_importance_df["Std"] = np.std(fold_columns, axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    score = average_precision_score(real, valid)
    print("average precision score = {}".format(score))
    print(confusion_matrix(real, np.round(valid)))

    # Equal-weight average of the retained fold models on the test set.
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration=model.best_iteration) / len(models)
    return score, pred_value, feature_importance_df, valid_lgb

# Run the grouped CV with the default parameter set; the fourth return value
# (the out-of-fold prediction frame) is discarded.
metric, pred_value, feature_importance_df, _ = modelling(new_train, new_test, lgbm_params)

Fold 1
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[228]	training's pr_auc: 0.995927	valid_1's pr_auc: 0.886216
Fold 2
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[223]	training's pr_auc: 0.992535	valid_1's pr_auc: 0.912956
Fold 3
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[141]	training's pr_auc: 0.984268	valid_1's pr_auc: 0.967975
Fold 4
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[351]	training's pr_auc: 1	valid_1's pr_auc: 0.826989
Fold 5
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[260]	training's pr_auc: 0.998233	valid_1's pr_auc: 0.712052
Fold 6
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[45]	training's pr_auc: 0.845436	valid_1's pr_auc: 0.5
average precision score = 0.8

In [9]:
# Rank features by their "Average" importance column, highest first.
feature_importance_df.sort_values("Average", ascending=False).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Fold_6,Average,Std,Cv
0,ratio3_1,171,148,181,206,160,43,151.5,51.764692,0.341681
1,params6,108,175,178,233,143,52,148.166667,57.292864,0.386678
2,params3,114,143,150,225,151,43,137.666667,54.017487,0.392379
3,intensity__ar_coefficient__k_10__coeff_3,147,100,199,148,163,53,135.0,46.800997,0.346674
4,intensity_max,89,103,137,178,143,22,112.0,49.416596,0.44122
5,ratio2_5,46,125,100,170,147,0,98.0,58.631618,0.598282
6,params2,97,107,107,141,94,32,96.333333,32.576406,0.338163
7,rms,74,108,93,148,128,7,93.0,45.151597,0.485501
8,params0,84,51,107,109,81,5,72.833333,35.927783,0.493288
9,intensity_min,95,62,86,121,50,9,70.5,35.743298,0.506997


# submission

In [10]:
final_pred = pred_value
score = metric
sample_submission["target"] = final_pred
# Tag the file name with the CV score at a fixed precision (6 decimals).
# Slicing a fixed number of characters off str(score) depends on how many
# digits the float's repr happens to have and can drop the score entirely
# (e.g. "0.8"), so explicit formatting is used instead.
sample_submission.to_csv(f"atmacup3_sample_submission{score:.6f}.csv", index = False)
sample_submission.head()

Unnamed: 0,target
0,0.002412
1,0.00049
2,0.000536
3,0.000153
4,0.040891
