In [14]:
import pandas as pd
import numpy as np
import warnings
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict
from sklearn import preprocessing
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, auc
import lightgbm as lgb
from functools import partial
import copy
import time
import matplotlib.pyplot as plt
import scipy as sp
import random
from hyperopt import hp, tpe, Trials, fmin, space_eval
import cv2
from catboost import CatBoost
from catboost import Pool
import pathlib
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",1000)
np.set_printoptions(precision=5)
warnings.filterwarnings("ignore")
from sklearn.metrics import average_precision_score

In [15]:
filepath = "../../../../pythonwork/20200529atma/input/"
train = pd.read_csv(filepath + "train.csv")
test = pd.read_csv(filepath + "test.csv")
fitting = pd.read_csv(filepath + "fitting.csv")
sample_submission = pd.read_csv(filepath + "atmacup5__sample_submission.csv")

In [16]:
p_temp = pathlib.Path(filepath + 'spectrum_raw/')

spec = []
for file in p_temp.iterdir():
    spec_df = pd.read_csv(file, sep='\t', header=None)
    spec_df.columns = ["wavelength", "intensity"]
    spec_df["spectrum_filename"] = file.stem + ".dat"
    spec.append(spec_df)

spec_df = pd.concat(spec, axis=0)
spec_df.head()

Unnamed: 0,wavelength,intensity,spectrum_filename
0,1032.836,87.0,b63e0413b6a42cfadca5.dat
1,1033.886,250.0,b63e0413b6a42cfadca5.dat
2,1034.936,293.0,b63e0413b6a42cfadca5.dat
3,1035.986,117.0,b63e0413b6a42cfadca5.dat
4,1037.036,208.0,b63e0413b6a42cfadca5.dat


# FE

In [17]:
train = pd.merge(train, fitting, on="spectrum_id", how="left")
test = pd.merge(test, fitting, on="spectrum_id", how="left")

In [18]:
spec_agg = spec_df.groupby("spectrum_filename")["intensity"].agg(["max", "min", "mean", "std"])
spec_agg.columns = ["intensity_" + c for c in spec_agg.columns]
train = pd.merge(train, spec_agg.reset_index(), on="spectrum_filename", how="left")
test = pd.merge(test, spec_agg.reset_index(), on="spectrum_filename", how="left")
print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (7436, 22)
test shape: (6952, 21)


In [19]:
new_train = train.drop(["spectrum_id",  "spectrum_filename","chip_id"], axis=1).copy()
new_test = test.drop(["spectrum_id",  "spectrum_filename","chip_id"], axis=1).copy()
print(f'train shape: {new_train.shape}')
print(f'test shape: {new_test.shape}')

train shape: (7436, 19)
test shape: (6952, 18)


In [20]:
def pr_auc_metric(y_predicted, y_true):
    return 'pr_auc', average_precision_score(y_true.get_label(), y_predicted), True

# modelling

## lgb

In [29]:
categoricals = []
lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 'random_seed':44,'max_depth': 3}

def modelling(new_train, new_test, lgbm_params):
    X_train = new_train.drop(['target'],axis=1).copy()
    y_train = new_train.target.copy()
    
    remove_features = []
    for i in X_train.columns:
        if (X_train[i].std() == 0) and i not in remove_features:
            remove_features.append(i)
    X_train = X_train.drop(remove_features, axis=1)
    X_test = new_test.copy()
    X_test = X_test.drop(remove_features, axis=1)

    n_folds=5
    skf=StratifiedKFold(n_splits = n_folds)
    models = []

    valid = np.array([])
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    real = np.array([])
    features_list = [i for i in X_train.columns]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    initial = lgbm_params["random_seed"]
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        #lgbm_params["random_seed"] = initial + i
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]

        X_test2 = X_train.iloc[test_index,:]
        y_test2 = y_train.iloc[test_index]
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_test2, y_test2, reference=lgb_train)
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 1000, feval=pr_auc_metric)#, categorical_feature = categoricals)
        valid_predict = clf.predict(X_test2, num_iteration = clf.best_iteration)
        valid = np.concatenate([valid, valid_predict])
        valid_lgb.iloc[test_index]  = clf.predict(X_test2, num_iteration = clf.best_iteration).reshape(X_test2.shape[0], 1)
        real = np.concatenate([real, y_test2])
        feature_importance_df["Fold_"+str(i+1)] = clf.feature_importance()

        models.append(clf)
        
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    score = average_precision_score(real, valid)
    print("average precision score = {}".format(average_precision_score(real, valid)))
    print(confusion_matrix(real, np.round(valid)))
    pred_value = np.zeros(X_test.shape[0])
    for model in models:
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / len(models)
    return score, pred_value, feature_importance_df, valid_lgb

metric, pred_value, feature_importance_df, _ = modelling(new_train, new_test, lgbm_params)

Fold 1
Training until validation scores don't improve for 100 rounds.
[1000]	training's pr_auc: 0.989017	valid_1's pr_auc: 0.801952
Early stopping, best iteration is:
[1738]	training's pr_auc: 0.999972	valid_1's pr_auc: 0.820036
Fold 2
Training until validation scores don't improve for 100 rounds.
[1000]	training's pr_auc: 0.984731	valid_1's pr_auc: 0.807379
Early stopping, best iteration is:
[1611]	training's pr_auc: 0.999376	valid_1's pr_auc: 0.826773
Fold 3
Training until validation scores don't improve for 100 rounds.
[1000]	training's pr_auc: 0.988201	valid_1's pr_auc: 0.803868
Early stopping, best iteration is:
[1883]	training's pr_auc: 0.999972	valid_1's pr_auc: 0.844133
Fold 4
Training until validation scores don't improve for 100 rounds.
[1000]	training's pr_auc: 0.979834	valid_1's pr_auc: 0.873237
Early stopping, best iteration is:
[1195]	training's pr_auc: 0.989227	valid_1's pr_auc: 0.87918
Fold 5
Training until validation scores don't improve for 100 rounds.
Early stopping,

In [30]:
feature_importance_df.sort_values("Std", ascending=True).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,layout_a,101,14,33,6,23,35.4,34.015291,0.960884
1,exc_wl,183,138,266,128,90,161.0,60.27935,0.374406
2,layout_x,215,245,176,192,34,172.4,72.989314,0.423372
3,intensity_min,615,511,498,414,232,454.0,128.054676,0.282059
4,pos_x,495,204,498,368,142,341.4,146.603683,0.429419
5,params4,365,237,523,353,69,309.4,150.765513,0.487283
6,params5,485,560,430,345,120,388.0,151.281195,0.3899
7,intensity_std,428,367,687,419,209,422.0,154.054536,0.365058
8,intensity_mean,312,571,505,278,137,360.6,157.671304,0.437247
9,params2,686,602,750,414,292,548.8,171.027951,0.31164


# submission

In [31]:
final_pred = pred_value
score = metric
sample_submission["target"] = final_pred
sample_submission.to_csv("../../../20200529atma/result/atmacup3_sample_submission"+str(score)[:-10]+".csv", index = False)

In [32]:
sample_submission.head()

Unnamed: 0,target
0,0.000186
1,0.000272
2,0.00027
3,0.000285
4,0.005305
