In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

In [97]:
train = pd.read_csv('./data/train.csv')
display(train.isnull().sum())
train['loading'].fillna(value=127.74233678955453, inplace=True)
train.drop(['id', 'product_code', 'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3'], axis='columns', inplace=True)
train.reset_index(drop=True, inplace=True)
train.columns = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

train

id                   0
product_code         0
loading            250
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      381
measurement_4      538
measurement_5      676
measurement_6      796
measurement_7      937
measurement_8     1048
measurement_9     1227
measurement_10    1300
measurement_11    1468
measurement_12    1601
measurement_13    1774
measurement_14    1874
measurement_15    2009
measurement_16    2110
measurement_17    2284
failure              0
dtype: int64

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,80.10,7,8,4,18.040,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.100,0
1,84.89,14,3,3,18.213,11.540,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,82.43,12,1,5,18.057,11.652,16.738,18.240,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,101.07,13,2,6,17.295,11.188,18.576,18.339,12.583,19.060,12.471,16.346,18.377,10.020,15.250,15.562,16.154,17.172,826.282,0
4,188.06,9,2,8,19.346,12.950,16.990,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.760,13.153,16.412,579.885,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,158.95,6,16,4,16.301,13.259,18.068,15.505,10.865,19.354,,12.177,17.942,10.112,15.795,18.572,16.144,,729.131,0
26566,146.02,10,12,8,17.543,,17.984,19.078,11.139,19.563,11.242,14.179,20.564,10.234,14.450,14.322,13.146,16.471,853.924,0
26567,115.62,1,10,1,15.670,11.535,16.778,18.385,11.630,19.279,11.407,16.437,17.476,8.668,15.069,16.599,15.590,14.065,750.364,0
26568,106.38,2,9,4,18.059,,16.918,18.101,11.713,19.358,11.392,17.064,17.814,14.928,16.273,15.485,13.624,12.865,730.156,0


In [123]:
target = 19
if "Set" not in train.columns:
    train["Set"] = np.random.choice(["train", "valid"], p =[.8, .1], size=(train.shape[0],))
    
train_indices = train[train.Set=="train"].index
valid_indices = train[train.Set=="valid"].index
# test_indices = train[train.Set=="test"].index

            0   1   2  3       4           5       6       7       8       9  \
0       80.10   7   8  4  18.040   12.518000  15.748  19.292  11.739  20.155   
1       84.89  14   3  3  18.213   11.540000  17.717  17.893  12.748  17.889   
2       82.43  12   1  5  18.057   11.652000  16.738  18.240  12.718  18.288   
3      101.07  13   2  6  17.295   11.188000  18.576  18.339  12.583  19.060   
4      188.06   9   2  8  19.346   12.950000  16.990  15.746  11.306  18.093   
...       ...  ..  .. ..     ...         ...     ...     ...     ...     ...   
26565  158.95   6  16  4  16.301   13.259000  18.068  15.505  10.865  19.354   
26566  146.02  10  12  8  17.543  127.753606  17.984  19.078  11.139  19.563   
26567  115.62   1  10  1  15.670   11.535000  16.778  18.385  11.630  19.279   
26568  106.38   2   9  4  18.059  127.753606  16.918  18.101  11.713  19.358   
26569  131.20   6  19  1  18.034   11.431000  16.918  17.129  12.713  18.731   

       ...      11          12      13 

In [122]:
train_indices

Int64Index([], dtype='int64')

In [118]:
nunique = train.nunique()
types = train.dtypes

categorical_columns = []
categorical_dims =  {}
for col in train.columns:
    if types[col] == 'object' or nunique[col] < 200:
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        train.fillna(train.loc[train_indices, col].mean(), inplace=True)


# Categorical Embedding을 위해 Categorical 변수의 차원과 idxs를 담음.
unused_feat = ['Set']
features = [ col for col in train.columns if col not in unused_feat+[target]] 
cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]



X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

# X_test = train[features].values[test_indices]
# y_test = train[target].values[test_indices]

1 29
2 30
3 25
19 2
Set 2


In [119]:
X_train

array([], shape=(0, 19), dtype=float64)

In [102]:
clf = TabNetClassifier(cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=10,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )

Device used : cuda


In [103]:
max_epochs = 50

clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.