# SW중심대학 공동 AI 경진대회

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## 데이터 불러오기

In [2]:
# pd.set_option('display.max_columns', None) # 모든 컬럼 생략없이 출력

In [3]:
# Load the competition train/test splits and discard the 'index' column,
# which the original author marked as carrying no useful information.
train = pd.read_csv("competition_data/train.csv").drop(columns=['index'])
test = pd.read_csv("competition_data/test.csv").drop(columns=['index'])

## 데이터 확인

### train셋 확인

In [4]:
train.head(10)  # preview the first 10 rows of the training set

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,engnat,age,hand,religion,orientation,voted,married,familysize,ASD,nerdiness
0,1.0,5.0,5.0,5.0,1.0,4.0,5.0,5.0,1.0,3.0,...,1.0,20,2.0,12.0,4.0,2.0,1.0,4.0,2.0,1
1,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,3.0,...,1.0,49,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1
2,4.0,5.0,5.0,4.0,3.0,5.0,5.0,5.0,4.0,4.0,...,2.0,43,1.0,2.0,2.0,2.0,3.0,4.0,2.0,1
3,4.0,4.0,4.0,2.0,4.0,3.0,3.0,5.0,3.0,4.0,...,1.0,17,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1
4,4.0,4.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,4.0,...,2.0,18,2.0,12.0,1.0,2.0,1.0,1.0,2.0,0
5,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,3.0,...,1.0,26,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
6,4.0,3.0,4.0,3.0,5.0,4.0,5.0,4.0,5.0,5.0,...,2.0,40,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1
7,4.0,5.0,4.0,4.0,4.0,4.0,2.0,5.0,3.0,4.0,...,2.0,34,1.0,2.0,5.0,1.0,1.0,2.0,2.0,1
8,4.0,4.0,3.0,4.0,4.0,5.0,4.0,3.0,3.0,4.0,...,1.0,20,1.0,7.0,1.0,1.0,1.0,3.0,2.0,0
9,3.0,3.0,4.0,3.0,4.0,2.0,4.0,2.0,4.0,4.0,...,2.0,17,1.0,10.0,1.0,2.0,1.0,5.0,2.0,0


In [5]:
train.info()  # inspect column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 69 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Q1            14959 non-null  float64
 1   Q2            14931 non-null  float64
 2   Q3            14950 non-null  float64
 3   Q4            14929 non-null  float64
 4   Q5            14962 non-null  float64
 5   Q6            14952 non-null  float64
 6   Q7            14924 non-null  float64
 7   Q8            14952 non-null  float64
 8   Q9            14944 non-null  float64
 9   Q10           14928 non-null  float64
 10  Q11           14941 non-null  float64
 11  Q12           14933 non-null  float64
 12  Q13           14960 non-null  float64
 13  Q14           14964 non-null  float64
 14  Q15           14955 non-null  float64
 15  Q16           14967 non-null  float64
 16  Q17           14963 non-null  float64
 17  Q18           14937 non-null  float64
 18  Q19           14947 non-nu

## 라벨 인코딩

In [8]:
# Encode 'country' against ONE category list shared by train and test, so a
# given integer code refers to the same country in both frames. Building the
# Categorical separately per frame numbers the countries independently, and
# the codes then disagree whenever the two frames do not contain exactly the
# same set of countries.
# Missing countries are not categories; pandas gives them the code -1.
country_categories = pd.Categorical(
    pd.concat([train['country'], test['country']], ignore_index=True)
).categories
train_codes = pd.Categorical(train['country'], categories=country_categories)
test_codes = pd.Categorical(test['country'], categories=country_categories)

In [9]:
# Replace the raw country values with the integer codes of the Categoricals
# built in the previous cell.
train['country'] = train_codes.codes
test['country'] = test_codes.codes
train['country']  # display the encoded training column

0        130
1        130
2         94
3        130
4         64
        ... 
14995    130
14996    130
14997    130
14998    130
14999     17
Name: country, Length: 15000, dtype: int16

In [10]:
# Cast every column of both frames to float64, then re-check the dtypes.
train = train.astype('float64')
test = test.astype('float64')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 69 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Q1            14959 non-null  float64
 1   Q2            14931 non-null  float64
 2   Q3            14950 non-null  float64
 3   Q4            14929 non-null  float64
 4   Q5            14962 non-null  float64
 5   Q6            14952 non-null  float64
 6   Q7            14924 non-null  float64
 7   Q8            14952 non-null  float64
 8   Q9            14944 non-null  float64
 9   Q10           14928 non-null  float64
 10  Q11           14941 non-null  float64
 11  Q12           14933 non-null  float64
 12  Q13           14960 non-null  float64
 13  Q14           14964 non-null  float64
 14  Q15           14955 non-null  float64
 15  Q16           14967 non-null  float64
 16  Q17           14963 non-null  float64
 17  Q18           14937 non-null  float64
 18  Q19           14947 non-nu

## 데이터 전처리 전 시각화

In [11]:
# Correlation heat-map over every column of `train` (before preprocessing).
# The correlation matrix is computed once and reused for the mask and the plot.
corr_matrix = train.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# those cells only repeat the lower triangle.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(40, 30))
sns.heatmap(corr_matrix, annot=True, cmap='Blues', fmt='.2f', mask=mask)

In [12]:
# Bar chart of each column's absolute correlation with the target, strongest
# first. The leading entry is skipped; it is expected to be `nerdiness`
# itself (correlation 1 with itself).
target_corr = abs(train.corr()['nerdiness']).sort_values(ascending=False)[1:]

plt.figure(figsize=(12, 9))
plt.xticks(rotation=90)
sns.barplot(x=target_corr.index, y=target_corr)

In [13]:
plt.style.use('ggplot')

# Distribution of every column of `train`, split by the target `nerdiness`.
plt.figure(figsize=(30, 30))
plt.suptitle("Data Histogram", fontsize=40)

# One subplot per column of `train`.
cols = train.columns
for position, column in enumerate(cols, start=1):
    plt.subplot(9, 9, position)  # 9 x 9 grid: room for up to 81 columns
    plt.title(column, fontsize=15)  # title each panel with its column name
    distinct_values = train[column].unique()
    if len(distinct_values) < 20:
        # Fewer than 20 distinct values: one bar per value, per target class.
        sns.countplot(data=train, x=train[column], hue='nerdiness')
    else:
        # 20 or more distinct values: 20-bin histogram, per target class.
        plt.xticks(rotation=90)  # rotate the x tick labels
        sns.histplot(data=train, x=train[column], hue='nerdiness', bins=20)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

## 이상치 처리 & 결측치 대체 (KNN Imputer)

### 이상치 확인

In [14]:
# One box plot per column, laid out on a 9 x 9 grid, to eyeball outliers.
train.plot(kind='box', subplots=True, layout=(9, 9), figsize=(21, 21))
plt.show()

### 이상치 처리

In [15]:
def outliers_iqr(data):
    """Locate the values of ``data`` lying outside the 1.5 * IQR fences.

    Args:
        data: A pandas Series (it must provide ``quantile`` and support
            element-wise comparison).

    Returns:
        The tuple produced by ``np.where``; its first element is an array
        with the zero-based positions of the outlying values.
    """
    lower_quartile = data.quantile(.25)
    upper_quartile = data.quantile(.75)
    spread = upper_quartile - lower_quartile

    low_fence = lower_quartile - (spread * 1.5)
    high_fence = upper_quartile + (spread * 1.5)

    is_outlier = (data > high_fence) | (data < low_fence)
    return np.where(is_outlier)

### train 이상치 처리

In [16]:
# Positions of the IQR outliers in each continuous column of `train`.
# outliers_iqr returns np.where positions; they are used below as `.loc`
# labels, which matches as long as `train` still has its default RangeIndex.
introelapse_index_data = outliers_iqr(train['introelapse'])[0]
testelapse_index_data = outliers_iqr(train['testelapse'])[0]
surveyelapse_index_data = outliers_iqr(train['surveyelapse'])[0]
age_index_data = outliers_iqr(train['age'])[0]
family_index_data = outliers_iqr(train['familysize'])[0]

train.loc[family_index_data, 'familysize'].value_counts()  # inspect the flagged family sizes before deciding which of them to treat

In [17]:
# Blank the flagged outliers so that the imputer can fill them in later.
train.loc[introelapse_index_data, 'introelapse'] = np.nan
train.loc[testelapse_index_data, 'testelapse'] = np.nan
train.loc[surveyelapse_index_data, 'surveyelapse'] = np.nan
train.loc[age_index_data, 'age'] = np.nan
train.loc[train['familysize'] > 5, 'familysize'] = np.nan  # fixed threshold instead of the IQR result: family sizes above 5 (6 or more) become NaN

train['familysize'].isna().sum()  # number of missing familysize values after blanking

### test 이상치 처리

In [18]:
# Same outlier detection for `test`; the position arrays computed for `train`
# are overwritten here.
# Positions are again used as `.loc` labels below — assumes `test` has a
# default RangeIndex.
introelapse_index_data = outliers_iqr(test['introelapse'])[0]
testelapse_index_data = outliers_iqr(test['testelapse'])[0]
surveyelapse_index_data = outliers_iqr(test['surveyelapse'])[0]
age_index_data = outliers_iqr(test['age'])[0]
family_index_data = outliers_iqr(test['familysize'])[0]

test.loc[family_index_data, 'familysize'].value_counts()  # inspect the flagged family sizes

In [19]:
# Blank the flagged outliers in `test` so that the imputer can fill them in.
test.loc[introelapse_index_data, 'introelapse'] = np.nan
test.loc[testelapse_index_data, 'testelapse'] = np.nan
test.loc[surveyelapse_index_data, 'surveyelapse'] = np.nan
test.loc[age_index_data, 'age'] = np.nan
test.loc[test['familysize'] > 5, 'familysize'] = np.nan  # family sizes of 6 or more are treated as outliers; author's note: the IQR helper above did not handle this column, for reasons unknown

test['familysize'].isna().sum()  # number of missing familysize values after blanking

In [20]:
# from sklearn.impute import SimpleImputer
#
# fill = SimpleImputer(missing_values=np.nan, strategy='mean')
# train = pd.DataFrame(fill.fit_transform(train), columns=train.columns).astype('int64')
# test = pd.DataFrame(fill.fit_transform(test), columns=test.columns).astype('int64')

In [21]:
from sklearn.impute import KNNImputer

# Keep only the training rows that have at least 68 non-missing values.
train = train.dropna(thresh=68)

# Fill the remaining NaNs (including the outliers blanked above) from the 10
# nearest rows, weighted by inverse distance. Each result is rebuilt into a
# DataFrame with the original column names and a fresh default index.
# NOTE(review): the imputer is re-fitted on `test` instead of reusing the fit
# from `train`, so the two frames are imputed from different neighbours —
# confirm this is intended.
# NOTE(review): `.astype('int64')` truncates the imputed floats toward zero
# rather than rounding them — confirm this is intended.
fill = KNNImputer(n_neighbors=10, weights='distance')
train = pd.DataFrame(fill.fit_transform(train), columns=train.columns).astype('int64')
test = pd.DataFrame(fill.fit_transform(test), columns=test.columns).astype('int64')

## 데이터 보정 확인

In [29]:
plt.style.use('ggplot')

# Distribution of every column of `train` after outlier handling and
# imputation, split by the target `nerdiness`.
plt.figure(figsize=(30, 30))
plt.suptitle("Data Histogram", fontsize=40)

# One subplot per column of `train`.
cols = train.columns
for position, column in enumerate(cols, start=1):
    plt.subplot(9, 9, position)  # 9 x 9 grid: room for up to 81 columns
    plt.title(column, fontsize=15)  # title each panel with its column name
    distinct_values = train[column].unique()
    if len(distinct_values) < 20:
        # Fewer than 20 distinct values: one bar per value, per target class.
        sns.countplot(data=train, x=train[column], hue='nerdiness')
    else:
        # 20 or more distinct values: 20-bin histogram, per target class.
        plt.xticks(rotation=90)  # rotate the x tick labels
        sns.histplot(data=train, x=train[column], hue='nerdiness', bins=20)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [30]:
# Correlation heat-map over every column of `train` (after preprocessing).
# The correlation matrix is computed once and reused for the mask and the plot.
corr_matrix = train.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# those cells only repeat the lower triangle.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(40, 30))
sns.heatmap(corr_matrix, annot=True, cmap='Blues', fmt='.2f', mask=mask)

In [31]:
# Bar chart of each column's absolute correlation with the target after
# preprocessing, strongest first. The leading entry is skipped; it is
# expected to be `nerdiness` itself (correlation 1 with itself).
target_corr = abs(train.corr()['nerdiness']).sort_values(ascending=False)[1:]

plt.figure(figsize=(12, 9))  # figure size
plt.xticks(rotation=90)  # rotate the x tick labels
sns.barplot(x=target_corr.index, y=target_corr)

In [33]:
# Remove the encoded 'country' column from both frames and cast the remaining
# columns back to float64.
train = train.drop('country', axis=1).astype('float64')
test = test.drop('country', axis=1).astype('float64')

## 모델 탐색

In [34]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold

In [35]:
# Submission template; its 'nerdiness' column is filled with predictions at
# the end of the notebook.
submission = pd.read_csv("competition_data/sample_submission.csv")
#
# models = [
#     RandomForestClassifier(n_estimators=1000),
#     XGBClassifier(n_estimators=1200),
#     LGBMClassifier(n_estimators=4400),
#     ExtraTreesClassifier(n_estimators=2400)
# ]
#
# for model in models:
#     print(f'{type(model).__name__} score: {kfold(model, train)}')

In [36]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import LabelEncoder

# Collect the categorical feature columns for TabNet.
#
# `cols` is every column except the target, in frame order, so each position
# `i` below is also that feature's column position in
# `train.drop('nerdiness', axis=1).values`. This replaces the hand-maintained
# `[:67]` slice, which had to be edited whenever columns were added or removed
# (68 by default, 67 without country, 10 fewer without the TIPI items).
cols = train.columns.drop('nerdiness')
nunique = train[cols].nunique()

cat_idxs = []  # column positions of the categorical features
cat_dims = []  # number of distinct codes of each categorical feature

for i, col in enumerate(cols):
    # A feature with fewer than 20 distinct values is treated as categorical.
    if nunique[col] < 20 or col == 'country':
        # Fit ONE encoder on the values of both frames. Refitting it on `test`
        # would (a) map the same raw value to different codes in train and
        # test whenever their value sets differ and (b) leave `classes_`, and
        # therefore cat_dims, describing only `test`.
        # NOTE(review): a pretrainer checkpoint produced with the earlier
        # per-frame encoding may no longer match these codes / cat_dims and
        # may need to be regenerated.
        enc = LabelEncoder()
        enc.fit(np.concatenate([train[col].values, test[col].values]))
        train[col] = enc.transform(train[col].values)
        test[col] = enc.transform(test[col].values)

        cat_idxs.append(i)
        cat_dims.append(len(enc.classes_))

len(cat_idxs), len(cat_dims)

(63, 63)

In [37]:
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.augmentations import ClassificationSMOTE

U_EPOCHS = 1500  # max epochs for the unsupervised (pre-training) stage
T_EPOCHS = 1500  # max epochs for the supervised (classifier) stage

# NumPy views of the data: every column except the target as features, the
# target column as labels, and the whole test frame as test features.
X_train = train.drop('nerdiness', axis=1).values
y_train = train['nerdiness'].values
X_test = test.values

In [38]:
# Constructor arguments for TabNetPretrainer (unsupervised stage).
U_TABNET_PARAMS = {
    'n_d': 64,
    'n_a': 64,
    'n_independent': 4,
    'n_shared': 4,
    # 'n_steps': 3,
    'n_shared_decoder': 4,
    'n_indep_decoder': 4,
    # Categorical column positions / cardinalities collected in the encoding cell.
    'cat_idxs': cat_idxs,
    'cat_dims': cat_dims,
    # NOTE(review): this passes the NUMBER of categorical features as the
    # embedding size; presumably a single int is applied to every categorical
    # column — confirm against the pytorch-tabnet docs that this is intended.
    'cat_emb_dim': len(cat_dims),

    'optimizer_fn': torch.optim.AdamW,
    'optimizer_params': {'lr': 0.03,
                         'weight_decay': 0.03
                         },
    # Cosine schedule with warm restarts; the first cycle lasts a tenth of the
    # epoch budget.
    'scheduler_fn': torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
    'scheduler_params': {'T_0': U_EPOCHS // 10,
                         'T_mult': 2,
                         'eta_min': 0.00001
                         },
    'mask_type': 'entmax',
    'gamma': 0.9,
    # 'lambda_sparse': 0.0001,
}


def unsupervised_kfold(train, scale=False):
    """Pre-train a fresh TabNetPretrainer on each of 5 stratified folds.

    Args:
        train: DataFrame holding the feature columns plus the ``nerdiness``
            target. The target is used only to stratify the split and is
            removed from the matrices given to the model.
        scale: Accepted but not used by this function.

    Returns:
        None. Each fold's fitted model is discarded when the next fold starts.
    """
    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    for fit_rows, holdout_rows in splitter.split(train, train['nerdiness']):
        pretrainer = TabNetPretrainer(**U_TABNET_PARAMS)

        # Feature matrices for the training part and the validation part.
        fit_features = train.iloc[fit_rows].drop('nerdiness', axis=1).values
        holdout_features = train.iloc[holdout_rows].drop('nerdiness', axis=1).values

        pretrainer.fit(
            X_train=fit_features,
            eval_set=[holdout_features],
            eval_name=['val'],
            max_epochs=U_EPOCHS,
            patience=U_EPOCHS // 3,  # stop after a third of the budget without improvement
            batch_size=2048,
            virtual_batch_size=1024,
            pretraining_ratio=0.2,
        )

In [39]:
# unsupervised_kfold(train)

In [40]:
# fit() arguments for pre-training on the full training matrix. The
# evaluation set is the training data itself, so early stopping follows the
# training loss. In the visible notebook this dict is only referenced by the
# commented-out pre-training cell.
U_FIT_PARAMS = {
    'X_train': X_train,
    'eval_set': [X_train],
    'eval_name': ['train'],
    'max_epochs': U_EPOCHS,
    'patience': U_EPOCHS // 3,
    'batch_size': 2048,
    'virtual_batch_size': 1024,
    'pretraining_ratio': 0.2
}

In [41]:
# pretrain_model = TabNetPretrainer(**U_TABNET_PARAMS)
# pretrain_model.fit(**U_FIT_PARAMS)

In [42]:
# pretrain_model.save_model('./tabnetpretrain')

In [43]:
# Restore a previously saved pretrainer from disk instead of re-training it
# (the cells that fit and save it are commented out above).
loaded_pretrain = TabNetPretrainer(**U_TABNET_PARAMS)
loaded_pretrain.load_model('./tabnetpretrain.zip')



In [44]:
# Constructor arguments for TabNetClassifier (supervised stage).
TABNET_PARAMS = {
    'optimizer_fn': torch.optim.AdamW,
    'optimizer_params': {'lr': 0.01,
                         'weight_decay': 0.03
                         },
    # Cosine schedule with warm restarts; the first cycle lasts a tenth of the
    # epoch budget.
    'scheduler_fn': torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
    'scheduler_params': {'T_0': T_EPOCHS // 10,
                         'T_mult': 2,
                         'eta_min': 0.00001
                         },
    'mask_type': 'entmax',
    'gamma': 1,
}

# Training-time augmentation passed to fit(); presumably `p` is the probability
# of augmenting a sample — confirm against the pytorch-tabnet docs.
aug = ClassificationSMOTE(p=0.2)


def clf_kfold(train, scale=False):
    """Return the mean validation ROC-AUC of TabNet over 5 stratified folds.

    For every fold a new TabNetClassifier is built from ``TABNET_PARAMS``,
    warm-started from ``loaded_pretrain``, trained on the fold's training part
    and scored on its validation part.

    Args:
        train: DataFrame holding the feature columns plus the ``nerdiness``
            target column.
        scale: Accepted but not used by this function.

    Returns:
        The arithmetic mean of the per-fold ROC-AUC values.
    """
    fold_scores = []
    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    for fit_rows, holdout_rows in splitter.split(train, train['nerdiness']):
        classifier = TabNetClassifier(**TABNET_PARAMS)

        fit_part = train.iloc[fit_rows]  # training part of the fold
        holdout_part = train.iloc[holdout_rows]  # validation part of the fold

        fit_features = fit_part.drop('nerdiness', axis=1).values
        fit_target = fit_part['nerdiness'].values

        holdout_features = holdout_part.drop('nerdiness', axis=1).values
        holdout_target = holdout_part['nerdiness'].values

        classifier.fit(
            X_train=fit_features,
            y_train=fit_target,
            eval_set=[(holdout_features, holdout_target)],
            eval_name=['val'],
            eval_metric=['auc'],
            max_epochs=T_EPOCHS,
            patience=T_EPOCHS // 3,  # stop after a third of the budget without improvement
            batch_size=4096,
            virtual_batch_size=4096,
            weights=1,  # presumably enables class-balanced sampling — confirm in the pytorch-tabnet docs
            augmentations=aug,
            from_unsupervised=loaded_pretrain,
        )

        # Area under the ROC curve from the predicted probability of class 1.
        positive_proba = classifier.predict_proba(holdout_features)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(holdout_target, positive_proba)
        fold_scores.append(metrics.auc(fpr, tpr))

    return np.mean(fold_scores)

In [45]:
clf_kfold(train)  # run the 5-fold CV; the cell displays the returned mean AUC



epoch 0  | loss: 0.83968 | val_auc: 0.63002 |  0:00:00s
epoch 1  | loss: 0.67462 | val_auc: 0.73372 |  0:00:01s
epoch 2  | loss: 0.65048 | val_auc: 0.76199 |  0:00:01s
epoch 3  | loss: 0.62253 | val_auc: 0.77208 |  0:00:01s
epoch 4  | loss: 0.60343 | val_auc: 0.77291 |  0:00:02s
epoch 5  | loss: 0.59386 | val_auc: 0.77249 |  0:00:02s
epoch 6  | loss: 0.58273 | val_auc: 0.77611 |  0:00:02s
epoch 7  | loss: 0.57255 | val_auc: 0.77866 |  0:00:03s
epoch 8  | loss: 0.57182 | val_auc: 0.78004 |  0:00:03s
epoch 9  | loss: 0.56131 | val_auc: 0.77992 |  0:00:03s
epoch 10 | loss: 0.5675  | val_auc: 0.77626 |  0:00:03s
epoch 11 | loss: 0.56801 | val_auc: 0.77433 |  0:00:04s
epoch 12 | loss: 0.55303 | val_auc: 0.77916 |  0:00:04s
epoch 13 | loss: 0.54636 | val_auc: 0.78303 |  0:00:04s
epoch 14 | loss: 0.54636 | val_auc: 0.78023 |  0:00:05s
epoch 15 | loss: 0.53815 | val_auc: 0.7747  |  0:00:05s
epoch 16 | loss: 0.54582 | val_auc: 0.77537 |  0:00:05s
epoch 17 | loss: 0.53723 | val_auc: 0.77622 |  0



epoch 0  | loss: 0.85735 | val_auc: 0.62438 |  0:00:00s
epoch 1  | loss: 0.66826 | val_auc: 0.73657 |  0:00:00s
epoch 2  | loss: 0.64544 | val_auc: 0.76556 |  0:00:00s
epoch 3  | loss: 0.62248 | val_auc: 0.77141 |  0:00:01s
epoch 4  | loss: 0.59974 | val_auc: 0.77344 |  0:00:01s
epoch 5  | loss: 0.58282 | val_auc: 0.7692  |  0:00:01s
epoch 6  | loss: 0.57376 | val_auc: 0.77298 |  0:00:02s
epoch 7  | loss: 0.57251 | val_auc: 0.77843 |  0:00:02s
epoch 8  | loss: 0.56814 | val_auc: 0.78207 |  0:00:02s
epoch 9  | loss: 0.56623 | val_auc: 0.78599 |  0:00:03s
epoch 10 | loss: 0.5671  | val_auc: 0.78651 |  0:00:03s
epoch 11 | loss: 0.56118 | val_auc: 0.78314 |  0:00:03s
epoch 12 | loss: 0.55514 | val_auc: 0.78629 |  0:00:04s
epoch 13 | loss: 0.54203 | val_auc: 0.79253 |  0:00:04s
epoch 14 | loss: 0.54339 | val_auc: 0.79532 |  0:00:04s
epoch 15 | loss: 0.54387 | val_auc: 0.79558 |  0:00:04s
epoch 16 | loss: 0.53712 | val_auc: 0.79501 |  0:00:05s
epoch 17 | loss: 0.53001 | val_auc: 0.79665 |  0



epoch 0  | loss: 0.83504 | val_auc: 0.62702 |  0:00:00s
epoch 1  | loss: 0.67389 | val_auc: 0.72795 |  0:00:00s
epoch 2  | loss: 0.64524 | val_auc: 0.75727 |  0:00:01s
epoch 3  | loss: 0.62041 | val_auc: 0.76254 |  0:00:01s
epoch 4  | loss: 0.58639 | val_auc: 0.76232 |  0:00:01s
epoch 5  | loss: 0.58552 | val_auc: 0.76329 |  0:00:01s
epoch 6  | loss: 0.57231 | val_auc: 0.76144 |  0:00:02s
epoch 7  | loss: 0.57608 | val_auc: 0.767   |  0:00:02s
epoch 8  | loss: 0.57009 | val_auc: 0.76948 |  0:00:02s
epoch 9  | loss: 0.57131 | val_auc: 0.77006 |  0:00:03s
epoch 10 | loss: 0.57234 | val_auc: 0.76974 |  0:00:03s
epoch 11 | loss: 0.56135 | val_auc: 0.7713  |  0:00:03s
epoch 12 | loss: 0.55267 | val_auc: 0.77424 |  0:00:04s
epoch 13 | loss: 0.55356 | val_auc: 0.77732 |  0:00:04s
epoch 14 | loss: 0.54874 | val_auc: 0.77743 |  0:00:04s
epoch 15 | loss: 0.5402  | val_auc: 0.77782 |  0:00:04s
epoch 16 | loss: 0.54893 | val_auc: 0.77934 |  0:00:05s
epoch 17 | loss: 0.54089 | val_auc: 0.77984 |  0



epoch 0  | loss: 0.85409 | val_auc: 0.61391 |  0:00:00s
epoch 1  | loss: 0.67264 | val_auc: 0.73033 |  0:00:00s
epoch 2  | loss: 0.64295 | val_auc: 0.76254 |  0:00:01s
epoch 3  | loss: 0.6083  | val_auc: 0.76854 |  0:00:01s
epoch 4  | loss: 0.59283 | val_auc: 0.76119 |  0:00:01s
epoch 5  | loss: 0.59481 | val_auc: 0.75455 |  0:00:01s
epoch 6  | loss: 0.57526 | val_auc: 0.76232 |  0:00:02s
epoch 7  | loss: 0.5785  | val_auc: 0.763   |  0:00:02s
epoch 8  | loss: 0.57606 | val_auc: 0.76265 |  0:00:02s
epoch 9  | loss: 0.56833 | val_auc: 0.75428 |  0:00:03s
epoch 10 | loss: 0.55197 | val_auc: 0.75437 |  0:00:03s
epoch 11 | loss: 0.55693 | val_auc: 0.76803 |  0:00:03s
epoch 12 | loss: 0.56004 | val_auc: 0.77475 |  0:00:03s
epoch 13 | loss: 0.55275 | val_auc: 0.77626 |  0:00:04s
epoch 14 | loss: 0.54013 | val_auc: 0.77846 |  0:00:04s
epoch 15 | loss: 0.53401 | val_auc: 0.77927 |  0:00:04s
epoch 16 | loss: 0.5413  | val_auc: 0.7819  |  0:00:05s
epoch 17 | loss: 0.54611 | val_auc: 0.78085 |  0



epoch 0  | loss: 0.85141 | val_auc: 0.62698 |  0:00:00s
epoch 1  | loss: 0.6736  | val_auc: 0.72798 |  0:00:00s
epoch 2  | loss: 0.63834 | val_auc: 0.75753 |  0:00:01s
epoch 3  | loss: 0.62436 | val_auc: 0.76472 |  0:00:01s
epoch 4  | loss: 0.59414 | val_auc: 0.76545 |  0:00:01s
epoch 5  | loss: 0.58276 | val_auc: 0.76322 |  0:00:01s
epoch 6  | loss: 0.58244 | val_auc: 0.76505 |  0:00:02s
epoch 7  | loss: 0.57441 | val_auc: 0.76841 |  0:00:02s
epoch 8  | loss: 0.57332 | val_auc: 0.76952 |  0:00:02s
epoch 9  | loss: 0.55905 | val_auc: 0.76862 |  0:00:03s
epoch 10 | loss: 0.55513 | val_auc: 0.7635  |  0:00:03s
epoch 11 | loss: 0.5541  | val_auc: 0.76456 |  0:00:03s
epoch 12 | loss: 0.5547  | val_auc: 0.77195 |  0:00:04s
epoch 13 | loss: 0.54572 | val_auc: 0.77747 |  0:00:04s
epoch 14 | loss: 0.55075 | val_auc: 0.77949 |  0:00:04s
epoch 15 | loss: 0.54465 | val_auc: 0.77633 |  0:00:04s
epoch 16 | loss: 0.54443 | val_auc: 0.77572 |  0:00:05s
epoch 17 | loss: 0.5347  | val_auc: 0.77917 |  0



0.8131214032029643

In [None]:
# fit() arguments for the final model, trained on the full training matrix.
# The evaluation set is the training data itself, so early stopping follows
# the training AUC rather than a held-out score.
# NOTE(review): batch sizes here (2048 / 1024) differ from the ones used in
# clf_kfold (4096 / 4096) — confirm this is intended.
FIT_PARAMS = {
    'X_train': X_train,
    'y_train': y_train,
    'eval_set': [(X_train, y_train)],
    'eval_name': ['train'],
    'eval_metric': ['auc'],
    'max_epochs': T_EPOCHS,
    'patience': T_EPOCHS // 3,
    'batch_size': 2048,
    'virtual_batch_size': 1024,
    'weights': 1,
    'augmentations': aug,
    'from_unsupervised': loaded_pretrain
}

In [None]:
# Train the final classifier with the parameters defined above.
clf = TabNetClassifier(**TABNET_PARAMS)
clf.fit(**FIT_PARAMS)

In [None]:
# Second column of predict_proba for every test row (the same column that
# clf_kfold scores with ROC-AUC).
clf_pred = clf.predict_proba(X_test)[:, 1]

In [None]:
# Write the predictions into the submission template and display it.
submission['nerdiness'] = clf_pred
submission

In [None]:
# Save the submission file without the DataFrame index column.
submission.to_csv("08_17.csv", index=False)