In [3]:
import pandas as pd 
import numpy as np
import sklearn
import pickle
import matplotlib.pyplot as plt
import lazypredict
import xgboost as xgb

# from skopt import BayesSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV, cross_val_score

# import optuna
# from lazypredict.Supervised import LazyClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

from pytorch_tabnet.tab_model import TabNetClassifier
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Neural networks
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Add, Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate, Reshape, Conv1D, AveragePooling1D, Multiply, MaxPool1D, Activation
from tensorflow.keras.regularizers import l1, l2
from scikeras.wrappers import KerasClassifier
from tensorflow_addons.layers import WeightNormalization

In [13]:
import tensorflow as tf

# Show the installed TensorFlow version in the cell output.
print(tf.__version__)

2.10.1


In [5]:
def make_submission(preds, labels_path='test_labels_sorted.npy', expected_len=5000):
    """Build the submission table pairing each test id with its predicted class.

    Parameters
    ----------
    preds : array-like
        One predicted class per test sample, in the same order as the ids
        stored in ``labels_path``.
    labels_path : str, default 'test_labels_sorted.npy'
        Path of the ``.npy`` file that holds the test-sample ids.
    expected_len : int or None, default 5000
        Required number of predictions. Pass ``None`` to skip this check.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``class``, one row per prediction.

    Raises
    ------
    AssertionError
        If the number of predictions differs from ``expected_len`` or from
        the number of ids read from ``labels_path``.
    """
    # Convert up front so a pandas Series with a custom index cannot be
    # re-aligned against the ids by the DataFrame constructor.
    preds = np.asarray(preds)
    assert expected_len is None or len(preds) == expected_len, (
        f"expected {expected_len} predictions, got {len(preds)}"
    )

    test_labels = np.load(labels_path)
    # Row-wise pairing would silently drop the tail of the longer input;
    # fail loudly instead so ids and predictions can never be misaligned.
    assert len(test_labels) == len(preds), (
        f"{len(test_labels)} ids in {labels_path!r} but {len(preds)} predictions"
    )

    # Build the frame in one shot: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every row.
    return pd.DataFrame({'id': test_labels, 'class': preds})

In [6]:
def plot_confusion_matrix(y_val_from_train: np.ndarray, y_pred_from_train: np.ndarray) -> None:
    """Draw the confusion matrix of predictions against the true labels.

    Parameters
    ----------
    y_val_from_train : np.ndarray
        Ground-truth labels, passed as the first argument to
        ``confusion_matrix``.
    y_pred_from_train : np.ndarray
        Predicted labels, passed as the second argument to
        ``confusion_matrix``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(8, 5))
    matrix = confusion_matrix(y_val_from_train, y_pred_from_train)
    # Draw onto the axes created above so the figsize is honoured.
    ConfusionMatrixDisplay(matrix).plot(ax=ax)
    plt.show()

## Load Data

In [7]:
# Load X_train
# Read the training feature matrix from disk, then show its shape and contents.
X_train = np.load('X_train_64_std.npy')
print(X_train.shape)
X_train

(9000, 64)


array([[ 0.66868409,  0.16353656,  0.63277468, ..., -0.68600237,
        -0.75071056, -0.6719989 ],
       [ 0.37202545,  0.27579237,  0.65904756, ...,  0.06479231,
        -0.48439794,  0.56611352],
       [-1.67047552, -1.2187742 , -2.37040642, ..., -0.65838631,
        -0.38095167, -0.1542973 ],
       ...,
       [-0.24270722, -0.14209669, -0.32285351, ...,  0.37591901,
        -0.37936549,  0.58761686],
       [ 0.38630325,  0.31266521,  0.37767149, ...,  0.83458681,
         0.3138927 ,  0.79569199],
       [ 0.33077964, -0.0822815 , -0.51143673, ..., -1.1650511 ,
        -0.410986  ,  0.44873132]])

In [8]:
# Load y_train
# Read the training targets from disk, then show their shape and contents.
y_train = np.load('y_train.npy')
print(y_train.shape)
y_train

(9000,)


array([ 7, 13,  6, ...,  3,  3, 18], dtype=int64)

In [9]:
# Load X_train
# Read the test feature matrix (X_test) from disk, then show its shape and contents.
X_test = np.load('X_test_64_std.npy')
print(X_test.shape)
X_test

(5000, 64)


array([[-0.59806336,  0.6264903 , -0.15256881, ...,  1.17376188,
         0.23880484,  0.48877589],
       [-0.47352949,  0.94441404, -0.23499654, ..., -0.41222519,
        -0.40989637, -0.65814035],
       [-0.64168924, -1.56209892, -1.06875674, ...,  0.3385773 ,
        -0.56342902,  0.74796695],
       ...,
       [ 1.00103725, -0.3371114 ,  0.37115316, ..., -1.16055904,
        -0.46010399, -0.62722221],
       [-0.98752639, -1.20156778, -1.41423994, ..., -0.81406819,
        -0.78249592,  1.12406601],
       [-0.21177287, -0.85742305, -0.54931118, ...,  0.02750228,
        -0.76809252, -1.25225668]])

## Create Ensemble

In [12]:
# One seed shared by every stochastic base learner, so that re-running the
# notebook rebuilds the same ensemble (previously only LightGBM was seeded).
SEED = 42

etc_clf = ExtraTreesClassifier(
    n_jobs=-1, n_estimators=933, max_depth=79, min_samples_split=2, min_samples_leaf=1,
    bootstrap=False, warm_start=False, random_state=SEED,
)

gb_clf = GradientBoostingClassifier(
    learning_rate=0.2, max_depth=5, min_samples_leaf=0.1,
    min_samples_split=0.1363, subsample=1.0, random_state=SEED,
)

# NOTE(review): `min_child_samples` and `min_data_in_leaf` are documented
# LightGBM aliases of the same setting but carry different values here
# (12 vs 50), so one of them is ignored. Both are left untouched to avoid
# changing the tuned model -- confirm which was intended and drop the other.
lgb_clf = lgb.LGBMClassifier(
    objective='multiclass', num_class=20, n_jobs=-1, seed=SEED, boosting='dart',
    min_child_samples=12, num_iterations=1936, num_leaves=66, min_data_in_leaf=50,
    max_bin=20, max_depth=17, learning_rate=0.24, reg_alpha=0.0004127769671094072,
)

cbc_clf = CatBoostClassifier(random_seed=SEED)

tabnet = TabNetClassifier(
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size": 10, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
)
# Tag the TabNet model as a classifier; presumably required for
# StackingClassifier to accept it as a base estimator -- TODO confirm.
tabnet._estimator_type = "classifier"

# Base learners feed a logistic-regression meta-learner.
estimators = [("etc", etc_clf), ("gb", gb_clf), ("lgb", lgb_clf), ("cbc", cbc_clf), ('tabnet', tabnet)]
ensemble_estimators = StackingClassifier(estimators, final_estimator=LogisticRegression(), n_jobs=-1)



In [11]:
%%time
# Fit the stacking ensemble on X_train / y_train; %%time reports how long the cell takes.
ensemble_estimators.fit(X_train, y_train)

Wall time: 13min 32s


StackingClassifier(estimators=[('etc',
                                ExtraTreesClassifier(max_depth=79,
                                                     n_estimators=933,
                                                     n_jobs=-1)),
                               ('gb',
                                GradientBoostingClassifier(learning_rate=0.2,
                                                           max_depth=5,
                                                           min_samples_leaf=0.1,
                                                           min_samples_split=0.1363)),
                               ('lgb',
                                LGBMClassifier(boosting='dart',
                                               learning_rate=0.24, max_bin=20,
                                               max_depth=17,
                                               min_child_samples=12,
                                               min_data_in_leaf=50,
                     

In [None]:
%%time
results_list = []
estimator_list = []
cv = StratifiedKFold(n_splits=5)

for estimator_name, clf in estimators:
    results = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    estimator_list.append(estimator_name)
    results_list.append(results)
    
    print('>%s %.3f (%.3f)' % (estimator_name, np.mean(results), np.std(results)))

>etc 0.956 (0.006)
>gb 0.957 (0.005)
>lgb 0.961 (0.005)
>cbc 0.961 (0.003)




>tabnet 0.935 (0.006)


## Create Submission

In [12]:
# Predict a class for every row of X_test with the ensemble, then display the predictions.
preds = ensemble_estimators.predict(X_test)
preds

array([3, 4, 5, ..., 4, 5, 1], dtype=int64)

In [13]:
# Pair the predictions with the test ids via make_submission and display the table.
sub = make_submission(preds)
sub

Unnamed: 0,id,class
0,10001,3
1,10002,4
2,10004,5
3,10008,20
4,10009,13
...,...,...
4995,23986,9
4996,23991,12
4997,23992,4
4998,23998,5


In [14]:
# Write the submission table to CSV; index=False leaves the DataFrame index out of the file.
sub.to_csv('submission_new_ensemble_baseline_64f_std_5_models.csv', index=False)