In [22]:
%config Completer.use_jedi = False

import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import ElasticNetCV, Lasso
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance

import plotly.express as px
import plotly.graph_objects as go

# Load and transform the data

In [2]:
X_art = pd.read_csv('data/artificial_train.data', delimiter=' ', names=[i for i in range(500)], index_col=False).values
y_art = pd.read_csv('data/artificial_train.labels', delimiter=' ', names=['y'], index_col=False).values.flatten()
X_art_val = pd.read_csv('data/artificial_valid.data', delimiter=' ', names=[i for i in range(500)], index_col=False).values
X_dig = pd.read_csv('data/digits_train.data', delimiter=' ', names=[i for i in range(5000)], index_col=False).values
y_dig = pd.read_csv('data/digits_train.labels', delimiter=' ', names=['y'], index_col=False).values.flatten()
X_dif_val = pd.read_csv('data/digits_valid.data', delimiter=' ', names=[i for i in range(5000)], index_col=False).values

In [3]:
print(f'Artifitial dataset: {X_art.shape[0]} observations, {X_art.shape[1]} features')
print(f'Digits dataset: {X_dig.shape[0]} observations, {X_dig.shape[1]} features')

Artifitial dataset: 2000 observations, 500 features
Digits dataset: 6000 observations, 5000 features


In [4]:
X_art = (X_art - X_art.mean(0)) / X_art.std()


In [5]:
std = X_dig.std(0)
X_dig = X_dig[:, std != 0]
X_dig = (X_dig - X_dig.mean(0)) / std[std != 0]


In [6]:
X_art_train, X_art_test, y_art_train, y_art_test = sklearn.model_selection.train_test_split(
    X_art, y_art, test_size=0.33, random_state=42)

X_dig_train, X_dig_test, y_dig_train, y_dig_test = sklearn.model_selection.train_test_split(
    X_dig, y_dig, test_size=0.33, random_state=42)

# ElasticNet

In [14]:
elasticnet = ElasticNetCV(
    l1_ratio=[0.05, 0.1, 0.2, 0.3, 0.5, 0.6, 0.9, 0.95, 0.99, 1],
    verbose=1
)

In [15]:
elasticnet.fit(X_art_train, y_art_train.flatten())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
.........................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.2min finished


ElasticNetCV(l1_ratio=[0.05, 0.1, 0.2, 0.3, 0.5, 0.6, 0.9, 0.95, 0.99, 1],
             verbose=1)

In [16]:
elasticnet.l1_ratio_, elasticnet.alpha_

(1.0, 0.06543605558779328)

In [22]:
lasso = Lasso(alpha=0.06)
lasso.fit(X_art_train, y_art_train)

Lasso(alpha=0.06)

In [23]:
pred = np.ones(X_art_test.shape[0])
pred[lasso.predict(X_art_test) < 0] = -1

In [24]:
(pred == y_art_test).mean()

0.5848484848484848

In [31]:
art_lasso_selection = lasso.coef_ != 0

# Trees

In [121]:
X_dig_train.shape

(4020, 4955)

In [130]:
trees = ExtraTreesClassifier(n_estimators=500, criterion='gini', min_samples_split=10,
                    max_features=300)

In [131]:
trees.fit(X_art_train, y_art_train)

ExtraTreesClassifier(max_features=300, min_samples_split=10, n_estimators=500)

In [133]:
pred = np.ones(X_art_test.shape[0])
pred[trees.predict(X_art_test) < 0] = -1

In [134]:
(pred == y_art_test).mean()

0.8363636363636363

In [135]:
px.histogram(trees.feature_importances_)

In [137]:
mask = trees.feature_importances_ > 0.005
mask.sum()

20

In [141]:
trees_better = ExtraTreesClassifier(n_estimators=100, criterion='gini', max_depth=3,
                    max_features=5)

trees_better.fit(X_art_train[:,mask], y_art_train)
pred = np.ones(X_art_test.shape[0])
pred[trees.predict(X_art_test) < 0] = -1
(pred == y_art_test).mean()


0.8363636363636363

## SVM

In [176]:
svm = SVC(C=1, kernel='rbf', gamma=0.1)
svm.fit(X_art_train[:, mask], y_art_train)

pred = np.ones(X_art_test.shape[0])
pred[svm.predict(X_art_test[:, mask]) < 0] = -1
(pred == y_art_test).mean()


0.906060606060606

In [183]:
svm = SVC(C=100, kernel='rbf', gamma=0.01)
svm.fit(X=X_art_train, y=y_art_train)

pred = np.ones(X_art_test.shape[0])
pred[svm.predict(X_art_test) < 0] = -1
(pred == y_art_test).mean()

0.6272727272727273

In [178]:
importance = permutation_importance(svm, X=X_art_test[:, mask], y=y_art_test, n_repeats=2)

In [179]:
px.histogram(importance['importances_mean'])

In [40]:
mask = importance['importances_mean'] > 0.005
sum(mask)

9

In [41]:
svm = SVC(C=100, kernel='rbf', gamma=0.01)
svm.fit(X_art_train[:, mask], y_art_train)
pred = np.ones(X_art_test.shape[0])
pred[svm.predict(X_art_test[:, mask]) < 0] = -1
(pred == y_art_test).mean()

0.8590909090909091

# RFE

In [39]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFE

In [51]:
gradboost = GradientBoostingClassifier(n_estimators=500, max_depth=10, max_features='auto')
gradboost.fit(X_art_train, y_art_train)
pred = np.ones(X_art_test.shape[0])
pred[gradboost.predict(X_art_test) < 0] = -1
(pred == y_art_test).mean()

0.7651515151515151

In [52]:
gradboost = GradientBoostingClassifier(n_estimators=500, max_depth=10, max_features='auto')
rfe = RFE(gradboost, n_features_to_select=20, step=50, verbose=1)
rfe.fit(X_art_train, y_art_train)

Fitting estimator with 500 features.
Fitting estimator with 450 features.
Fitting estimator with 400 features.
Fitting estimator with 350 features.
Fitting estimator with 300 features.
Fitting estimator with 250 features.
Fitting estimator with 200 features.
Fitting estimator with 150 features.
Fitting estimator with 100 features.
Fitting estimator with 50 features.


RFE(estimator=GradientBoostingClassifier(max_depth=10, max_features='auto',
                                         n_estimators=500),
    n_features_to_select=20, step=50, verbose=1)

In [53]:
art_rfe_selection = rfe.ranking_ == 1

In [54]:
pred = np.ones(X_art_test.shape[0])
pred[rfe.estimator_.predict(X_art_test[:, art_rfe_selection]) < 0] = -1
(pred == y_art_test).mean()

0.7757575757575758

# Boruta

In [57]:
from boruta import BorutaPy

In [59]:
trees = ExtraTreesClassifier(n_estimators=100, criterion='gini', min_samples_split=10,
                    max_features='auto')
feat_selector = BorutaPy(trees, n_estimators='auto', verbose=2, random_state=1)
feat_selector.fit(X_art_train, y_art_train)
feat_selector.support_
feat_selector.ranking_
# X_filtered = feat_selector.transform(X)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	479
Iteration: 	9 / 100
Confirmed: 	13
Tentative: 	8
Rejected: 	479
Iteration: 	10 / 100
Confirmed: 	13
Tentative: 	8
Rejected: 	479
Iteration: 	11 / 100
Confirmed: 	13
Tentative: 	8
Rejected: 	479
Iteration: 	12 / 100
Confirmed: 	15
Tentative: 	6
Rejected: 	479
Iteration: 	13 / 100
Confirmed: 	15
Tentative: 	6
Rejected: 	479
Iteration: 	14 / 100
Confirmed: 	15
Tentative: 	6
Rejected: 	479
Iteration: 	15 / 100
Confirmed: 	15
Tentative: 	6
Rejected: 	479
Iteration: 	16 / 100
Confirmed: 	16
Tenta

array([105, 316, 453, 435,  62,  53, 210, 341, 196,  87,  57,  72,  41,
       299, 165, 180,  24, 348,  19, 107, 348, 267, 432, 306, 177, 361,
        97, 385,   1, 280, 351, 468,  78, 220, 429,  75, 180, 440,  83,
       299, 416, 411, 336,  59, 228, 394,   1, 312,   1, 247, 169, 185,
       354,  14, 263, 110,  29, 387, 481, 293,  61, 476, 445, 268,   1,
       297, 343, 175, 220, 146, 400, 428, 397, 309,  65, 237, 225,  98,
        55, 116, 259, 118, 238, 241, 126, 201, 173, 327, 252, 452, 383,
       227, 365, 365, 245, 317, 426,  74, 388, 275, 210, 353,  60, 400,
       306,   1, 234, 467, 304, 148, 159, 173, 286, 402, 178, 236,  91,
        84, 460, 245, 165, 190, 337, 260,  76,  27, 201, 216,   1, 477,
        35, 163, 403, 301, 216, 310,   9,  99, 161, 208, 111, 308, 285,
       327, 248, 206, 112,  45,  68,  58, 266, 360, 231,   1, 233, 171,
        24, 220, 424, 379, 179,  30, 439, 161, 155,  53, 102,  93, 272,
       480, 385, 331, 127,  31, 183,  86, 151,   5, 408, 379,  6

In [66]:
trees = ExtraTreesClassifier(n_estimators=1000, criterion='gini', min_samples_split=10,
                    max_features='auto')
trees.fit(X_art_train[:, feat_selector.support_], y_art_train)
pred = np.ones(X_art_test.shape[0])
pred[trees.predict(X_art_test[:, feat_selector.support_]) < 0] = -1
(pred == y_art_test).mean()

0.8863636363636364

In [67]:
svm = SVC(C=1, kernel='rbf', gamma=0.1)
svm.fit(X_art_train[:, feat_selector.support_], y_art_train)

pred = np.ones(X_art_test.shape[0])
pred[svm.predict(X_art_test[:, feat_selector.support_]) < 0] = -1
(pred == y_art_test).mean()


0.8878787878787879