In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.linear_model import LassoCV
from boruta import BorutaPy

# Artificial dataset

# Data Preparation

In [2]:
# Load the training / validation feature files and the training labels, then
# min-max scale the features with a scaler fitted on the training set only.
N_FEATURES = 500

# Trim both files positionally to the first N_FEATURES columns so train and
# validation are handled the same way. Any column past that is presumably an
# empty one produced by the trailing separator on each row — TODO confirm.
x_train = pd.read_csv("data/artificial_train.data", header=None, sep=" ").iloc[:, :N_FEATURES]
x_val = pd.read_csv("data/artificial_valid.data", header=None, sep=" ").iloc[:, :N_FEATURES]

# Labels flattened to a 1-D array.
y_train = pd.read_csv("data/artificial_train.labels", header=None, sep=" ")
y_train = y_train.to_numpy().reshape(-1)

# The validation set is transformed with the training-set ranges, never refitted.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# Final classification and estimated error

## Boruta algorithm

In [3]:
# Boruta feature selection driven by a depth-limited, class-balanced random forest.
rf = RandomForestClassifier(
    max_depth=5,
    class_weight='balanced',
    n_jobs=-1,
)
feat_selector = BorutaPy(
    rf,
    n_estimators='auto',
    max_iter=5000,
    random_state=1,
    verbose=2,  # prints the confirmed / tentative / rejected counts every iteration
)
feat_selector.fit(x_train, y_train)

# Reduce both feature matrices to the columns the fitted selector keeps.
x_boruta, x_val_boruta = (feat_selector.transform(matrix) for matrix in (x_train, x_val))

Iteration: 	1 / 5000
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	2 / 5000
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	3 / 5000
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	4 / 5000
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	5 / 5000
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	6 / 5000
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	7 / 5000
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	8 / 5000
Confirmed: 	0
Tentative: 	22
Rejected: 	478
Iteration: 	9 / 5000
Confirmed: 	17
Tentative: 	5
Rejected: 	478
Iteration: 	10 / 5000
Confirmed: 	17
Tentative: 	5
Rejected: 	478
Iteration: 	11 / 5000
Confirmed: 	17
Tentative: 	5
Rejected: 	478
Iteration: 	12 / 5000
Confirmed: 	18
Tentative: 	4
Rejected: 	478
Iteration: 	13 / 5000
Confirmed: 	18
Tentative: 	4
Rejected: 	478
Iteration: 	14 / 5000
Confirmed: 	18
Tentative: 	4
Rejected: 	478
Iteration: 	15 / 5000
Confirmed: 	18
Tentative: 	4
Rejected: 	478
Iteration: 	16 / 5000
Conf

In [4]:
# 1-based numbers of the features flagged in Boruta's `support_` mask.
np.flatnonzero(feat_selector.support_) + 1

array([ 11,  29,  49,  65, 106, 129, 154, 205, 242, 282, 319, 337, 339,
       379, 434, 443, 452, 454, 473, 476, 494], dtype=int64)

In [5]:
# Boruta's rank for every input feature, in column order; in the output below
# the selected features carry rank 1.
feat_selector.ranking_

array([437, 267, 362, 448,   8, 118,  91, 469, 396, 445,   1, 262,  97,
        23, 404, 227, 257, 236, 130, 300, 441, 377, 443, 385, 209, 206,
        92, 270,   1, 145, 398, 244,  47, 143, 280, 292, 201, 358,  21,
       473, 424, 262, 134,  51,  48, 250,  86, 224,   1, 134, 293, 110,
       242, 328, 119,  25,  32, 272, 311, 103,  71, 293, 303, 430,   1,
       216, 160, 319, 119, 422, 349, 352, 241,  57, 115, 278, 402, 114,
       212, 113, 399,  87, 176, 214, 128, 125, 427, 279, 232, 181, 478,
       231, 366, 345, 369, 207, 435, 248, 203, 117, 191, 436, 397, 266,
       150,   1,  61, 265, 404, 110, 322,  76, 428, 434, 210, 284,  37,
       286, 290,  38, 455, 169, 426, 374, 228, 132, 367, 128,   1,  55,
        55,  74, 410, 440, 290, 168,  12,  44, 382, 147,  92, 320, 437,
       442, 219, 340, 313,  46, 275,  20, 149, 354,  75,   1, 467, 219,
        59, 339, 156, 148, 272, 219, 276, 258, 166, 296, 471, 341, 466,
       392, 305, 102, 222, 459, 242,  71, 381,  84, 400, 288,  5

## PCA

In [6]:
# Project the Boruta-selected features onto their leading principal components.
n_components = 8

# Depending on the data shape and scikit-learn version, PCA may pick a
# randomized SVD solver; the fixed seed keeps the projection reproducible.
pca = PCA(n_components=n_components, random_state=1)

# Fit on the training features only; the validation set is just projected.
pca.fit(x_boruta)

# NOTE: this rebinds x_train / x_val from the scaled raw features to the PCA
# scores, so every later cell that reads them sees the projected data.
x_train = pca.transform(x_boruta)
x_val = pca.transform(x_val_boruta)

## Performance

In [7]:
# Fit the final random forest on the whole training set. The fixed seed makes
# the fitted model, and anything predicted from it, reproducible across runs.
forest = RandomForestClassifier(random_state=1).fit(x_train, y_train)

# Resubstitution score: the forest is evaluated on the very rows it was fitted
# on, so this is an optimistic figure, not an estimate of generalisation error.
pred = forest.predict(x_train)

balanced_accuracy_score(y_train, pred)

1.0

In [8]:
# Class-membership probabilities for the validation set; column 1 is the
# probability of the class stored at forest.classes_[1].
val_proba = forest.predict_proba(x_val)
posterior_probs = val_proba[:, 1]

In [9]:
# Write the validation probabilities as a single 'MATSZY' column, without the row index.
submission = pd.DataFrame({'MATSZY': posterior_probs})
submission.to_csv('MATSZY_artificial_prediction.txt', index=False)

## 5-fold cross-validation

In [10]:
# k-fold cross-validation comparing logistic regression with a random forest
# by balanced accuracy.
# NOTE(review): Boruta and PCA were fitted on the full training set before this
# loop, so the held-out rows have already influenced the features used here and
# the scores below are likely optimistic — confirm whether that is acceptable.
n_folds = 5
n_samples = x_train.shape[0]
acc_reg = []
acc_forest = []
for fold in range(n_folds):
    # Interleaved split: every n_folds-th row, starting at `fold`, is held out.
    held_out = np.zeros(n_samples, dtype=bool)
    held_out[fold::n_folds] = True

    x_test, x_fold = x_train[held_out], x_train[~held_out]
    y_test, y_fold = y_train[held_out], y_train[~held_out]

    reg = LogisticRegression().fit(x_fold, y_fold)
    acc_reg.append(balanced_accuracy_score(y_test, reg.predict(x_test)))

    # Fold-local name: rebinding `forest` here would replace the model fitted on
    # the full training set, which the prediction cell reads.
    fold_forest = RandomForestClassifier(random_state=1).fit(x_fold, y_fold)
    acc_forest.append(balanced_accuracy_score(y_test, fold_forest.predict(x_test)))
print('Forest: ', acc_forest)
print('Logistic regression: ', acc_reg)

Forest:  [0.8282178590235463, 0.8525923330997898, 0.8344611528822055, 0.84998499849985, 0.8273486534470256]
Logistic regression:  [0.5917901652498809, 0.5855269742768492, 0.6499999999999999, 0.6377387738773878, 0.6146132879897976]


In [11]:
# Mean balanced accuracy of the forest across folds; dividing by the list
# length keeps this correct if the number of folds changes.
sum(acc_forest) / len(acc_forest)

0.8385209993904834

# Different tests

## Univariate feature selection

In [12]:
# Reload and rescale the raw data: x_train / x_val were rebound to the PCA
# projection earlier in the notebook, and the tests below start again from the
# full scaled feature set.
N_FEATURES = 500

# Trim both files positionally to the first N_FEATURES columns so train and
# validation are handled the same way. Any column past that is presumably an
# empty one produced by the trailing separator on each row — TODO confirm.
x_train = pd.read_csv("data/artificial_train.data", header=None, sep=" ").iloc[:, :N_FEATURES]
x_val = pd.read_csv("data/artificial_valid.data", header=None, sep=" ").iloc[:, :N_FEATURES]

# Labels flattened to a 1-D array.
y_train = pd.read_csv("data/artificial_train.labels", header=None, sep=" ")
y_train = y_train.to_numpy().reshape(-1)

# The validation set is transformed with the training-set ranges, never refitted.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

In [13]:
# Univariate selection: keep the 21 features with the highest chi-squared score.
kbest = SelectKBest(score_func=chi2, k=21)
x_kbest = kbest.fit_transform(x_train, y_train)

# 0-based positions of the kept columns and the score each one received.
vars_indexes = kbest.get_support(indices=True)
scores = kbest.scores_[vars_indexes]

In [14]:
# Boolean mask over all input features; True marks the columns SelectKBest kept.
kbest.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False,

In [15]:
# Save the selected features (as 1-based feature numbers) with their scores.
univariate_table = pd.DataFrame({
    'variable': vars_indexes + 1,
    'Univariate_selection': scores,
})
univariate_table.to_csv('Univariate_selection.csv')

## Sequential feature selection

Sequential feature selection is very slow. The `lasso` estimator used below can be replaced with `reg` (logistic regression) or `forest` (random forest).

In [None]:
# Forward sequential selection of 8 features using a cross-validated Lasso.
# The estimator is passed unfitted: SequentialFeatureSelector fits its own
# clones while searching, so fitting `lasso` up front only added an extra
# LassoCV fit whose coefficients were never used in this cell.
lasso = LassoCV()

sfs_forward = SequentialFeatureSelector(
    lasso,
    n_features_to_select=8,
    direction='forward',
).fit(x_train, y_train)

# Training matrix restricted to the features the forward search selected.
x_train_lasso1 = sfs_forward.transform(x_train)

In [None]:
# Keep the features with the largest absolute Lasso coefficients.
# The previous cut-off (11th-largest |coef| + 0.01) was presumably meant to keep
# the top 10, but it also dropped any top-10 feature whose |coef| was within
# 0.01 of the 11th. `max_features` expresses the top-k selection directly, and
# threshold=-np.inf disables the magnitude cut-off so the count alone decides.
n_lasso_features = 10
threshold = -np.inf
sfm = SelectFromModel(
    LassoCV(),
    threshold=threshold,
    max_features=n_lasso_features,
).fit(x_train, y_train)

# Reuse the model fitted inside the selector instead of fitting LassoCV twice.
lasso = sfm.estimator_
importance = np.abs(lasso.coef_)

x_train_lasso = sfm.transform(x_train)

## Removing features based on variance

In [31]:
# Remove low-variance features, using a variance threshold of 0.025.
variance_cutoff = 0.025
sel = VarianceThreshold(threshold=variance_cutoff)
sel.fit(x_train)
x_train_var = sel.transform(x_train)