# Feature Selection

### Load the data from file and save train/test data

In [2]:
#code
import numpy as np
import pandas as pd

df = pd.read_csv('tmp/tsfresh/extracted_data.csv')
df.dropna(axis=1, inplace=True)
df.replace([np.inf, -np.inf], np.nan).dropna(axis=1, inplace=True)
df.drop('id', axis=1, inplace=True)

df.head()

Unnamed: 0,Flow rate__abs_energy,Flow rate__absolute_sum_of_changes,"Flow rate__agg_autocorrelation__f_agg_""mean""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""median""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""var""__maxlag_40","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,Zone29_K1,Zone29_K2,Zone29_K3,Zone30_K1,Zone30_K2,Zone30_K3,Zone31_K1,Zone31_K2,Zone31_K3,target
0,8.43929e-08,7.4e-05,0.0,0.0,0.0,1.3e-05,-0.480982,-9.571038e-08,1.838974e-08,1.4e-05,...,8e-11,9e-11,3e-12,4e-11,5e-11,2e-12,4e-11,5e-11,2e-12,0
1,8.337982e-08,7.1e-05,0.0,0.0,0.0,1.3e-05,-0.616363,-1.284958e-07,1.649928e-08,1.5e-05,...,4e-11,5e-11,2e-12,4e-11,5e-11,2e-12,4e-11,5e-11,2e-12,0
2,1.032448e-07,5.9e-05,0.0,0.0,0.0,1.2e-05,-0.241002,-4.49837e-08,1.976525e-08,1.4e-05,...,4e-11,5e-11,2e-12,8e-11,9e-11,3e-12,4e-11,5e-11,2e-12,0
3,9.590627e-08,6.1e-05,0.0,0.0,0.0,1.3e-05,-0.52813,-1.032466e-07,1.703194e-08,1.5e-05,...,4e-11,5e-11,2e-12,8e-11,9e-11,3e-12,4e-11,5e-11,2e-12,1
4,9.772771e-08,6e-05,0.0,0.0,0.0,1.3e-05,-0.493841,-9.472176e-08,1.738857e-08,1.4e-05,...,8e-11,9e-11,3e-12,8e-11,9e-11,3e-12,8e-11,9e-11,3e-12,1


In [3]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [4]:
y.value_counts()

1    5055
0    4959
Name: target, dtype: int64

### Automatic sccoring function

In [5]:
#code
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, f1_score

seed = 0

def automatic_scoring_cv(X, y):
    rf_model = RandomForestClassifier(n_estimators=100, random_state=seed)
    average_score = cross_val_score(estimator=rf_model, X=X, y=y, cv=5, scoring='roc_auc', n_jobs=-1).mean()
    return 'cv + roc_auc: ' + str(average_score)

def automatic_scoring_tt(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed, stratify=y)
    rf_model = RandomForestClassifier(n_estimators=100, random_state=seed).fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    return 'tt + f1_score: ' + str(f1_score(y_test, y_pred))

In [6]:
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8981465997252126'

In [7]:
automatic_scoring_tt(X, y)

'tt + f1_score: 0.8166816681668169'

### 1. Feature selector that removes all low-variance features.

In [8]:
#code
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
selector.fit(X)
columns = X.columns[selector.get_support(indices=True)]
X = X[columns]

X.head()

Unnamed: 0,Flow rate__ar_coefficient__k_10__coeff_1,Flow rate__ar_coefficient__k_10__coeff_2,Flow rate__ar_coefficient__k_10__coeff_3,Flow rate__ar_coefficient__k_10__coeff_4,"Flow rate__augmented_dickey_fuller__attr_""usedlag""",Flow rate__cid_ce__normalize_True,Flow rate__count_above_mean,Flow rate__count_below_mean,"Flow rate__fft_aggregated__aggtype_""centroid""","Flow rate__fft_aggregated__aggtype_""kurtosis""",...,Zone9_Pressure__sum_of_reoccurring_data_points,Zone9_Pressure__sum_of_reoccurring_values,Zone9_Pressure__sum_values,Zone9_Pressure__symmetry_looking__r_0.1,Zone9_Pressure__symmetry_looking__r_0.15000000000000002,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_1,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_2,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_3,Zone9_Pressure__value_count__value_0,Zone9_Pressure__variance
0,1.097224,-0.119454,-0.018179,0.013075,1.0,2.979672,372.0,544.0,49.084752,8.469481,...,0.0,0.0,5125808.0,0.0,1.0,220032400000.0,432622100000.0,627574400000.0,749.0,188784300.0
1,1.157133,-0.161022,-0.005301,-0.040777,1.0,2.496999,380.0,629.0,51.389448,9.12482,...,0.0,0.0,17361410.0,0.0,0.0,983194200000.0,1959747000000.0,2929644000000.0,752.0,1132615000.0
2,1.076403,-0.057993,-0.032339,-0.005114,1.0,2.028811,425.0,435.0,32.345881,12.249977,...,0.0,0.0,4831920.0,1.0,1.0,415384100000.0,816260600000.0,1207333000000.0,706.0,247846700.0
3,1.083679,-0.072918,-0.031834,-0.004072,1.0,2.406142,425.0,538.0,46.956311,9.386179,...,0.0,0.0,11710680.0,0.0,0.0,457429000000.0,911627600000.0,1364312000000.0,700.0,566258200.0
4,1.050232,-0.047799,-0.0439,0.013567,1.0,2.456812,410.0,525.0,45.288313,9.376401,...,0.0,0.0,6048610.0,0.0,1.0,285422800000.0,562231500000.0,836442200000.0,689.0,219879900.0


In [9]:
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8934594566575715'

In [10]:
automatic_scoring_tt(X, y)

'tt + f1_score: 0.810924369747899'

### 2. Feature selector that checks correlation between 2 features. If correlation is high, 1 feature is removed.

In [11]:
#code
import numpy as np

def correl(data, threshold):
    corr = data.corr()
    print('Corr calculated')
    columns = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i+1, corr.shape[0]):
            if corr.iloc[i,j] >= threshold:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    return data

X = correl(X, 0.9)

X.head()

Corr calculated


Unnamed: 0,Flow rate__ar_coefficient__k_10__coeff_1,Flow rate__ar_coefficient__k_10__coeff_4,"Flow rate__augmented_dickey_fuller__attr_""usedlag""",Flow rate__cid_ce__normalize_True,Flow rate__count_above_mean,Flow rate__count_below_mean,"Flow rate__fft_aggregated__aggtype_""centroid""","Flow rate__fft_aggregated__aggtype_""kurtosis""","Flow rate__fft_coefficient__coeff_10__attr_""angle""","Flow rate__fft_coefficient__coeff_11__attr_""angle""",...,Zone9_Pressure__partial_autocorrelation__lag_9,Zone9_Pressure__quantile__q_0.7,Zone9_Pressure__range_count__max_1__min_-1,Zone9_Pressure__skewness,Zone9_Pressure__spkt_welch_density__coeff_2,Zone9_Pressure__spkt_welch_density__coeff_5,Zone9_Pressure__spkt_welch_density__coeff_8,Zone9_Pressure__sum_of_reoccurring_data_points,Zone9_Pressure__symmetry_looking__r_0.1,Zone9_Pressure__symmetry_looking__r_0.15000000000000002
0,1.097224,0.013075,1.0,2.979672,372.0,544.0,49.084752,8.469481,2.951076,168.159905,...,-0.002299,0.0,749.0,2.331234,941450500.0,31325780.0,14951980.0,0.0,0.0,1.0
1,1.157133,-0.040777,1.0,2.496999,380.0,629.0,51.389448,9.12482,-105.444874,177.279461,...,-0.015294,0.0,752.0,1.532461,5196042000.0,291949300.0,33340110.0,0.0,0.0,0.0
2,1.076403,-0.005114,1.0,2.028811,425.0,435.0,32.345881,12.249977,65.125545,-55.917921,...,-0.026681,0.0,706.0,2.915235,5852888.0,376989.5,46760.94,0.0,1.0,1.0
3,1.083679,-0.004072,1.0,2.406142,425.0,538.0,46.956311,9.386179,136.341265,-148.918332,...,-0.015482,0.0,700.0,1.60486,3360284000.0,80318150.0,26510420.0,0.0,0.0,0.0
4,1.050232,0.013567,1.0,2.456812,410.0,525.0,45.288313,9.376401,-161.838119,122.674278,...,-0.010105,0.0,689.0,2.391474,105701200.0,14036070.0,6431597.0,0.0,0.0,1.0


In [12]:
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8811538224504571'

In [13]:
automatic_scoring_tt(X, y)

'tt + f1_score: 0.8023529411764705'

In [14]:
X['target'] = y
X.to_csv('tmp/selected_data_1.csv')

### 3. Select features according to a percentile of the highest scores (mutual_info_classif)

In [24]:
import numpy as np
import pandas as pd

df = pd.read_csv('tmp/selected_data_1.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

df.head()

Unnamed: 0,Flow rate__ar_coefficient__k_10__coeff_1,Flow rate__ar_coefficient__k_10__coeff_4,"Flow rate__augmented_dickey_fuller__attr_""usedlag""",Flow rate__cid_ce__normalize_True,Flow rate__count_above_mean,Flow rate__count_below_mean,"Flow rate__fft_aggregated__aggtype_""centroid""","Flow rate__fft_aggregated__aggtype_""kurtosis""","Flow rate__fft_coefficient__coeff_10__attr_""angle""","Flow rate__fft_coefficient__coeff_11__attr_""angle""",...,Zone9_Pressure__quantile__q_0.7,Zone9_Pressure__range_count__max_1__min_-1,Zone9_Pressure__skewness,Zone9_Pressure__spkt_welch_density__coeff_2,Zone9_Pressure__spkt_welch_density__coeff_5,Zone9_Pressure__spkt_welch_density__coeff_8,Zone9_Pressure__sum_of_reoccurring_data_points,Zone9_Pressure__symmetry_looking__r_0.1,Zone9_Pressure__symmetry_looking__r_0.15000000000000002,target
0,1.097224,0.013075,1.0,2.979672,372.0,544.0,49.084752,8.469481,2.951076,168.159905,...,0.0,749.0,2.331234,941450500.0,31325780.0,14951980.0,0.0,0.0,1.0,0
1,1.157133,-0.040777,1.0,2.496999,380.0,629.0,51.389448,9.12482,-105.444874,177.279461,...,0.0,752.0,1.532461,5196042000.0,291949300.0,33340110.0,0.0,0.0,0.0,0
2,1.076403,-0.005114,1.0,2.028811,425.0,435.0,32.345881,12.249977,65.125545,-55.917921,...,0.0,706.0,2.915235,5852888.0,376989.5,46760.94,0.0,1.0,1.0,0
3,1.083679,-0.004072,1.0,2.406142,425.0,538.0,46.956311,9.386179,136.341265,-148.918332,...,0.0,700.0,1.60486,3360284000.0,80318150.0,26510420.0,0.0,0.0,0.0,1
4,1.050232,0.013567,1.0,2.456812,410.0,525.0,45.288313,9.376401,-161.838119,122.674278,...,0.0,689.0,2.391474,105701200.0,14036070.0,6431597.0,0.0,0.0,1.0,1


In [None]:
from sklearn.feature_selection import SelectPercentile, mutual_info_classif

selperc = SelectPercentile(mutual_info_classif, percentile=1)
X_features = selperc.fit_transform(X, y)
columns = np.asarray(X.columns.values)
support = np.asarray(selperc.get_support())
columns_with_support = columns[support]

X = X[columns_with_support]
X.head()

In [None]:
automatic_scoring_cv(X, y)

In [None]:
automatic_scoring_tt(X, y)

In [None]:
X['target'] = y
X.to_csv('tmp/selected_data_2.csv')

### 4. Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('tmp/selected_data_2.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

df.head()

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC

sel = RFECV(SVC(kernel="linear"), step=1, cv=5, n_jobs=-1)
sel = sel.fit(X, y)
columns = np.asarray(X.columns.values)
support = np.asarray(sel.get_support())
columns_with_support = columns[support]

In [None]:
X = X[columns_with_support]
X['target'] = y
X.to_csv('tmp/selected_data_3.csv')

In [None]:
automatic_scoring_cv(X, y)