# Feature Selection

### Load the data from file and split it into features and target

In [2]:
import numpy as np
import pandas as pd

# Load the extracted features and keep only fully usable columns:
#   1. drop every column that contains a NaN,
#   2. turn +/-inf into NaN and drop the columns affected by that,
#   3. discard leftover "Unnamed..." columns (index columns written by an
#      earlier to_csv call).
df = (
    pd.read_csv('extracted_data.csv')
    .dropna(axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

df.head()

Unnamed: 0,Flow rate_Time_standard_error_mean,Flow rate_Time_signal_energy,Flow rate_Time_maximum,Flow rate_Time_minimum,Flow rate_Time_skewness,Flow rate_Time_kurtosis,Flow rate_Time_standard_deviation,Flow rate_Time_variance,Flow rate_Time_length,Flow rate_Time_mean,...,Constant_Values_Zone30_K1,Constant_Values_Zone30_K2,Constant_Values_Zone30_K3,Constant_Values_Zone30_tfilling,Constant_Values_Zone_ids.27,Constant_Values_Zone31_K1,Constant_Values_Zone31_K2,Constant_Values_Zone31_K3,Constant_Values_Zone31_tfilling,target
0,2.012151e-07,1.121846e-09,8e-06,7.38504e-15,-0.015488,-0.882549,2e-06,3.603388e-12,90,3e-06,...,4e-11,5e-11,2e-12,5.08464,31.0,4e-11,5e-11,2e-12,7.14719,1
1,2.373305e-07,1.128794e-09,8e-06,2.9836e-31,1.402138,0.113374,3e-06,6.871743e-12,123,2e-06,...,4e-11,5e-11,2e-12,5.17018,31.0,4e-11,5e-11,2e-12,7.22421,1
2,2.11718e-07,1.990315e-09,8e-06,7.66226e-70,0.239625,-1.605099,3e-06,6.813326e-12,153,2e-06,...,4e-11,5e-11,2e-12,5.14247,31.0,4e-11,5e-11,2e-12,7.26887,1
3,2.112434e-07,2.1596e-09,7e-06,1.3860199999999999e-65,-0.004838,-1.931155,3e-06,6.827436e-12,154,3e-06,...,4e-11,5e-11,2e-12,4.5496,31.0,4e-11,5e-11,2e-12,6.32605,1
4,1.877035e-07,1.281446e-09,8e-06,8.17846e-80,0.409115,-0.739532,2e-06,4.721168e-12,135,2e-06,...,4e-11,5e-11,2e-12,5.08549,31.0,4e-11,5e-11,2e-12,6.76541,1


In [3]:
# The last column holds the label; every column before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [4]:
# Count how many samples fall into each target class (class-balance check).
y.value_counts()

1    5049
0    4951
Name: target, dtype: int64

### Automatic scoring function

In [5]:
#code
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, f1_score

# Fixed random state passed to the random forests and to train_test_split in
# the scoring helpers below, so repeated runs give the same scores.
seed = 0

def automatic_scoring_cv(X, y):
    """Score a feature set with a cross-validated random forest.

    A 100-tree ``RandomForestClassifier`` (random state ``seed``) is evaluated
    with 5-fold cross-validation using ROC AUC, running the folds on all
    available cores.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_val_score``.
    y : target vector passed straight to ``cross_val_score``.

    Returns
    -------
    str
        ``'cv + roc_auc: '`` followed by the mean ROC AUC over the folds.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=seed)
    fold_scores = cross_val_score(
        estimator=forest,
        X=X,
        y=y,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
    )
    mean_auc = fold_scores.mean()
    return 'cv + roc_auc: ' + str(mean_auc)

def automatic_scoring_tt(X, y):
    """Score a feature set with a single stratified train/test split.

    The data is split 67 % / 33 % (stratified on ``y``, random state ``seed``),
    a 100-tree ``RandomForestClassifier`` is fitted on the training part and
    its predictions on the held-out part are scored with F1.

    Parameters
    ----------
    X : feature matrix.
    y : target vector; also used for stratification.

    Returns
    -------
    str
        ``'tt + f1_score: '`` followed by the F1 score on the test part.
    """
    split = train_test_split(X, y, test_size=0.33, random_state=seed, stratify=y)
    X_train, X_test, y_train, y_test = split

    forest = RandomForestClassifier(n_estimators=100, random_state=seed)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    return 'tt + f1_score: ' + str(f1_score(y_test, predictions))

In [6]:
# Baseline: cross-validated score on all features that survived the cleaning.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8879755236404773'

### 1. Feature selector that removes all low-variance features.

In [7]:
#code
from sklearn.feature_selection import VarianceThreshold

# Variance cut-off of 0.8 * (1 - 0.8) = 0.16; features whose variance does not
# exceed it are removed. The comparison is made on the raw, unscaled values.
selector = VarianceThreshold(threshold=0.8 * (1 - 0.8)).fit(X)

# Boolean support mask -> names of the surviving columns.
columns = X.columns[selector.get_support()]
X = X.loc[:, columns]

X.head()

Unnamed: 0,Flow rate_Time_skewness,Flow rate_Time_kurtosis,Flow rate_Time_length,Flow rate_Time_cid_ce,Flow rate_Time_sample_entropy,Flow rate_Time_augmented_dickey_fuller_teststat,Flow rate_Time_augmented_dickey_fuller_usedlag,Flow rate_Time_large_standard_deviation_0.1,Flow rate_Time_large_standard_deviation_0.15000000000000002,Flow rate_Time_large_standard_deviation_0.2,...,Constant_Values_Zone9_tfilling,Constant_Values_Zone10_tfilling,Constant_Values_Zone15_tfilling,Constant_Values_Zone16_tfilling,Constant_Values_Zone17_tfilling,Constant_Values_Zone18_tfilling,Constant_Values_Zone23_tfilling,Constant_Values_Zone24_tfilling,Constant_Values_Zone26_tfilling,Constant_Values_Zone31_tfilling
0,-0.015488,-0.882549,90,2.338973,1.923687,-2.038672,1,True,True,True,...,8.71076,5.32217,5.40678,9.05436,8.91207,5.37182,5.37182,8.97084,7.14719,7.14719
1,1.402138,0.113374,123,1.562565,0.553437,-2.371455,4,True,True,True,...,9.01861,5.30301,4.9585,8.58738,9.03714,5.41979,5.35064,8.81792,7.0389,7.22421
2,0.239625,-1.605099,153,2.938788,0.96705,-2.120462,0,True,True,True,...,8.20194,4.84727,5.44917,9.15164,8.58509,1.57392,5.41989,9.09192,6.83842,7.26887
3,-0.004838,-1.931155,154,1.703203,0.925972,-1.170086,0,True,True,True,...,9.47419,5.7124,4.83943,8.25596,9.39794,5.74066,4.96893,8.13939,7.23515,6.32605
4,0.409115,-0.739532,135,3.353975,0.974327,-0.588509,5,True,True,True,...,9.34483,5.48449,4.9795,8.77925,9.38939,5.59861,5.52508,8.839,7.33478,6.76541


In [8]:
# Score again after the low-variance filter.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8852018839298559'

### 2. Feature selector that checks the correlation between each pair of features. If the correlation is high, one of the two features is removed.

In [9]:
#code
import numpy as np

def correl(data, threshold, absolute=False):
    """Drop features that are highly correlated with an earlier feature.

    The pairwise correlation matrix (``DataFrame.corr`` defaults) is computed
    over the numeric/boolean columns of ``data``. A column is removed when its
    correlation with *any* column positioned before it is ``>= threshold``;
    the earlier column of such a pair is never removed because of that pair.
    Note that the comparison also counts earlier columns that were themselves
    removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table. Columns that are not numeric or boolean take no part in
        the correlation and are always kept.
    threshold : float
        Correlation value at or above which the later column of a pair is
        considered redundant.
    absolute : bool, default False
        When True the absolute correlation is compared, so strongly
        anti-correlated features are treated as redundant too. The default
        keeps the signed comparison.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving columns in their original order.
    """
    from pandas.api.types import is_numeric_dtype

    # Track the *positions* of the columns that enter the correlation so the
    # resulting mask can be mapped back onto `data` even when some columns are
    # non-numeric or column labels are duplicated.
    numeric_pos = np.flatnonzero([is_numeric_dtype(dtype) for dtype in data.dtypes])
    corr = data.iloc[:, numeric_pos].corr()
    print('Corr calculated')

    values = corr.values
    if absolute:
        values = np.abs(values)

    # Strict upper triangle: entry (i, j) with i < j pairs column j with an
    # earlier column i. NaN correlations (e.g. constant columns) compare False,
    # so they never trigger a removal.
    redundant = np.triu(values >= threshold, k=1).any(axis=0)

    keep = np.ones(data.shape[1], dtype=bool)
    keep[numeric_pos[redundant]] = False
    return data.iloc[:, np.flatnonzero(keep)]

# Remove the later feature of every pair whose correlation is >= 0.9.
X = correl(X, 0.9)

X.head()

Corr calculated


Unnamed: 0,Flow rate_Time_skewness,Flow rate_Time_length,Flow rate_Time_sample_entropy,Flow rate_Time_augmented_dickey_fuller_teststat,Flow rate_Time_augmented_dickey_fuller_usedlag,Flow rate_Time_large_standard_deviation_0.1,Flow rate_Time_large_standard_deviation_0.15000000000000002,Flow rate_Time_large_standard_deviation_0.2,Flow rate_Time_large_standard_deviation_0.25,Flow rate_Time_large_standard_deviation_0.30000000000000004,...,Constant_Values_Zone9_tfilling,Constant_Values_Zone10_tfilling,Constant_Values_Zone15_tfilling,Constant_Values_Zone16_tfilling,Constant_Values_Zone17_tfilling,Constant_Values_Zone18_tfilling,Constant_Values_Zone23_tfilling,Constant_Values_Zone24_tfilling,Constant_Values_Zone26_tfilling,Constant_Values_Zone31_tfilling
0,-0.015488,90,1.923687,-2.038672,1,True,True,True,False,False,...,8.71076,5.32217,5.40678,9.05436,8.91207,5.37182,5.37182,8.97084,7.14719,7.14719
1,1.402138,123,0.553437,-2.371455,4,True,True,True,True,True,...,9.01861,5.30301,4.9585,8.58738,9.03714,5.41979,5.35064,8.81792,7.0389,7.22421
2,0.239625,153,0.96705,-2.120462,0,True,True,True,True,True,...,8.20194,4.84727,5.44917,9.15164,8.58509,1.57392,5.41989,9.09192,6.83842,7.26887
3,-0.004838,154,0.925972,-1.170086,0,True,True,True,True,True,...,9.47419,5.7124,4.83943,8.25596,9.39794,5.74066,4.96893,8.13939,7.23515,6.32605
4,0.409115,135,0.974327,-0.588509,5,True,True,True,True,False,...,9.34483,5.48449,4.9795,8.77925,9.38939,5.59861,5.52508,8.839,7.33478,6.76541


In [10]:
# Score again after the correlation filter.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.9070809713810641'

### 3. Select features according to a percentile of the highest scores (mutual_info_classif)

In [11]:
from functools import partial

from sklearn.feature_selection import SelectPercentile, mutual_info_classif

# mutual_info_classif is randomised (it adds small noise to continuous
# features), so bind it to the notebook-wide seed; otherwise the selected
# feature set can change from run to run.
seeded_mutual_info = partial(mutual_info_classif, random_state=seed)

# Keep the 10 % of features with the highest mutual information with the target.
selperc = SelectPercentile(seeded_mutual_info, percentile=10)
X_features = selperc.fit_transform(X, y)

# Translate the boolean support mask back into column names.
columns = np.asarray(X.columns.values)
support = np.asarray(selperc.get_support())
columns_with_support = columns[support]

X = X[columns_with_support]
X.head()

Unnamed: 0,Flow rate_Time_skewness,Flow rate_Time_sample_entropy,Flow rate_Time_large_standard_deviation_0.1,Flow rate_Time_large_standard_deviation_0.15000000000000002,Flow rate_Frequency_signal_energy,Flow rate_Frequency_skewness,Zone7_Pressure_Time_cid_ce,Zone9_Pressure_Time_median,Zone9_Pressure_Time_percentile_95,Zone15_Pressure_Time_augmented_dickey_fuller_teststat,...,Zone18_Pressure_Time_median,Zone23_Pressure_Time_augmented_dickey_fuller_teststat,Zone23_Pressure_Time_augmented_dickey_fuller_usedlag,Zone24_Pressure_Time_kurtosis,Zone24_Pressure_Time_sample_entropy,Zone24_Pressure_Time_augmented_dickey_fuller_usedlag,Zone24_Pressure_Frequency_skewness,Zone30_Pressure_Time_augmented_dickey_fuller_teststat,Constant_Values_Zone16_tfilling,Constant_Values_Zone24_tfilling
0,-0.015488,1.923687,True,True,1.929465e-07,6.022285,2.559244,63948.2,74837.14,-0.255487,...,62204.0,-1.924673,1,-0.573096,2.047256,4,6.631168,-1.757511,9.05436,8.97084
1,1.402138,0.553437,True,True,1.754497e-07,3.987482,1.929611,55132.9,59955.07,-2.090178,...,45323.0,-2.106598,4,0.582974,1.766882,4,5.913858,-2.218144,8.58738,8.81792
2,0.239625,0.96705,True,True,3.511431e-07,6.220548,4.163208,53513.2,96443.46,-3.147783,...,40277.4,-3.154167,1,-1.383036,1.58884,1,6.875512,-2.523897,9.15164,9.09192
3,-0.004838,0.925972,True,True,3.876618e-07,6.390315,2.331688,16559.45,58867.045,-1.355158,...,46045.6,-1.316847,1,-0.863062,1.415317,1,6.412217,-1.276491,8.25596,8.13939
4,0.409115,0.974327,True,True,2.228542e-07,6.012997,3.785949,13740.3,65015.82,-2.734639,...,45465.6,-2.657794,1,-0.113447,1.469364,5,7.730912,-2.678957,8.77925,8.839


In [12]:
# Score the feature subset kept by the percentile selection.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8959568083739656'

In [13]:
# Re-attach the label before saving. assign() builds a new frame instead of
# writing into X, which came from a column sub-selection and could otherwise
# trigger pandas' SettingWithCopyWarning.
X = X.assign(target=y)

# NOTE(review): the row index is written as an extra unnamed first column;
# readers of this file presumably strip it the way the loading cell does for
# '^Unnamed' columns — confirm before switching to index=False.
X.to_csv('selected_data.csv')