# Feature Selection

### Load the data from file and split it into features and target

In [1]:
#code
import numpy as np
import pandas as pd

# Load the extracted per-signal statistics.
df = pd.read_csv('extracted_data.csv')

# Treat +/-inf as missing and drop every column that has a missing value.
# The result must be assigned back: the earlier form
# `df.replace(...).dropna(axis=1, inplace=True)` only modified the temporary
# frame returned by `replace`, so columns containing inf stayed in `df`.
df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# Remove 'Unnamed: ...' columns (presumably a saved index from an earlier
# to_csv — verify) and every column whose name contains 'Time'.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=df.filter(regex='Time').columns)

df.head()

Unnamed: 0,Flow rate_variance,Flow rate_minimum,Flow rate_mean,Flow rate_abs_energy,Flow rate_kurtosis,Flow rate_count_below_mean,Flow rate_count_above_mean,Flow rate_skewness,Flow rate_standard_deviation,Flow rate_maximum,...,Zone31_Pressure_minimum,Zone31_Pressure_mean,Zone31_Pressure_abs_energy,Zone31_Pressure_kurtosis,Zone31_Pressure_count_below_mean,Zone31_Pressure_count_above_mean,Zone31_Pressure_skewness,Zone31_Pressure_standard_deviation,Zone31_Pressure_maximum,Target
0,2.133409e-11,1.15034e-15,1e-05,1.031236e-07,-0.568486,418,439,-0.240613,5e-06,1.8e-05,...,0.0,9210.803547,297252200000.0,1.178,619,238,1.579773,16186.820421,59057.0,1
1,2.105e-11,1.04767e-17,1e-05,1.042159e-07,-0.507464,422,439,-0.257104,5e-06,1.8e-05,...,0.0,8966.502441,285749800000.0,1.30134,623,238,1.610864,15858.219212,58896.3,0
2,2.315912e-11,4.5085e-14,9e-06,9.353593e-08,-0.766407,478,414,0.0,5e-06,1.7e-05,...,0.0,8674.867948,287078600000.0,2.6417,644,248,1.845702,15702.983351,67384.0,1
3,2.573246e-11,3.18465e-74,9e-06,1.027388e-07,-0.712796,412,491,-0.284421,5e-06,1.8e-05,...,0.0,7849.956178,216683600000.0,1.590384,625,278,1.642973,13354.320973,51343.5,0
4,2.3088e-11,2.22183e-117,9e-06,1.049617e-07,-0.504741,411,528,-0.300985,5e-06,1.8e-05,...,0.0,16532.734285,1018608000000.0,3.568572,654,285,1.985958,28485.923782,136121.0,0


In [2]:
# Positional split: every column but the last is a feature, the last column
# is the label (shown as `Target` in the preview above).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [3]:
# Class balance of the label: how many samples fall in each class.
y.value_counts()

1    5055
0    4959
Name: Target, dtype: int64

### Automatic scoring function

In [4]:
#code
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, f1_score

# Single random seed shared by the train/test split and the random forests
# in the scoring helpers below.
seed = 0

def automatic_scoring_cv(X, y):
    """Score a feature set with 5-fold cross-validated ROC AUC.

    A 100-tree random forest (seeded with the module-level `seed`) is
    evaluated with `cross_val_score` on all available cores, and the mean of
    the fold scores is reported.

    Parameters
    ----------
    X : feature matrix passed to `cross_val_score`.
    y : target labels passed to `cross_val_score`.

    Returns
    -------
    str
        The text 'cv + roc_auc: ' followed by the mean fold score.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=seed)
    fold_scores = cross_val_score(forest, X, y, scoring='roc_auc', cv=5, n_jobs=-1)
    return 'cv + roc_auc: ' + str(fold_scores.mean())

def automatic_scoring_tt(X, y):
    """Score a feature set with F1 on a single stratified hold-out split.

    33% of the rows are held out (stratified on `y`, seeded with the
    module-level `seed`); a 100-tree random forest is fitted on the rest and
    its hold-out predictions are scored with `f1_score`.

    Parameters
    ----------
    X : feature matrix to split.
    y : target labels to split and stratify on.

    Returns
    -------
    str
        The text 'tt + f1_score: ' followed by the hold-out F1 score.
    """
    split = train_test_split(X, y, test_size=0.33, random_state=seed, stratify=y)
    X_fit, X_holdout, y_fit, y_holdout = split

    forest = RandomForestClassifier(n_estimators=100, random_state=seed)
    forest.fit(X_fit, y_fit)

    holdout_f1 = f1_score(y_holdout, forest.predict(X_holdout))
    return 'tt + f1_score: ' + str(holdout_f1)

In [5]:
# Baseline: cross-validated ROC AUC before any feature selection step.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8727620272692256'

In [6]:
# Baseline: hold-out F1 before any feature selection step.
automatic_scoring_tt(X, y)

'tt + f1_score: 0.8134196586227193'

### 1. Feature selector that removes all low-variance features.

In [7]:
#code
from sklearn.feature_selection import VarianceThreshold

# Keep only features whose variance exceeds 0.8 * (1 - 0.8) = 0.16.
# NOTE(review): the features are not rescaled first, so this cutoff depends
# on each column's units — confirm that is intended.
selector = VarianceThreshold(threshold=(.8 * (1 - .8))).fit(X)

# Boolean support mask -> names of the surviving columns.
columns = X.columns[selector.get_support()]
X = X[columns]

X.head()

Unnamed: 0,Flow rate_kurtosis,Flow rate_count_below_mean,Flow rate_count_above_mean,Zone2_Pressure_variance,Zone2_Pressure_minimum,Zone2_Pressure_mean,Zone2_Pressure_abs_energy,Zone2_Pressure_kurtosis,Zone2_Pressure_count_below_mean,Zone2_Pressure_count_above_mean,...,Zone31_Pressure_variance,Zone31_Pressure_minimum,Zone31_Pressure_mean,Zone31_Pressure_abs_energy,Zone31_Pressure_kurtosis,Zone31_Pressure_count_below_mean,Zone31_Pressure_count_above_mean,Zone31_Pressure_skewness,Zone31_Pressure_standard_deviation,Zone31_Pressure_maximum
0,-0.568486,418,439,394030700.0,0.0,11750.302707,456010000000.0,0.272192,616,241,...,262013200.0,0.0,9210.803547,297252200000.0,1.178,619,238,1.579773,16186.820421,59057.0
1,-0.507464,422,439,451262900.0,0.0,12231.770021,517356900000.0,0.856194,628,233,...,251483100.0,0.0,8966.502441,285749800000.0,1.30134,623,238,1.610864,15858.219212,58896.3
2,-0.766407,478,414,552785700.0,0.0,12851.580796,640410300000.0,3.919326,643,249,...,246583700.0,0.0,8674.867948,287078600000.0,2.6417,644,248,1.845702,15702.983351,67384.0
3,-0.712796,412,491,301025300.0,0.0,11200.702371,385112300000.0,-0.402086,625,278,...,178337900.0,0.0,7849.956178,216683600000.0,1.590384,625,278,1.642973,13354.320973,51343.5
4,-0.504741,411,528,208480000.0,0.0,8308.334753,260580400000.0,1.078892,680,259,...,811447900.0,0.0,16532.734285,1018608000000.0,3.568572,654,285,1.985958,28485.923782,136121.0


In [8]:
# Cross-validated ROC AUC after the variance-threshold filter.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8733230760380968'

In [9]:
# Hold-out F1 after the variance-threshold filter.
automatic_scoring_tt(X, y)

'tt + f1_score: 0.8048780487804877'

### 2. Feature selector that checks the pairwise correlation between features. If two features are highly correlated (correlation ≥ 0.9), the later of the two is removed.

In [10]:
#code
import numpy as np

def correl(data, threshold, absolute=False):
    """Drop columns that are highly correlated with an earlier column.

    The correlation matrix of `data` is computed with `DataFrame.corr()`.
    A column is removed when its correlation with ANY column positioned
    before it is >= `threshold`. The earlier column counts even if it is
    itself removed, so the outcome depends on column order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are candidate features. It is not modified.
    threshold : float
        Correlation at or above which the later column of a pair is dropped.
    absolute : bool, default False
        If False (the original behaviour) only positive correlations are
        compared against `threshold`, so strongly negatively correlated
        pairs are both kept. If True the absolute value is compared.

    Returns
    -------
    pandas.DataFrame
        `data` without the redundant columns, original column order kept.
    """
    corr = data.corr()
    print('Corr calculated')

    strength = corr.values
    if absolute:
        strength = np.abs(strength)

    # Strict upper triangle: entry (i, j) with i < j is True when column j is
    # too similar to the earlier column i. NaN correlations compare False.
    too_similar = np.triu(strength >= threshold, k=1)

    # Drop by the labels of the correlation matrix itself rather than by
    # position in `data`, so the two cannot fall out of alignment.
    redundant = corr.columns[too_similar.any(axis=0)]
    return data.drop(columns=redundant)

# Remove features whose correlation with an earlier column is >= 0.9.
# This rebinds X, so re-running the cell filters the already-filtered frame.
X = correl(X, 0.9)

X.head()

Corr calculated


Unnamed: 0,Flow rate_kurtosis,Flow rate_count_below_mean,Flow rate_count_above_mean,Zone2_Pressure_minimum,Zone2_Pressure_mean,Zone2_Pressure_count_below_mean,Zone2_Pressure_count_above_mean,Zone2_Pressure_skewness,Zone3_Pressure_minimum,Zone3_Pressure_mean,...,Zone27_Pressure_count_below_mean,Zone28_Pressure_count_below_mean,Zone29_Pressure_mean,Zone29_Pressure_count_below_mean,Zone29_Pressure_count_above_mean,Zone30_Pressure_count_below_mean,Zone30_Pressure_skewness,Zone31_Pressure_minimum,Zone31_Pressure_count_below_mean,Zone31_Pressure_count_above_mean
0,-0.568486,418,439,0.0,11750.302707,616,241,1.372814,-400.365,28008.127502,...,508,467,42545.788926,481,376,538,0.951473,0.0,619,238
1,-0.507464,422,439,0.0,12231.770021,628,233,1.520189,-399.633,26053.879321,...,505,470,42491.830557,483,378,536,0.930888,0.0,623,238
2,-0.766407,478,414,0.0,12851.580796,643,249,2.041241,-367.513,28349.11483,...,515,428,44405.985179,468,424,555,1.029921,0.0,644,248
3,-0.712796,412,491,0.0,11200.702371,625,278,1.134353,-400.365,25882.85601,...,511,489,38899.036888,533,370,569,1.012065,0.0,625,278
4,-0.504741,411,528,0.0,8308.334753,680,259,1.536761,-290.705,22078.289114,...,524,506,47824.389478,531,408,586,0.989202,0.0,654,285


In [11]:
# Cross-validated ROC AUC after the correlation filter.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8609831492590649'

In [12]:
# Hold-out F1 after the correlation filter.
automatic_scoring_tt(X, y)

'tt + f1_score: 0.7927294048666079'

### 3. Select features according to a percentile of the highest scores (mutual_info_classif)

In [13]:
from sklearn.feature_selection import SelectPercentile, mutual_info_classif

# Keep the 40% of features with the highest mutual information with the label.
# mutual_info_classif uses random numbers internally, so its random_state is
# pinned to the notebook-wide `seed`; without it the selected feature set can
# change from run to run.
# NOTE(review): the selector is fitted with `y` on all rows, and the scoring
# cells below evaluate on those same rows, so the scores after this step may
# be optimistic.
selperc = SelectPercentile(
    lambda features, target: mutual_info_classif(features, target, random_state=seed),
    percentile=40,
)
X_features = selperc.fit_transform(X, y)

# Boolean support mask -> names of the selected columns.
columns = X.columns[selperc.get_support()]

X = X[columns]
X.head()

Unnamed: 0,Flow rate_kurtosis,Zone3_Pressure_count_above_mean,Zone4_Pressure_count_above_mean,Zone5_Pressure_minimum,Zone6_Pressure_count_below_mean,Zone7_Pressure_minimum,Zone7_Pressure_count_below_mean,Zone7_Pressure_skewness,Zone9_Pressure_mean,Zone9_Pressure_kurtosis,...,Zone19_Pressure_count_below_mean,Zone22_Pressure_count_below_mean,Zone23_Pressure_count_below_mean,Zone24_Pressure_kurtosis,Zone24_Pressure_count_below_mean,Zone24_Pressure_count_above_mean,Zone24_Pressure_skewness,Zone26_Pressure_count_above_mean,Zone29_Pressure_count_above_mean,Zone31_Pressure_count_below_mean
0,-0.568486,330,364,-2038.22,543,-1057.16,618,1.588478,7272.84273,5.311404,...,434,440,560,5.16733,708,149,2.473288,247,376,619
1,-0.507464,336,371,-2038.22,541,-1040.04,624,1.623811,7826.292369,5.748566,...,434,443,557,5.368426,717,144,2.516752,247,378,623
2,-0.766407,331,388,-1964.6,572,-1006.84,658,1.841518,7669.570471,7.946121,...,446,504,595,11.189778,749,143,3.445868,288,424,644
3,-0.712796,378,364,-2038.22,582,-1032.48,625,1.68086,9924.194251,2.121459,...,413,486,590,17.256859,721,182,3.720973,272,370,625
4,-0.504741,334,330,-1898.18,604,-1380.96,674,2.272728,3717.180958,9.245656,...,494,549,609,7.990006,722,217,2.840404,314,408,654


In [14]:
# Cross-validated ROC AUC after the mutual-information percentile selection.
automatic_scoring_cv(X, y)

'cv + roc_auc: 0.8444465682632478'

In [15]:
# Hold-out F1 after the mutual-information percentile selection.
automatic_scoring_tt(X, y)

'tt + f1_score: 0.7892113749633539'

In [16]:
# Write the selected features plus the label to disk.
# The label is attached to a new frame instead of being inserted into X:
# mutating X in place would leave the label among the features, so any
# scoring cell re-run afterwards would train on the answer. The written file
# (column name 'target', index included) is the same as before.
selected = X.assign(target=y)
selected.to_csv('selected_data.csv')