# COMPOSITE PREPROCESSING

### Read extracted data

In [1]:
import pandas as pd

df = pd.read_csv('tmp/extracted_data.csv')
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X.shape

(200, 23111)

In [2]:
y.value_counts()

1    100
0    100
Name: target, dtype: int64

### Utilities

In [3]:
# code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

seed = 0
folds = 10
algorithm = RandomForestClassifier(n_estimators=100, random_state=seed)

def get_score(X, y):    
    # Cross Validation
    score = cross_val_score(algorithm, X, y, cv=folds).mean()
    print("Cross Validation Score: "+ str(score*100) + "%") 
    # Train-Test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    print(classification_report(y_test, y_pred))

### Delete Nan/Inf values

In [4]:
# code
import numpy as np

X.dropna(axis=1, inplace=True)
X.replace([np.inf, -np.inf], np.nan).dropna(axis=1, inplace=True)

X.head()

Unnamed: 0,id,Flow rate__abs_energy,Flow rate__absolute_sum_of_changes,"Flow rate__agg_autocorrelation__f_agg_""mean""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""median""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""var""__maxlag_40","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""",...,Zone28_K3,Zone29_K1,Zone29_K2,Zone29_K3,Zone30_K1,Zone30_K2,Zone30_K3,Zone31_K1,Zone31_K2,Zone31_K3
0,0,1.031236e-07,6.2e-05,0.0,0.0,0.0,1.2e-05,-0.243467,-4.506888e-08,1.958974e-08,...,2e-12,4e-11,5e-11,2e-12,4e-11,5e-11,2e-12,4e-11,5e-11,2e-12
1,2,9.353593e-08,7.9e-05,0.0,0.0,0.0,1.3e-05,-0.410989,-7.622757e-08,1.802454e-08,...,2e-12,4e-11,5e-11,2e-12,4e-11,5e-11,2e-12,4e-11,5e-11,2e-12
2,5,1.097562e-07,4.4e-05,0.0,0.0,0.0,1.4e-05,-0.491959,-1.091336e-07,2.024589e-08,...,3e-12,8e-11,9e-11,3e-12,8e-11,9e-11,3e-12,8e-11,9e-11,3e-12
3,11,9.292072e-08,6.3e-05,0.0,0.0,0.0,1.4e-05,-0.566665,-1.146758e-07,1.719813e-08,...,3e-12,4e-11,5e-11,2e-12,8e-11,9e-11,3e-12,8e-11,9e-11,3e-12
4,12,9.332236e-08,5.7e-05,0.0,0.0,0.0,1.4e-05,-0.517404,-1.066454e-07,1.859232e-08,...,2e-12,8e-11,9e-11,3e-12,8e-11,9e-11,3e-12,8e-11,9e-11,3e-12


In [5]:
get_score(X, y)

Cross Validation Score: 65.5%
              precision    recall  f1-score   support

           0       0.59      0.71      0.64        34
           1       0.60      0.47      0.53        32

    accuracy                           0.59        66
   macro avg       0.59      0.59      0.58        66
weighted avg       0.59      0.59      0.58        66



### Get most important features

In [6]:
# code
import eli5
import pandas as pd
from eli5.formatters.as_dataframe import format_as_dataframe
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y) 

explanation = eli5.explain_weights(rf, top=30)
explanation_df = format_as_dataframe(explanation)

important_features = pd.DataFrame()

for feature_id in explanation_df['feature']:
    n = int(feature_id.replace('x', ''))
    col = X.iloc[ : , n]
    important_features[col.name] = col.values   

X = important_features
X.head()   

Unnamed: 0,id,Zone16_Pressure__ratio_beyond_r_sigma__r_0.5,Zone26_Pressure__number_cwt_peaks__n_5,Zone4_Pressure__quantile__q_0.8,Zone7_Pressure__quantile__q_0.8,Zone31_Pressure__kurtosis,"Zone9_Pressure__fft_coefficient__coeff_31__attr_""imag""",Zone15_Pressure__kurtosis,Zone15_Pressure__quantile__q_0.7,Zone23_Pressure__skewness,...,"Zone31_Pressure__fft_coefficient__coeff_39__attr_""imag""",Zone24_Pressure__skewness,"Zone7_Pressure__fft_coefficient__coeff_0__attr_""real""",Zone17_Pressure__time_reversal_asymmetry_statistic__lag_1,"Zone17_Pressure__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.2","Zone31_Pressure__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","Zone2_Pressure__augmented_dickey_fuller__attr_""teststat""","Zone31_Pressure__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0",Zone26_Pressure__ratio_value_number_to_time_series_length,"Zone9_Pressure__fft_coefficient__coeff_30__attr_""imag"""
0,0,0.152859,26.0,90414.74,23210.18,1.178,201211.715148,0.180995,18795.44,1.207939,...,255785.698246,2.473288,8007426.292,477375900000.0,78.888668,135393.08443,-1.086534,85.208147,0.325554,244955.6795
1,2,0.09417,23.0,93441.16,21939.86,2.6417,180946.077666,1.282848,16444.71,1.487467,...,-98428.832501,3.445868,7359212.265,552242900000.0,80.815937,30438.276291,-2.317037,42.94978,0.382287,212275.353946
2,5,0.136808,33.0,43145.8,12085.7,1.567874,432280.949243,0.114999,11725.8,1.189575,...,70494.515982,3.155242,5205329.993,616169100000.0,82.022717,78916.292213,0.628782,50.660506,0.378936,399243.625839
3,11,0.112735,25.0,41329.9,13174.0,3.252763,207945.724593,2.822502,10678.11,1.811675,...,245829.624233,2.818438,6712071.851,608691100000.0,79.473041,27339.274344,0.894001,34.879843,0.41023,193371.775057
4,12,0.148026,24.0,38397.98,13821.24,-0.219816,274288.012114,0.325279,13051.98,1.08462,...,238733.932588,2.321429,5506196.489,483027800000.0,76.021515,23691.742285,2.030197,50.837662,0.33114,256213.407809


In [7]:
get_score(X, y)

Cross Validation Score: 76.0%
              precision    recall  f1-score   support

           0       0.82      0.79      0.81        34
           1       0.79      0.81      0.80        32

    accuracy                           0.80        66
   macro avg       0.80      0.80      0.80        66
weighted avg       0.80      0.80      0.80        66



### Delete features with low variance

In [8]:
# code
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.05)
selector.fit(X)
columns = X.columns[selector.get_support(indices=True)]
X = X[columns]
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]
X.drop('id', axis=1, inplace=True)

X.head()

Unnamed: 0,Zone26_Pressure__number_cwt_peaks__n_5,Zone4_Pressure__quantile__q_0.8,Zone7_Pressure__quantile__q_0.8,Zone31_Pressure__kurtosis,"Zone9_Pressure__fft_coefficient__coeff_31__attr_""imag""",Zone15_Pressure__kurtosis,Zone15_Pressure__quantile__q_0.7,Zone23_Pressure__skewness,"Zone17_Pressure__fft_coefficient__coeff_83__attr_""abs""","Zone9_Pressure__fft_coefficient__coeff_59__attr_""abs""",...,"Zone10_Pressure__fft_coefficient__coeff_88__attr_""abs""","Zone31_Pressure__fft_coefficient__coeff_39__attr_""imag""",Zone24_Pressure__skewness,"Zone7_Pressure__fft_coefficient__coeff_0__attr_""real""",Zone17_Pressure__time_reversal_asymmetry_statistic__lag_1,"Zone17_Pressure__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.2","Zone31_Pressure__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","Zone2_Pressure__augmented_dickey_fuller__attr_""teststat""","Zone31_Pressure__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","Zone9_Pressure__fft_coefficient__coeff_30__attr_""imag"""
0,26.0,90414.74,23210.18,1.178,201211.715148,0.180995,18795.44,1.207939,133115.566292,143901.657162,...,76492.559938,255785.698246,2.473288,8007426.292,477375900000.0,78.888668,135393.08443,-1.086534,85.208147,244955.6795
1,23.0,93441.16,21939.86,2.6417,180946.077666,1.282848,16444.71,1.487467,179547.807417,148662.893774,...,74394.719607,-98428.832501,3.445868,7359212.265,552242900000.0,80.815937,30438.276291,-2.317037,42.94978,212275.353946
2,33.0,43145.8,12085.7,1.567874,432280.949243,0.114999,11725.8,1.189575,106445.906595,207604.224556,...,126717.303704,70494.515982,3.155242,5205329.993,616169100000.0,82.022717,78916.292213,0.628782,50.660506,399243.625839
3,25.0,41329.9,13174.0,3.252763,207945.724593,2.822502,10678.11,1.811675,156796.688629,140329.72416,...,104570.553883,245829.624233,2.818438,6712071.851,608691100000.0,79.473041,27339.274344,0.894001,34.879843,193371.775057
4,24.0,38397.98,13821.24,-0.219816,274288.012114,0.325279,13051.98,1.08462,125427.616776,137615.504684,...,82506.913319,238733.932588,2.321429,5506196.489,483027800000.0,76.021515,23691.742285,2.030197,50.837662,256213.407809


In [9]:
get_score(X, y)

Cross Validation Score: 72.5%
              precision    recall  f1-score   support

           0       0.76      0.74      0.75        34
           1       0.73      0.75      0.74        32

    accuracy                           0.74        66
   macro avg       0.74      0.74      0.74        66
weighted avg       0.74      0.74      0.74        66



### Delete highly correlated featatures 
If 2 features are highly correlated, delete one.

In [10]:
# code
def correl(data, threshold):
    corr = data.corr()
    columns = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i+1, corr.shape[0]):
            if corr.iloc[i,j] >= threshold:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    return data

X = correl(X, 0.9)

X.head()

Unnamed: 0,Zone26_Pressure__number_cwt_peaks__n_5,Zone4_Pressure__quantile__q_0.8,Zone7_Pressure__quantile__q_0.8,Zone31_Pressure__kurtosis,"Zone9_Pressure__fft_coefficient__coeff_31__attr_""imag""",Zone15_Pressure__kurtosis,Zone15_Pressure__quantile__q_0.7,Zone23_Pressure__skewness,"Zone17_Pressure__fft_coefficient__coeff_83__attr_""abs""","Zone9_Pressure__fft_coefficient__coeff_59__attr_""abs""",...,"Zone6_Pressure__fft_aggregated__aggtype_""skew""","Zone14_Pressure__agg_linear_trend__f_agg_""mean""__chunk_len_50__attr_""stderr""","Zone10_Pressure__fft_coefficient__coeff_88__attr_""abs""","Zone31_Pressure__fft_coefficient__coeff_39__attr_""imag""",Zone24_Pressure__skewness,Zone17_Pressure__time_reversal_asymmetry_statistic__lag_1,"Zone31_Pressure__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","Zone2_Pressure__augmented_dickey_fuller__attr_""teststat""","Zone31_Pressure__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","Zone9_Pressure__fft_coefficient__coeff_30__attr_""imag"""
0,26.0,90414.74,23210.18,1.178,201211.715148,0.180995,18795.44,1.207939,133115.566292,143901.657162,...,2.586544,1112.369466,76492.559938,255785.698246,2.473288,477375900000.0,135393.08443,-1.086534,85.208147,244955.6795
1,23.0,93441.16,21939.86,2.6417,180946.077666,1.282848,16444.71,1.487467,179547.807417,148662.893774,...,3.478801,566.148649,74394.719607,-98428.832501,3.445868,552242900000.0,30438.276291,-2.317037,42.94978,212275.353946
2,33.0,43145.8,12085.7,1.567874,432280.949243,0.114999,11725.8,1.189575,106445.906595,207604.224556,...,2.215812,604.656515,126717.303704,70494.515982,3.155242,616169100000.0,78916.292213,0.628782,50.660506,399243.625839
3,25.0,41329.9,13174.0,3.252763,207945.724593,2.822502,10678.11,1.811675,156796.688629,140329.72416,...,1.888775,322.081596,104570.553883,245829.624233,2.818438,608691100000.0,27339.274344,0.894001,34.879843,193371.775057
4,24.0,38397.98,13821.24,-0.219816,274288.012114,0.325279,13051.98,1.08462,125427.616776,137615.504684,...,1.952721,304.019976,82506.913319,238733.932588,2.321429,483027800000.0,23691.742285,2.030197,50.837662,256213.407809


In [11]:
get_score(X, y)

Cross Validation Score: 70.0%
              precision    recall  f1-score   support

           0       0.74      0.68      0.71        34
           1       0.69      0.75      0.72        32

    accuracy                           0.71        66
   macro avg       0.71      0.71      0.71        66
weighted avg       0.71      0.71      0.71        66

