In [1]:
import numpy as np
from glob import glob
import pandas as pd

files = glob('data/*.csv')
totals = ['Total_PorosityQuantity', 'Total_PorosityQuality', 'Total_UnfilledZones', 'Total_FillingQuality', 'TOTAL_QUALITY']

df_list = list()
target = list()

for i, file in enumerate(files[:100]):
    df = pd.read_csv(file)
    target.append(df.TOTAL_QUALITY.unique()[0])
    df = df.drop(axis=1, columns=totals)  
    df['id'] = i
    
    ts_cols = [col for col in df if col.endswith(('Time', 'id', 'Flow rate', 'Pressure'))]
    df_list.append(df[ts_cols])

df = pd.concat(df_list)
df.head()

Unnamed: 0,Time,Flow rate,Zone2_Pressure,Zone3_Pressure,Zone4_Pressure,Zone5_Pressure,Zone6_Pressure,Zone7_Pressure,Zone9_Pressure,Zone10_Pressure,...,Zone22_Pressure,Zone23_Pressure,Zone24_Pressure,Zone26_Pressure,Zone27_Pressure,Zone28_Pressure,Zone29_Pressure,Zone30_Pressure,Zone31_Pressure,id
0,0.065529,6.72733e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.094341,7.17004e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.116504,7.6141e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.1657,8.40451e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.186822,8.817e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [2]:
df.id.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=int64)

In [3]:
type(df.id.values)

numpy.ndarray

In [4]:
from tsfresh import extract_features
from tsfresh.utilities.distribution import MultiprocessingDistributor
distributor = MultiprocessingDistributor(n_workers=8, disable_progressbar=False, progressbar_title="Feature Extraction")
extracted_features = extract_features(df, column_id='id', column_sort='Time', distributor=distributor)
extracted_features.to_csv('temp/extracted_features.csv')

Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [45:39<00:00, 68.50s/it]


In [None]:
# Delete NaN/Inf values
extracted_features = extracted_features.dropna(axis=1)
extracted_features = extracted_features.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
        
extracted_features.to_csv('temp/extracted_features_cleaned.csv')

### UTILITIES

In [163]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

seed = 1
folds = 10

def get_classification_report(X, y):
    algorithm = RandomForestClassifier(n_estimators=100, random_state=seed)
    score = cross_val_score(algorithm, X, y, cv=folds).mean()
    print("SCORE: "+ str(score)) 

### READ EXTRACTED FEATURES FROM CSV

In [164]:
import pandas as pd

extracted_features = pd.read_csv('temp/extracted_features_cleaned.csv')
extracted_features.head()

Unnamed: 0.1,Unnamed: 0,id,Flow rate__abs_energy,Flow rate__absolute_sum_of_changes,"Flow rate__agg_autocorrelation__f_agg_""mean""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""median""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""var""__maxlag_40","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""",...,Zone9_Pressure__symmetry_looking__r_0.9,Zone9_Pressure__symmetry_looking__r_0.9500000000000001,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_1,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_2,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_3,Zone9_Pressure__value_count__value_-1,Zone9_Pressure__value_count__value_0,Zone9_Pressure__value_count__value_1,Zone9_Pressure__variance,Zone9_Pressure__variance_larger_than_standard_deviation
0,0,0,1.031236e-07,6.2e-05,0.0,0.0,0.0,1.2e-05,-0.243467,-4.506888e-08,...,1.0,1.0,341046200000.0,674152500000.0,1002176000000.0,0.0,691.0,0.0,367147200.0,1.0
1,1,1,1.042159e-07,6.4e-05,0.0,0.0,0.0,1.2e-05,-0.253883,-4.662547e-08,...,1.0,1.0,494710700000.0,980760500000.0,1460443000000.0,0.0,705.0,0.0,461223200.0,1.0
2,2,2,9.353593e-08,7.9e-05,0.0,0.0,0.0,1.3e-05,-0.410989,-7.622757e-08,...,1.0,1.0,82722060000.0,163441700000.0,244599100000.0,0.0,700.0,0.0,391203100.0,1.0
3,3,3,1.027388e-07,6.1e-05,0.0,0.0,0.0,1.3e-05,-0.381023,-7.325205e-08,...,1.0,1.0,491034500000.0,976517400000.0,1455599000000.0,0.0,692.0,0.0,492880800.0,1.0
4,4,4,1.049617e-07,6.8e-05,0.0,0.0,0.0,1.3e-05,-0.428986,-7.536972e-08,...,1.0,1.0,38466510000.0,75589870000.0,111560300000.0,0.0,747.0,0.0,104007100.0,1.0


### Generate first model and test

In [165]:
get_classification_report(extracted_features, target)

SCORE: 0.6873737373737374


### Delete features with low variance

In [166]:
# Removing features with low variance
from sklearn.feature_selection import VarianceThreshold

# sel = VarianceThreshold(threshold=0.05)
# extracted_features = pd.DataFrame(sel.fit_transform(extracted_features))
# extracted_features

sel = VarianceThreshold(threshold=0.05)
selector.fit(extracted_features)
columns = extracted_features.columns[selector.get_support(indices=True)]
extracted_features = extracted_features[columns]
extracted_features = extracted_features.loc[:, ~extracted_features.columns.str.contains('^Unnamed')]
extracted_features.drop('id', axis=1, inplace=True)
extracted_features.head()

Unnamed: 0,"Flow rate__augmented_dickey_fuller__attr_""teststat""","Flow rate__augmented_dickey_fuller__attr_""usedlag""",Flow rate__cid_ce__normalize_True,Flow rate__count_above_mean,Flow rate__count_below_mean,"Flow rate__fft_aggregated__aggtype_""centroid""","Flow rate__fft_aggregated__aggtype_""kurtosis""","Flow rate__fft_aggregated__aggtype_""skew""","Flow rate__fft_aggregated__aggtype_""variance""","Flow rate__fft_coefficient__coeff_10__attr_""angle""",...,Zone9_Pressure__sum_of_reoccurring_data_points,Zone9_Pressure__sum_of_reoccurring_values,Zone9_Pressure__sum_values,Zone9_Pressure__symmetry_looking__r_0.1,Zone9_Pressure__symmetry_looking__r_0.15000000000000002,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_1,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_2,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_3,Zone9_Pressure__value_count__value_0,Zone9_Pressure__variance
0,-0.917134,3.0,2.027651,439.0,418.0,31.771789,-5.207888,3.018917,5994.850747,128.443307,...,0.0,0.0,6232826.22,0.0,1.0,341046200000.0,674152500000.0,1002176000000.0,691.0,367147200.0
1,-0.829793,5.0,2.095977,439.0,422.0,32.024493,-5.110457,3.015963,6007.395536,118.192733,...,0.0,0.0,6738437.73,1.0,1.0,494710700000.0,980760500000.0,1460443000000.0,705.0,461223200.0
2,-1.476037,1.0,2.915139,414.0,478.0,43.9821,-5.713558,2.575471,8387.874541,-131.01667,...,0.0,0.0,6841256.86,1.0,1.0,82722060000.0,163441700000.0,244599100000.0,700.0,391203100.0
3,-0.961903,2.0,2.037617,491.0,412.0,35.702867,-5.395782,2.942215,6980.032859,-90.795881,...,0.0,0.0,8961547.409,0.0,1.0,491034500000.0,976517400000.0,1455599000000.0,692.0,492880800.0
4,-1.351336,4.0,2.862844,528.0,411.0,45.995221,-5.909401,2.586751,8835.415363,37.502317,...,0.0,0.0,3490432.92,1.0,1.0,38466510000.0,75589870000.0,111560300000.0,747.0,104007100.0


### Delete highly correlated featatures (if 2 features are highly correlated, delete one)

In [167]:
def correl(data, threshold):
    corr = data.corr()
    columns = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i+1, corr.shape[0]):
            if corr.iloc[i,j] >= threshold:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    return data


#def trimm_correlated(df_in, threshold):
#    df_corr = df_in.corr(method='pearson', min_periods=1)
#    df_not_correlated = ~(df_corr.mask(np.tril(np.ones([len(df_corr)]*2, dtype=bool))).abs() > threshold).any()
#     un_corr_idx = df_not_correlated.loc[df_not_correlated[df_not_correlated.index] == True].index
#     df_out = df_in[un_corr_idx]
#     return df_out
# extracted_features = trimm_correlated(extracted_features, 0.95)
extracted_features = correl(extracted_features, 0.9)
extracted_features.head()

Unnamed: 0,"Flow rate__augmented_dickey_fuller__attr_""teststat""","Flow rate__augmented_dickey_fuller__attr_""usedlag""",Flow rate__cid_ce__normalize_True,Flow rate__count_above_mean,Flow rate__count_below_mean,"Flow rate__fft_aggregated__aggtype_""kurtosis""","Flow rate__fft_aggregated__aggtype_""skew""","Flow rate__fft_coefficient__coeff_10__attr_""angle""","Flow rate__fft_coefficient__coeff_11__attr_""angle""","Flow rate__fft_coefficient__coeff_12__attr_""angle""",...,Zone9_Pressure__quantile__q_0.7,Zone9_Pressure__range_count__max_1__min_-1,Zone9_Pressure__ratio_beyond_r_sigma__r_0.5,Zone9_Pressure__spkt_welch_density__coeff_2,Zone9_Pressure__spkt_welch_density__coeff_5,Zone9_Pressure__spkt_welch_density__coeff_8,Zone9_Pressure__sum_of_reoccurring_data_points,Zone9_Pressure__symmetry_looking__r_0.1,Zone9_Pressure__symmetry_looking__r_0.15000000000000002,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_1
0,-0.917134,3.0,2.027651,439.0,418.0,-5.207888,3.018917,128.443307,-134.379691,163.932519,...,0.0,691.0,0.116686,25869760.0,270203.0,43036.16,0.0,0.0,1.0,341046200000.0
1,-0.829793,5.0,2.095977,439.0,422.0,-5.110457,3.015963,118.192733,-127.783891,149.197934,...,0.0,705.0,0.106852,8413668.0,467373.9,49606.34,0.0,1.0,1.0,494710700000.0
2,-1.476037,1.0,2.915139,414.0,478.0,-5.713558,2.575471,-131.01667,-130.032831,-6.859566,...,0.0,700.0,0.125561,10121020.0,307606.3,35225.84,0.0,1.0,1.0,82722060000.0
3,-0.961903,2.0,2.037617,491.0,412.0,-5.395782,2.942215,-90.795881,26.986556,132.363953,...,0.0,692.0,0.156146,4862382000.0,70620840.0,13688670.0,0.0,0.0,1.0,491034500000.0
4,-1.351336,4.0,2.862844,528.0,411.0,-5.909401,2.586751,37.502317,-160.207602,104.284825,...,0.0,747.0,0.123536,4376019000.0,41674720.0,51360180.0,0.0,1.0,1.0,38466510000.0


In [168]:
get_classification_report(extracted_features, target)

SCORE: 0.6782828282828284


In [169]:
extracted_features .to_csv('temp/extracted_features_cleaned_1.csv')