In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# tsfresh
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.distribution import MultiprocessingDistributor

In [2]:
total = ['Total_PorosityQuantity', 'Total_PorosityQuality', 'Total_UnfilledZones', 'Total_FillingQuality', 'TOTAL_QUALITY']

path = 'data'
all_files = glob(path + '/*.csv')
df_li = list()
target = list()

for id, filename in enumerate(all_files[:10]):
    df = pd.read_csv(filename, index_col=None, header=0)
    target.append(df.TOTAL_QUALITY.unique()[0])
    df = df.drop(axis=1, columns=total)
    df['id'] = id    
    df_li.append(df)    

df = pd.concat(df_li)

target = DataFrame(target, columns=['target'])
df.shape

(8686, 283)

### EXTRACT FEATURES

In [3]:
# TSFRESH MULTIPROCESSING API DOC (single thread feature extraction too slow.....)
# https://tsfresh.readthedocs.io/en/latest/text/parallelization.html#parallelization-of-feature-extraction
# https://tsfresh.readthedocs.io/en/latest/text/tsfresh_on_a_cluster.html
distributor = MultiprocessingDistributor(n_workers=8, disable_progressbar=False, progressbar_title="Feature Extraction")

In [4]:
# TSFRESH FEATURE EXTRACTION API DOC

#extracted_features = extract_features(df, column_id='id', column_sort='Time', distributor=distributor)
#extracted_features.to_csv('temp/extracted_features.csv')

extracted_features = pd.read_csv('temp/extracted_features.csv')
extracted_features

Unnamed: 0,id,Flow rate__abs_energy,Flow rate__absolute_sum_of_changes,"Flow rate__agg_autocorrelation__f_agg_""mean""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""median""__maxlag_40","Flow rate__agg_autocorrelation__f_agg_""var""__maxlag_40","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","Flow rate__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""",...,Zone_ids__symmetry_looking__r_0.9,Zone_ids__symmetry_looking__r_0.9500000000000001,Zone_ids__time_reversal_asymmetry_statistic__lag_1,Zone_ids__time_reversal_asymmetry_statistic__lag_2,Zone_ids__time_reversal_asymmetry_statistic__lag_3,Zone_ids__value_count__value_-1,Zone_ids__value_count__value_0,Zone_ids__value_count__value_1,Zone_ids__variance,Zone_ids__variance_larger_than_standard_deviation
0,0,1.031236e-07,6.2e-05,0.0,0.0,0.0,1.2e-05,-0.243467,-4.506888e-08,1.958974e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.042159e-07,6.4e-05,0.0,0.0,0.0,1.2e-05,-0.253883,-4.662547e-08,1.926688e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,9.353593e-08,7.9e-05,0.0,0.0,0.0,1.3e-05,-0.410989,-7.622757e-08,1.802454e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1.05719e-07,6.3e-05,0.0,0.0,0.0,1.3e-05,-0.402628,-7.429241e-08,1.790357e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1.099151e-07,5e-05,0.0,0.0,0.0,1.2e-05,-0.196922,-3.750376e-08,2.049529e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,1.057093e-07,5.2e-05,0.0,0.0,0.0,1.3e-05,-0.340799,-6.834559e-08,2.021362e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,1.083024e-07,4.5e-05,0.0,0.0,0.0,1.2e-05,-0.241732,-4.837696e-08,2.131524e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,1.0295e-07,5.8e-05,0.0,0.0,0.0,1.2e-05,-0.31083,-5.933411e-08,1.956447e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,1.031331e-07,5.3e-05,0.0,0.0,0.0,1.2e-05,-0.300814,-5.927074e-08,2.038152e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,1.046665e-07,6.3e-05,0.0,0.0,0.0,1.2e-05,-0.241462,-4.406175e-08,1.920699e-08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
type(extracted_features)

pandas.core.frame.DataFrame

In [6]:
extracted_features = extracted_features.dropna(axis=1)
extracted_features = extracted_features.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
extracted_features.shape

(10, 214369)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(extracted_features, target, test_size=.3)

In [8]:
rf_model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
print(classification_report(y_test, rf_model.predict(X_test)))

  """Entry point for launching an IPython kernel.
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

