In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
import logging
import warnings
import multiprocessing
import sys

import datetime
from datetime import timezone 

import pickle

from matplotlib import pyplot as plt
from time import time
import pathlib
from pathlib import Path

import ydata_profiling

from tsfresh.feature_extraction import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import MinimalFCParameters

from pycaret.utils import enable_colab 
from pycaret.classification import *

from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from statsmodels.stats.multicomp import (pairwise_tukeyhsd, MultiComparison)

import logging
import os

In [None]:
# Only let ERROR-level (and above) records from the 'tsfresh' logger through.
logging.getLogger('tsfresh').setLevel(logging.ERROR)
# Ignore every warning from here on.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn - confirm this is intended.
warnings.simplefilter(action='ignore')

# Render matplotlib figures inline in the notebook, as PNG images.
%matplotlib inline
%config InlineBackend.figure_format = 'png'

In [None]:
# Multithread
jobs = multiprocessing.cpu_count()
print('\n\nMultithread: ', jobs)


#####################################################################################
# Data
#####################################################################################

data_path = Path('/home/pzeola/3W/data', '')
!pip freeze > /home/pzeola/3W/requirements.txt

# Human-readable label for each class code; the code is the position in the list
events_names = dict(enumerate([
    'Normal',
    'Abrupt Increase of BSW',
    'Spurious Closure of DHSV',
    'Severe Slugging',
    'Flow Instability',
    'Rapid Productivity Loss',
    'Quick Restriction in PCK',
    'Scaling in PCK',
    'Hydrate in Production Line',
]))

# Every class code except 0 ('Normal')
abnormal_classes_codes = [code for code in events_names if code != 0]

# Sensor variable columns, in the order they appear in each CSV file
vars = [
    'P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP',
    'T-JUS-CKP', 'P-JUS-CKGL', 'T-JUS-CKGL', 'QGL',
]

# Full expected CSV header: timestamp, the sensor variables, then the class label
csv_columns = ['timestamp', *vars, 'class']

#####################################################################################
# Feature Extraction parameters
#####################################################################################
# Scaler instance shared by normalize_and_extract_features (re-fitted on every window there)
scaler = preprocessing.StandardScaler()

# tsfresh calculator settings: start from MinimalFCParameters and register the
# calculators below, each with the value None (no parameters supplied).
df_fc_p = MinimalFCParameters()
df_fc_p.update(dict.fromkeys([
    'abs_energy',
    'mean_abs_change',
    'mean_change',
    'mean_second_derivative_central',
    'median',
    'mean',
    'standard_deviation',
    'variation_coefficient',
    'variance',
    'skewness',
    'kurtosis',
    'root_mean_square',
    'percentage_of_reoccurring_values_to_all_values',
    'percentage_of_reoccurring_datapoints_to_all_datapoints',
    'sample_entropy',
    'maximum',
    'minimum',
    'linear_trend_timewise',
]))
print('used features: {}'.format(list(df_fc_p.keys())))

#####################################################################################
# File input list
#####################################################################################
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    """Yield ``(class_code, csv_path)`` for the selected kinds of instance files.

    Every sub-directory of ``data_path`` is taken as one class; its name is
    converted with ``int()`` to obtain the class code. Inside each class
    directory only ``.csv`` files are considered, and each file is classified
    by the prefix of its stem:

    * ``SIMULATED...`` -> yielded when ``simulated`` is true
    * ``DRAWN...``     -> yielded when ``drawn`` is true
    * anything else    -> yielded when ``real`` is true

    Parameters
    ----------
    data_path : pathlib.Path
        Directory containing one sub-directory per class code.
    real, simulated, drawn : bool
        Which kinds of instances to include (all excluded by default).

    Raises
    ------
    ValueError
        If a sub-directory name cannot be converted to ``int``.
    """
    for class_dir in data_path.iterdir():
        if not class_dir.is_dir():
            continue
        class_code = int(class_dir.stem)
        for csv_file in class_dir.iterdir():
            if csv_file.suffix != '.csv':
                continue
            file_stem = csv_file.stem
            # Exactly one of the three flags decides whether this file is wanted
            if file_stem.startswith('SIMULATED'):
                wanted = simulated
            elif file_stem.startswith('DRAWN'):
                wanted = drawn
            else:
                wanted = real
            if wanted:
                yield class_code, csv_file
                        
#####################################################################################
# Load file into Dataframe
#####################################################################################                        
def load_instance(class_code, instance_path):
    """Read one instance CSV and return it with a simplified ``class`` column.

    Parameters
    ----------
    class_code : int
        Class code of the instance; it replaces every ``class`` value above 100.
    instance_path : pathlib.Path
        CSV file to read. Its stem must contain exactly one underscore.

    Returns
    -------
    pandas.DataFrame
        The file contents with columns equal to ``csv_columns``. In the
        ``class`` column, values greater than 100 are replaced by
        ``class_code`` and missing values by 0.

    Raises
    ------
    Exception
        Wraps any failure (bad file name, unreadable file, unexpected
        columns) with the offending path; the original error is chained.
    """
    try:
        # The unpacking fails (and is reported below) unless the stem has exactly one '_'
        _well, _instance_id = instance_path.stem.split('_')
        df = pd.read_csv(instance_path, sep=',', header=0)

        # Explicit check instead of `assert`: it survives `python -O`, and a
        # different number of columns yields this message rather than a
        # "Lengths must match" comparison error.
        found_columns = df.columns.tolist()
        if found_columns != list(csv_columns):
            raise ValueError('invalid columns in the file {}: {}'
                             .format(str(instance_path), str(found_columns)))

        # Disregard transient/fault diff and NaN
        df['class'] = np.where(df['class'] > 100, class_code, df['class'])
        # Assign the result back: in-place fillna on a column selection is not
        # guaranteed to reach the frame under pandas copy-on-write.
        df['class'] = df['class'].fillna(0)

        return df
    except Exception as e:
        raise Exception('error reading file {}: {}'.format(instance_path, e)) from e
        
#####################################################################################
# Feature Extraction
#####################################################################################  
def normalize_and_extract_features(window, label):
    """Impute, standardize and run tsfresh feature extraction on one window.

    Uses the notebook-level ``vars`` (sensor column names), ``scaler``
    (re-fitted on every call) and ``df_fc_p`` (tsfresh calculator settings).

    Parameters
    ----------
    window : numpy.ndarray
        2-D array of samples. Column 0 is used as the timestamp and the next
        ``len(vars)`` columns are read as the sensor variables. The first row
        is modified in place (see the workaround below).
    label
        Value stored in the ``class`` column of the result.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``extract_features`` with ``class`` inserted as
        the first column and the index reset.
    """
    # Workaround for NaN columns: every NaN in the first row (columns 1..end)
    # becomes 0, so no column handed to the imputer is entirely NaN.
    # NOTE(review): this also zeroes the first sample of columns that are only
    # partially missing - confirm that is acceptable.
    for col in range(1, window.shape[1]):
        if np.isnan(window[0, col]):
            window[0, col] = 0

    # NaN values replaced by the column average     ? is this ok ?
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

    # Normalizes the samples (zero mean and unit variance)
    sensor_values = window[:, 1:1 + len(vars)]
    normalized = scaler.fit_transform(imp_mean.fit_transform(sensor_values)).astype('float32')
    # Column names go through the constructor: set_axis(..., inplace=True)
    # no longer exists in pandas >= 2.0.
    df_norm = pd.DataFrame(normalized, columns=vars)
    df_norm.insert(loc=0, column='id', value=0)
    # Timestamps come from the window itself rather than from a notebook-level
    # frame, so the function only depends on its arguments for its data.
    df_norm.insert(loc=0, column='timestamp', value=window[:, 0])

    # Extracts features from samples
    extracted_data = extract_features(df_norm,
                                      column_id='id',
                                      column_sort='timestamp',
                                      default_fc_parameters=df_fc_p,
                                      impute_function=impute,
                                      n_jobs=0,
                                      disable_progressbar=True)
    extracted_data.insert(loc=0, column='class', value=label)
    extracted_data = extracted_data.reset_index(drop=True)
    return extracted_data

In [None]:
t0 = time()

# Gets all real/simulated/drawn instances (normal-operation-only instances are discarded below)
instances = pd.DataFrame(
    class_and_file_generator(data_path, real=True, simulated=True, drawn=True),
    columns=['class', 'instance_path'])

# Fault instances
instances = instances.loc[instances['class'].isin(abnormal_classes_codes)].reset_index(drop=True)
print('instances:' + str(len(instances)))

window_sizes = [500]  # several sizes may be listed, e.g. [150, 500, 600, ...]
offset = 150

# Defined up front so `data` exists even when `window_sizes` is empty
data = pd.DataFrame()

for window_size in window_sizes:

    # One extracted-feature frame per window. They are concatenated once per
    # window size: DataFrame.append was removed in pandas 2.0, and growing a
    # frame row by row is quadratic.
    feature_frames = []

    # Data ID (csv name)
    print('data-s' + str(window_size) + '-o' + str(offset))

    # For each instance with any type of undesirable event
    for i, row in instances.iterrows():

        # Loads the current instance
        class_code, instance_path = row
        print(' ')
        print('Instance {}: {} {}'.format(i+1, events_names[class_code], instance_path))
        # NOTE: keep the name `df` - other cells may read it as a notebook global
        df = load_instance(class_code, instance_path)

        # Slide a window of `window_size` rows in steps of `offset`. The label
        # is the class of the row right after the window, which is why the last
        # possible start position is left out. `w` keeps the instance index `i` intact.
        n_windows = int((len(df) - window_size) / offset)
        for w in range(n_windows):
            start = w * offset
            stop = start + window_size
            window = df.iloc[start:stop, :].values
            feature_frames.append(normalize_and_extract_features(window, df['class'][stop]))

        # Short progress line instead of printing the whole accumulated frame per window
        print('windows: {} (accumulated: {})'.format(max(n_windows, 0), len(feature_frames)))
        print(time()-t0)
        t0 = time()

    # The index is left as produced by each per-window frame, as the
    # row-by-row append did.
    data = pd.concat(feature_frames) if feature_frames else pd.DataFrame()

    # Current time as a POSIX timestamp. now(timezone.utc) returns an aware
    # datetime; relabelling a naive local time as UTC shifts the result by the
    # local UTC offset.
    utc_time = datetime.datetime.now(timezone.utc)
    utc_timestamp = utc_time.timestamp()

    # Save data as .csv
    data.to_csv('all_features.csv')

    path_x = '/home/pzeola/3W/extracted_features_data/all_features_data-s' + str(window_size) + '-o' + str(offset) + '.pkl'

    # Save binary data. The file is (over)written whether or not it already
    # exists; make sure the target folder is there so a long run does not fail
    # at the very end.
    Path(path_x).parent.mkdir(parents=True, exist_ok=True)
    data.to_pickle(path_x)

In [None]:
# Reload the extracted features written by the extraction cell.
# The context manager closes the file handle, which a bare open() never did.
# NOTE: unpickling can execute arbitrary code - only load files produced by this notebook.
with open('/home/pzeola/3W/extracted_features_data/all_features_data-s500-o150.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Number of rows per value of the 'class' column in the loaded feature table
data['class'].value_counts()

In [None]:
# drop_duplicates() returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the de-duplication to take effect.
data = data.drop_duplicates()
print(data.shape)
print(data['class'].value_counts())

In [None]:
# Peek at the column labels in positions 1..19 (position 0 is skipped)
columns = data.columns
preview_columns = columns[1:20]
print(len(preview_columns))
print(preview_columns)