In [1]:
from ZMiner import *
from ZMinerD import *
from utils import *
from ZHist import *

In [2]:
# Inputs passed to load_data
# - Data file: must be an .xlsx workbook (same format as the original data).
# - Metadata columns: the columns that are not histograms.
#   See the example synthetic data for the expected layout.
#   * These columns are hard requirements: the dataset must contain all of them.

data = load_data(
    "synthetic_turbo_failure.xlsx",
    ['no', 'time', 'snap', 'date', 'status'],
)

  a = data.drop(meta, axis=1)


In [3]:
# Preview the first rows of the loaded frame (histogram columns followed by the metadata columns).
data.head()

Unnamed: 0_level_0,A,A,A,A,A,A,A,A,A,A,...,J,J,J,J,J,no,time,snap,date,status
Unnamed: 0_level_1,A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,J5,J6,J7,J8,J9,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4,11,67,187,476,569,468,240,60,18,...,472,293,130,40,14,0,2100,0,2100,0
1,16,96,450,1154,1781,1771,1125,487,107,13,...,1834,1125,420,76,15,0,7000,1,9100,0
2,20,130,499,1121,1491,1106,471,135,24,3,...,1171,611,194,37,6,0,5000,2,14100,0
3,4,30,109,299,530,499,331,147,41,10,...,477,295,124,30,5,0,2000,3,16100,0
4,8,21,40,56,81,82,53,40,11,8,...,87,43,15,8,1,0,400,4,16500,0


In [4]:
# ZHist configuration -- parameters that matter
# - data: the DataFrame returned by load_data
# - drop_cols: the metadata columns, i.e. the columns that are not histograms
# - distance: histogram distance function used to compute the relative location (default: "chi2")
# - normal_term: the histograms are interpolated and then re-split at regular steps so that
#                consecutive histograms are separated by the same time gap (default: 300)
#   - These time units are artificial
# - n_bins: number of event labels to create (default: 3)
# - remove_events: event labels excluded from the pattern mining phase
# - isStandardized: whether the histograms are already standardized. If they are not, the algorithm
#                   first standardizes them to zero mean and unit standard deviation (default: False)

# ZHist parameters that currently have no effect
# - minTrend: minimum duration required to create a trend (default: 300, **NOT USED**)
# - matrix_cols: columns containing two-dimensional histograms (not implemented)
# - minimal_trend_gradient = 0 (a trend counts as "increasing" or "decreasing" when the absolute
#   value of its derivative exceeds this threshold)
# - removeMatrix = False (not implemented)
# - removeTrend = True

z = ZHist(
    data,
    ['no', 'time', 'snap', 'date', 'status'],
    normal_term=300,
    n_bins=3,
    remove_events=['b'],
)

  self.data_simplified = data.drop(drop_cols, axis=1)


In [5]:
# Run Z-Hist on the configured data. The log printed by this cell reports an interval
# creation stage (one "current:" line per histogram variable) followed by an interval separation stage.
z.fit()

getWeightedAverage: 0.3333090841770172
interval creation started
current: A
current: B
current: C
current: D
current: E
current: F
current: G
current: H
current: I
current: J
interval creation is done: 245.78264617547393
interval separation started
interval separation is done: 0.009128373116254807


In [None]:
# Example of the event intervals created by Z-Hist (uncomment the line below to display them)
#z.repair_intervals_removed

In [7]:
# Wrap the event intervals in the Database class that the ZMiner
# (frequent pattern mining) algorithm recognizes.

# This has to be done for both interval sets, which `z` exposes as:
# - repair_intervals_removed: faulty set (status == 1)
# - normal_intervals_removed: normal set (status == 0)
database = Database(z.repair_intervals_removed)

# Constraint values for the pattern mining algorithms, in list order.
# The log printed by this cell reports minSup 0.1 as 10.0 for 100 e-sequences,
# so it appears to be a fraction of the set size.
constraints = makeConstraints(
    [
        0.1,         # minSup: minimum support a pattern needs in the dataset to be considered for disproportionality
        0,           # epsilon: flexibility parameter (default: 0)
        1000,        # gap: maximum time gap still treated as a 'follows' relation
        2000000000,  # timeout: timeout for the algorithm
        3,           # level: maximum size of the patterns
    ],
    z.repair_intervals_removed,
)

# Run the pattern mining algorithm on the faulty set.
# - forgettable: controls whether the location of each pattern in the dataset is remembered (default: True).
#   * Locations are only needed to check where a pattern actually occurs. When only the frequencies
#     or the disproportionality are of interest, location tracking should be turned off.
#   * NOTE(review): confirm which value of the flag disables location tracking.
algorithm = ZMiner(database, constraints, forgettable=True)

# FL_repair receives the pattern frequencies mined from the faulty set.
count, freq, timedelta, timeout, FL_repair = algorithm.ZMiner()

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 10.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 1000.0
1-4. TIMEOUT: 2000000000
1-5. LEVEL: 3.0
2. NUMBER OF E-SEQUENCES: 100
3. TOTAL COMPARISON COUNTS: 731625
4. TOTAL FREQUENT ARRANGEMENTS: 7668
5. TOTAL TIME CONSUMED: 6.771270743999992


In [8]:
# Mine the normal set (status == 0). ZMinerD additionally receives FL_repair,
# the result mined from the faulty set.
database2 = Database(z.normal_intervals_removed)
constraints2 = makeConstraints(
    [
        0.1,        # minSup
        0,          # epsilon
        1000,       # gap
        # NOTE(review): one digit shorter than the 2000000000 used for the faulty set -- confirm this is intended.
        200000000,  # timeout
        3,          # level
    ],
    z.normal_intervals_removed,
)
algorithm2 = ZMinerD(database2, constraints2, FL_repair, forgettable=True)

# FL_normal receives the pattern frequencies mined from the normal set.
count2, freq2, tdelta2, timeout2, FL_normal = algorithm2.ZMiner()

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 90.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 1000.0
1-4. TIMEOUT: 200000000
2. NUMBER OF E-SEQUENCES: 900
3. TOTAL COMPARISON COUNTS: 1750200
4. TOTAL FREQUENT ARRANGEMENTS: 725
5. TOTAL TIME CONSUMED: 18.59606928000005


In [23]:
# Compute the disproportionality of each pattern and save the result to a CSV file.
# Arguments, in order:
# - filename, faulty set, normal set, size of faulty set, size of normal set, constraints used
# NOTE(review): 100 and 900 are hard-coded; they match the "NUMBER OF E-SEQUENCES" values
# printed by the two mining runs in this notebook and must be updated if the data changes.
exportDisprop(
    "result",
    FL_repair,
    FL_normal,
    100,
    900,
    constraints2,
)

'Disprop_result_0.1_0.0_1000.0_200000000.csv'