In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from utils import *
from ZHist import *
from ZMiner import *
from ZMinerD import *
import pickle

We need to make two different pairs of two sets:
1. Create a normal set (1000) and an abnormal set (500)
2. Create a histogram variable with fixed number of bins and fixed number of time duration
 - fixed number of bins (10, 12, 14) 
     -> as we convert it to the distance from the central point, it does not affect much
 - fixed number of time duration (3 hours, 6 hours, 9 hours) for each snapshot
     -> it will work as a weight given to each snapshot of a specific bin when getting a central point
 - the number of snapshot -> the total time will be different time durations * number of snapshot
  for each bin $b$, $\sum_i^{s_b}{t_b(i)}$
  
3. logic
 - for a normal set: just follow a normal distribution
   - mean and std are randomly selected
 - for an abnormal set:
   - start from the same distribution with 
   - Markov simulation
     - for mean: multiply previous mean by 0.01/0.05/0.1 with the probability 0.5
     - for std: multiply previous std by 0.01/0.05/0.1 with the probability 0.5
   - we can change those two values to check the effect of both parameters.

### Create a synthetic dataset

In [2]:
def createSyntheticDataset(n_variables, bin_nos, durations, n_snapshots, n_normal, 
                          n_abnormal, prob_mean, prob_std, weight_mean, weight_std):
    # duration for each (car, snapshot)
    # need to be pre-defined to keep consistency
    selected_durations = {
        "normal": {},
        "abnormal": {},
    }
    normal_dfs = []
    abnormal_dfs = []
    multicolumns = []
    columns = []


    for n_car in range(n_normal):
        n_snap_selected = np.random.choice(n_snapshots, 1)[0]

        if n_car not in selected_durations["normal"]:
            selected_durations["normal"][n_car] = {}
        for n_snap in range(n_snap_selected):
            selected_durations["normal"][n_car][n_snap] = np.random.choice(durations, 1)[0]

    for n_car in range(n_abnormal):
        n_snap_selected = np.random.choice(n_snapshots, 1)[0]

        if n_car not in selected_durations["abnormal"]:
            selected_durations["abnormal"][n_car] = {}
        for n_snap in range(n_snap_selected):
            selected_durations["abnormal"][n_car][n_snap] = np.random.choice(durations, 1)[0]

    print("Snapshot & duration creation complete")

    # for each histogram parameters
    for n in range(n_variables):
        # create a variable with bins
        n_bin_selected = np.random.choice(bin_nos, 1)[0]
        bin_names = {b: chr(65+n)+str(b) for b in range(n_bin_selected)}

        ####
        # create columns

        for b in bin_names.values():
            multicolumns.append((chr(65+n), b))
            columns.append(b)

        ####
        print("For variable", n, ", the bins are", bin_names)
        print("=================================")
        print("Starting sampling...")
        
        
        # each histogram variable should have their own distribution
        # For simplicity, we start everything from the standard normal distribution
        # Those parameters can be changed for different simulation trials.
        h_val_mean = 0
        h_val_std = 1

        ####### choose number of snapshots it will have 
        initial_time = 0

        samples = {
            "normal": {},
            "abnormal": {},
        }

        total_samples_for_bin = []
        # for normal car
        for n_car in range(n_normal):
            # for each snapshot
            samples["normal"][n_car] = {}
            for n_snap in range(len(selected_durations["normal"][n_car])):
                #### random sampling
                # select a total duration (total sampling trials)

                duration_selected = selected_durations["normal"][n_car][n_snap]

                # random sample * duration time
                samples["normal"][n_car][n_snap] = [np.random.normal(h_val_mean, h_val_std) for _ in range(duration_selected)]
                total_samples_for_bin += samples["normal"][n_car][n_snap]
        # for abnormal car
        for n_car in range(n_abnormal):
            # for each snapshot
            samples["abnormal"][n_car] = {}
            # for abnormal group
            # every trial we need to correct the distribution
            h_val_mean_temp = h_val_mean
            h_val_std_temp = h_val_std

            for n_snap in range(len(selected_durations["abnormal"][n_car])):
                #### random sampling
                # select a total duration (total sampling trials)
                duration_selected = selected_durations["abnormal"][n_car][n_snap]

                samples_tmp = []

                for _ in range(duration_selected):
                    sample = np.random.normal(h_val_mean_temp, h_val_std_temp)
                    samples_tmp.append(sample)
                    #correct the distribution
                    # mean 
                    if np.random.choice([1, 0], p=[prob_mean, 1-prob_mean]):
                        h_val_mean_temp *= weight_mean
                    # std
                    if np.random.choice([1, 0], p=[prob_std, 1-prob_std]):
                        h_val_std_temp *= weight_std

                samples["abnormal"][n_car][n_snap] = samples_tmp
                total_samples_for_bin += samples["abnormal"][n_car][n_snap]

        # after finishing sampling (for each histogram variable), we need to create bins according to their values

        #concatenated = np.concatenate(list(samples["normal"][0].values())+list(samples["abnormal"][0].values()))
        out, bins = pd.qcut(total_samples_for_bin, n_bin_selected, retbins=True)
        del total_samples_for_bin #for memory stability

        # apply those bins to each normal/car and abnormal car

        for n_car in range(n_normal):
            normal_dict = []
            for n_snap in range(len(selected_durations["normal"][n_car])):
                bin_tmp = pd.cut(samples["normal"][n_car][n_snap], bins, labels=False)
                normal_dict.append(Counter(bin_tmp))
            normal_df = pd.DataFrame(normal_dict)
            normal_df.rename(columns=bin_names, inplace=True)
            if n == n_variables - 1:
                normal_df["no"] = n_car
                normal_df["time"] = selected_durations["normal"][n_car].values()
                normal_df["snap"] = range(len(selected_durations["normal"][n_car]))
                normal_df["date"] = normal_df["time"].cumsum()
            if n_car == 0:
                normal_df_concat = normal_df
            else:
                normal_df_concat = pd.concat((normal_df_concat, normal_df))
            normal_df_concat = normal_df_concat.loc[:, normal_df_concat.columns.notnull()]
        normal_dfs.append(normal_df_concat)

        for n_car in range(n_abnormal):
            abnormal_dict = []
            for n_snap in range(len(selected_durations["abnormal"][n_car])):
                bin_tmp = pd.cut(samples["abnormal"][n_car][n_snap], bins, labels=False)
                abnormal_dict.append(Counter(bin_tmp))
            abnormal_df = pd.DataFrame(abnormal_dict)
            abnormal_df.rename(columns=bin_names, inplace=True)
            if n == n_variables - 1:
                abnormal_df["no"] = n_normal + n_car
                abnormal_df["time"] = selected_durations["abnormal"][n_car].values()
                abnormal_df["snap"] = range(len(selected_durations["abnormal"][n_car]))
                abnormal_df["date"] = abnormal_df["time"].cumsum()
            if n_car == 0:
                abnormal_df_concat = abnormal_df
            else:
                abnormal_df_concat = pd.concat((abnormal_df_concat, abnormal_df))
            abnormal_df_concat = abnormal_df_concat.loc[:, abnormal_df_concat.columns.notnull()]
        abnormal_dfs.append(abnormal_df_concat)

    # clear for memory stability
    del samples
    normal_com = pd.concat(normal_dfs, axis=1, join='inner')
    abnormal_com = pd.concat(abnormal_dfs, axis=1, join='inner')
    normal_com['status'] = 0
    abnormal_com['status'] = 1
    
    # index modification
    data = pd.concat([normal_com, abnormal_com], axis=0).reset_index(drop=True)
    data_simplified = data[columns]
    data_simplified.columns = pd.MultiIndex.from_tuples(multicolumns)
    data_simplified[['no','time','snap','date','status']] = data[['no','time','snap','date','status']]
    
    return data_simplified

### Synthetic data 1: weight mean 1% increase with 30% prob. std fixed

In [107]:
param_syn1 = {
    "n_variables": 5, # number of histogram variables
    "bin_nos": [10, 12, 14], #will be randomly chosen
    "durations": [360, 720, 1080], #will be randomly chosen
    "n_snapshots": [10, 15, 20],
    "n_normal": 1000,
    "n_abnormal": 500,
    "prob_mean": 0.5,
    "prob_std": 0,
    "weight_mean": 1.01,
    "weight_std": 1.0
}

define columns

In [108]:
data1 = createSyntheticDataset(**param_syn1)

Snapshot & duration creation complete
For variable 0 , the bins are {0: 'A0', 1: 'A1', 2: 'A2', 3: 'A3', 4: 'A4', 5: 'A5', 6: 'A6', 7: 'A7', 8: 'A8', 9: 'A9', 10: 'A10', 11: 'A11'}
Starting sampling...
For variable 1 , the bins are {0: 'B0', 1: 'B1', 2: 'B2', 3: 'B3', 4: 'B4', 5: 'B5', 6: 'B6', 7: 'B7', 8: 'B8', 9: 'B9'}
Starting sampling...
For variable 2 , the bins are {0: 'C0', 1: 'C1', 2: 'C2', 3: 'C3', 4: 'C4', 5: 'C5', 6: 'C6', 7: 'C7', 8: 'C8', 9: 'C9', 10: 'C10', 11: 'C11'}
Starting sampling...
For variable 3 , the bins are {0: 'D0', 1: 'D1', 2: 'D2', 3: 'D3', 4: 'D4', 5: 'D5', 6: 'D6', 7: 'D7', 8: 'D8', 9: 'D9', 10: 'D10', 11: 'D11', 12: 'D12', 13: 'D13'}
Starting sampling...
For variable 4 , the bins are {0: 'E0', 1: 'E1', 2: 'E2', 3: 'E3', 4: 'E4', 5: 'E5', 6: 'E6', 7: 'E7', 8: 'E8', 9: 'E9', 10: 'E10', 11: 'E11', 12: 'E12', 13: 'E13'}
Starting sampling...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Synthetic data 2: weight mean 5% increase with 30% prob. std fixed

In [109]:
param_syn2 = {
    "n_variables": 5, # number of histogram variables
    "bin_nos": [10, 12, 14], #will be randomly chosen
    "durations": [360, 720, 1080], #will be randomly chosen
    "n_snapshots": [10, 15, 20],
    "n_normal": 1000,
    "n_abnormal": 500,
    "prob_mean": 0.5,
    "prob_std": 0,
    "weight_mean": 1.05,
    "weight_std": 1.0
}

In [110]:
data2 = createSyntheticDataset(**param_syn2)

Snapshot & duration creation complete
For variable 0 , the bins are {0: 'A0', 1: 'A1', 2: 'A2', 3: 'A3', 4: 'A4', 5: 'A5', 6: 'A6', 7: 'A7', 8: 'A8', 9: 'A9', 10: 'A10', 11: 'A11'}
Starting sampling...
For variable 1 , the bins are {0: 'B0', 1: 'B1', 2: 'B2', 3: 'B3', 4: 'B4', 5: 'B5', 6: 'B6', 7: 'B7', 8: 'B8', 9: 'B9', 10: 'B10', 11: 'B11', 12: 'B12', 13: 'B13'}
Starting sampling...
For variable 2 , the bins are {0: 'C0', 1: 'C1', 2: 'C2', 3: 'C3', 4: 'C4', 5: 'C5', 6: 'C6', 7: 'C7', 8: 'C8', 9: 'C9'}
Starting sampling...
For variable 3 , the bins are {0: 'D0', 1: 'D1', 2: 'D2', 3: 'D3', 4: 'D4', 5: 'D5', 6: 'D6', 7: 'D7', 8: 'D8', 9: 'D9', 10: 'D10', 11: 'D11'}
Starting sampling...
For variable 4 , the bins are {0: 'E0', 1: 'E1', 2: 'E2', 3: 'E3', 4: 'E4', 5: 'E5', 6: 'E6', 7: 'E7', 8: 'E8', 9: 'E9', 10: 'E10', 11: 'E11', 12: 'E12', 13: 'E13'}
Starting sampling...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Synthetic data 3: weight mean 10% increase with 30% prob. std fixed

In [111]:
param_syn3 = {
    "n_variables": 5, # number of histogram variables
    "bin_nos": [10, 12, 14], #will be randomly chosen
    "durations": [360, 720, 1080], #will be randomly chosen
    "n_snapshots": [10, 15, 20],
    "n_normal": 1000,
    "n_abnormal": 500,
    "prob_mean": 0.5,
    "prob_std": 0,
    "weight_mean": 1.1,
    "weight_std": 1.0
}

In [112]:
data3 = createSyntheticDataset(**param_syn3)

Snapshot & duration creation complete
For variable 0 , the bins are {0: 'A0', 1: 'A1', 2: 'A2', 3: 'A3', 4: 'A4', 5: 'A5', 6: 'A6', 7: 'A7', 8: 'A8', 9: 'A9', 10: 'A10', 11: 'A11', 12: 'A12', 13: 'A13'}
Starting sampling...
For variable 1 , the bins are {0: 'B0', 1: 'B1', 2: 'B2', 3: 'B3', 4: 'B4', 5: 'B5', 6: 'B6', 7: 'B7', 8: 'B8', 9: 'B9', 10: 'B10', 11: 'B11', 12: 'B12', 13: 'B13'}
Starting sampling...
For variable 2 , the bins are {0: 'C0', 1: 'C1', 2: 'C2', 3: 'C3', 4: 'C4', 5: 'C5', 6: 'C6', 7: 'C7', 8: 'C8', 9: 'C9', 10: 'C10', 11: 'C11'}
Starting sampling...
For variable 3 , the bins are {0: 'D0', 1: 'D1', 2: 'D2', 3: 'D3', 4: 'D4', 5: 'D5', 6: 'D6', 7: 'D7', 8: 'D8', 9: 'D9', 10: 'D10', 11: 'D11', 12: 'D12', 13: 'D13'}
Starting sampling...
For variable 4 , the bins are {0: 'E0', 1: 'E1', 2: 'E2', 3: 'E3', 4: 'E4', 5: 'E5', 6: 'E6', 7: 'E7', 8: 'E8', 9: 'E9', 10: 'E10', 11: 'E11', 12: 'E12', 13: 'E13'}
Starting sampling...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Synthetic data 4: weight std 1% increase with 50% prob. mean fixed

In [113]:
param_syn4 = {
    "n_variables": 5, # number of histogram variables
    "bin_nos": [10, 12, 14], #will be randomly chosen
    "durations": [360, 720, 1080], #will be randomly chosen
    "n_snapshots": [10, 15, 20],
    "n_normal": 1000,
    "n_abnormal": 500,
    "prob_mean": 0,
    "prob_std": 0.5,
    "weight_mean": 1.0,
    "weight_std": 1.01
}

In [114]:
data4 = createSyntheticDataset(**param_syn4)

Snapshot & duration creation complete
For variable 0 , the bins are {0: 'A0', 1: 'A1', 2: 'A2', 3: 'A3', 4: 'A4', 5: 'A5', 6: 'A6', 7: 'A7', 8: 'A8', 9: 'A9', 10: 'A10', 11: 'A11'}
Starting sampling...
For variable 1 , the bins are {0: 'B0', 1: 'B1', 2: 'B2', 3: 'B3', 4: 'B4', 5: 'B5', 6: 'B6', 7: 'B7', 8: 'B8', 9: 'B9'}
Starting sampling...
For variable 2 , the bins are {0: 'C0', 1: 'C1', 2: 'C2', 3: 'C3', 4: 'C4', 5: 'C5', 6: 'C6', 7: 'C7', 8: 'C8', 9: 'C9', 10: 'C10', 11: 'C11', 12: 'C12', 13: 'C13'}
Starting sampling...
For variable 3 , the bins are {0: 'D0', 1: 'D1', 2: 'D2', 3: 'D3', 4: 'D4', 5: 'D5', 6: 'D6', 7: 'D7', 8: 'D8', 9: 'D9'}
Starting sampling...
For variable 4 , the bins are {0: 'E0', 1: 'E1', 2: 'E2', 3: 'E3', 4: 'E4', 5: 'E5', 6: 'E6', 7: 'E7', 8: 'E8', 9: 'E9'}
Starting sampling...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Synthetic data 5: weight std 5% increase with 50% prob. mean fixed

In [115]:
param_syn5 = {
    "n_variables": 5, # number of histogram variables
    "bin_nos": [10, 12, 14], #will be randomly chosen
    "durations": [360, 720, 1080], #will be randomly chosen
    "n_snapshots": [10, 15, 20],
    "n_normal": 1000,
    "n_abnormal": 500,
    "prob_mean": 0,
    "prob_std": 0.5,
    "weight_mean": 1.0,
    "weight_std": 1.05
}

In [116]:
data5 = createSyntheticDataset(**param_syn5)

Snapshot & duration creation complete
For variable 0 , the bins are {0: 'A0', 1: 'A1', 2: 'A2', 3: 'A3', 4: 'A4', 5: 'A5', 6: 'A6', 7: 'A7', 8: 'A8', 9: 'A9'}
Starting sampling...
For variable 1 , the bins are {0: 'B0', 1: 'B1', 2: 'B2', 3: 'B3', 4: 'B4', 5: 'B5', 6: 'B6', 7: 'B7', 8: 'B8', 9: 'B9'}
Starting sampling...
For variable 2 , the bins are {0: 'C0', 1: 'C1', 2: 'C2', 3: 'C3', 4: 'C4', 5: 'C5', 6: 'C6', 7: 'C7', 8: 'C8', 9: 'C9', 10: 'C10', 11: 'C11', 12: 'C12', 13: 'C13'}
Starting sampling...
For variable 3 , the bins are {0: 'D0', 1: 'D1', 2: 'D2', 3: 'D3', 4: 'D4', 5: 'D5', 6: 'D6', 7: 'D7', 8: 'D8', 9: 'D9', 10: 'D10', 11: 'D11'}
Starting sampling...
For variable 4 , the bins are {0: 'E0', 1: 'E1', 2: 'E2', 3: 'E3', 4: 'E4', 5: 'E5', 6: 'E6', 7: 'E7', 8: 'E8', 9: 'E9', 10: 'E10', 11: 'E11', 12: 'E12', 13: 'E13'}
Starting sampling...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Synthetic data 6: weight std 10% increase with 50% prob. mean fixed

In [117]:
param_syn6 = {
    "n_variables": 5, # number of histogram variables
    "bin_nos": [10, 12, 14], #will be randomly chosen
    "durations": [360, 720, 1080], #will be randomly chosen
    "n_snapshots": [10, 15, 20],
    "n_normal": 1000,
    "n_abnormal": 500,
    "prob_mean": 0,
    "prob_std": 0.5,
    "weight_mean": 1.0,
    "weight_std": 1.1
}

In [118]:
data6 = createSyntheticDataset(**param_syn6)

Snapshot & duration creation complete
For variable 0 , the bins are {0: 'A0', 1: 'A1', 2: 'A2', 3: 'A3', 4: 'A4', 5: 'A5', 6: 'A6', 7: 'A7', 8: 'A8', 9: 'A9'}
Starting sampling...
For variable 1 , the bins are {0: 'B0', 1: 'B1', 2: 'B2', 3: 'B3', 4: 'B4', 5: 'B5', 6: 'B6', 7: 'B7', 8: 'B8', 9: 'B9', 10: 'B10', 11: 'B11', 12: 'B12', 13: 'B13'}
Starting sampling...
For variable 2 , the bins are {0: 'C0', 1: 'C1', 2: 'C2', 3: 'C3', 4: 'C4', 5: 'C5', 6: 'C6', 7: 'C7', 8: 'C8', 9: 'C9', 10: 'C10', 11: 'C11'}
Starting sampling...
For variable 3 , the bins are {0: 'D0', 1: 'D1', 2: 'D2', 3: 'D3', 4: 'D4', 5: 'D5', 6: 'D6', 7: 'D7', 8: 'D8', 9: 'D9', 10: 'D10', 11: 'D11'}
Starting sampling...
For variable 4 , the bins are {0: 'E0', 1: 'E1', 2: 'E2', 3: 'E3', 4: 'E4', 5: 'E5', 6: 'E6', 7: 'E7', 8: 'E8', 9: 'E9', 10: 'E10', 11: 'E11'}
Starting sampling...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [57]:
def runAnalysis(z, constraints_1 = [0.05, 0, 2000],
               constraints_2 = [0, 0, 2000], timeout = 20000000):
    
    #abnormal set
    database1 = Database(z.repair_intervals_removed)
    constraints1 = makeConstraints(constraints_1, z.repair_intervals_removed)
    algorithm1 = ZMiner(database1, constraints1, forgettable=True)
    algorithm1.constraints["timeoutseconds"] = timeout
    count, freq, timedelta, timeout1, FL_repair = algorithm1.ZMiner()
    
    #normal set
    database2 = Database(z.normal_intervals_removed)
    constraints2 = makeConstraints(constraints_2, z.normal_intervals_removed)
    algorithm2 = ZMinerD(database2, constraints2, FL_repair, forgettable=True)
    algorithm2.constraints["timeoutseconds"] = timeout
    count2, freq2, tdelta2, timeout2, FL_normal = algorithm2.ZMiner()
    
    return FL_normal, FL_repair, algorithm1.constraints
    

In [17]:
def saveAnalysis(set1, set2, size_set1, size_set2, filename, constraints):
    
    #Constraints are just used to generate a file name
    for j in set1[2]:
        for k in set1[2][j]:
            set1[2][j][k] = len(set1[2][j][k])

    for j in set2[2]:
        for k in set2[2][j]:
            set2[2][j][k] = len(set2[2][j][k])
    
    exportDisprop(filename, set1, set2, size_set1, size_set2, constraints)

Save synthetic datasets

In [121]:
data1.to_excel("Synthetic_data_new1.xlsx")
data2.to_excel("Synthetic_data_new2.xlsx")
data3.to_excel("Synthetic_data_new3.xlsx")
data4.to_excel("Synthetic_data_new4.xlsx")
data5.to_excel("Synthetic_data_new5.xlsx")
data6.to_excel("Synthetic_data_new6.xlsx")

In [19]:
def loadData(filename, columns = ['no', 'time', 'snap', 'date', 'status']):
    data_loadtest = pd.read_excel(filename, header=[0, 1], index_col=0)
    data_copied = data_loadtest[columns]
    data_copied.columns = columns
    a = data_loadtest.drop(columns, axis=1)
    a[columns] = data_copied
    a = a.fillna(0)
    return a

In [20]:
data1 = loadData("Synthetic_data_new1.xlsx")
data2 = loadData("Synthetic_data_new2.xlsx")
data3 = loadData("Synthetic_data_new3.xlsx")
data4 = loadData("Synthetic_data_new4.xlsx")
data5 = loadData("Synthetic_data_new5.xlsx")
data6 = loadData("Synthetic_data_new6.xlsx")

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


### Run the algorithm

1. Create intervals

In [122]:
z1 = ZHist(data1, ['no','time','snap','date','status'])
z1.fit()

getWeightedAverage: 1.2003814519848675
interval creation started
current: A
current: B
current: C
current: D
current: E
interval creation is done: 217.06658401200548
interval separation started
interval separation is done: 0.01011984795331955


In [123]:
z2 = ZHist(data2, ['no','time','snap','date','status'])
z2.fit()

getWeightedAverage: 1.2087526749819517
interval creation started
current: A
current: B
current: C
current: D
current: E
interval creation is done: 217.27154814917594
interval separation started
interval separation is done: 0.010706569999456406


In [124]:
z3 = ZHist(data3, ['no','time','snap','date','status'])
z3.fit()

getWeightedAverage: 1.2045948880258948
interval creation started
current: A
current: B
current: C
current: D
current: E
interval creation is done: 211.9140679340344
interval separation started
interval separation is done: 0.011008181143552065


In [129]:
z4 = ZHist(data4, ['no','time','snap','date','status'])
z4.fit()

getWeightedAverage: 1.8409740149509162
interval creation started
current: A
current: B
current: C
current: D
current: E
interval creation is done: 264.2915248409845
interval separation started
interval separation is done: 0.009996433975175023


In [131]:
z5 = ZHist(data5, ['no','time','snap','date','status'])
z5.fit()

getWeightedAverage: 1.8548737389501184
interval creation started
current: A
current: B
current: C
current: D
current: E
interval creation is done: 260.1238736829255
interval separation started
interval separation is done: 0.009955997113138437


In [132]:
z6 = ZHist(data6, ['no','time','snap','date','status'])
z6.fit()

getWeightedAverage: 1.8592853820882738
interval creation started
current: A
current: B
current: C
current: D
current: E
interval creation is done: 260.16304202890024
interval separation started
interval separation is done: 0.011884032050147653


In [149]:
z7 = ZHist(data7, ['no','time','snap','date','status'])
z7.fit()

getWeightedAverage: 1.2041378929279745
interval creation started
current: A
current: B
current: C
current: D
current: E
interval creation is done: 215.02100859908387
interval separation started
interval separation is done: 0.010951247066259384


2. Create analysis

In [134]:
FL_normal_1, FL_repair_1, c1 = runAnalysis(z1)

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 25.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
1-5. LEVEL: inf
2. NUMBER OF E-SEQUENCES: 500
3. TOTAL COMPARISON COUNTS: 2888251
4. TOTAL FREQUENT ARRANGEMENTS: 8134
5. TOTAL TIME CONSUMED: 42.204905441001756
########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 0.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
2. NUMBER OF E-SEQUENCES: 1000
3. TOTAL COMPARISON COUNTS: 1970949
4. TOTAL FREQUENT ARRANGEMENTS: 8134
5. TOTAL TIME CONSUMED: 30.26906665199931


In [135]:
FL_normal_2, FL_repair_2, c2 = runAnalysis(z2)

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 25.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
1-5. LEVEL: inf
2. NUMBER OF E-SEQUENCES: 500
3. TOTAL COMPARISON COUNTS: 2978262
4. TOTAL FREQUENT ARRANGEMENTS: 8132
5. TOTAL TIME CONSUMED: 42.94829656699949
########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 0.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
2. NUMBER OF E-SEQUENCES: 1000
3. TOTAL COMPARISON COUNTS: 2037866
4. TOTAL FREQUENT ARRANGEMENTS: 8132
5. TOTAL TIME CONSUMED: 30.838913900002808


In [136]:
FL_normal_3, FL_repair_3, c3 = runAnalysis(z3)

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 25.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
1-5. LEVEL: inf
2. NUMBER OF E-SEQUENCES: 500
3. TOTAL COMPARISON COUNTS: 2912834
4. TOTAL FREQUENT ARRANGEMENTS: 8075
5. TOTAL TIME CONSUMED: 41.307417665000685
########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 0.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
2. NUMBER OF E-SEQUENCES: 1000
3. TOTAL COMPARISON COUNTS: 1970069
4. TOTAL FREQUENT ARRANGEMENTS: 8075
5. TOTAL TIME CONSUMED: 30.296202350000385


In [138]:
FL_normal_4, FL_repair_4, c4 = runAnalysis(z4)

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 25.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
1-5. LEVEL: inf
2. NUMBER OF E-SEQUENCES: 500
3. TOTAL COMPARISON COUNTS: 8612111
4. TOTAL FREQUENT ARRANGEMENTS: 21888
5. TOTAL TIME CONSUMED: 109.52835602899722
########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 0.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
2. NUMBER OF E-SEQUENCES: 1000
3. TOTAL COMPARISON COUNTS: 1629374
4. TOTAL FREQUENT ARRANGEMENTS: 15704
5. TOTAL TIME CONSUMED: 27.213986329999898


In [139]:
FL_normal_5, FL_repair_5, c5 = runAnalysis(z5)

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 25.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
1-5. LEVEL: inf
2. NUMBER OF E-SEQUENCES: 500
3. TOTAL COMPARISON COUNTS: 8941354
4. TOTAL FREQUENT ARRANGEMENTS: 24188
5. TOTAL TIME CONSUMED: 117.38165531999766
########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 0.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
2. NUMBER OF E-SEQUENCES: 1000
3. TOTAL COMPARISON COUNTS: 1604464
4. TOTAL FREQUENT ARRANGEMENTS: 17772
5. TOTAL TIME CONSUMED: 30.078794281002047


In [140]:
FL_normal_6, FL_repair_6, c6 = runAnalysis(z6)

########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 25.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
1-5. LEVEL: inf
2. NUMBER OF E-SEQUENCES: 500
3. TOTAL COMPARISON COUNTS: 8849820
4. TOTAL FREQUENT ARRANGEMENTS: 23023
5. TOTAL TIME CONSUMED: 113.03075962199728
########## Z-MINER ##########
1-1. MINIMUM SUPPORT: 0.0
1-2. EPSILON CONSTRAINT: 0.0
1-3. GAP CONSTRAINT: 2000.0
1-4. TIMEOUT: 20000000
2. NUMBER OF E-SEQUENCES: 1000
3. TOTAL COMPARISON COUNTS: 2183548
4. TOTAL FREQUENT ARRANGEMENTS: 20199
5. TOTAL TIME CONSUMED: 41.325332876000175


In [64]:
def saveAnalysis(set1, set2, size_set1, size_set2, filename, constraints, lenFix=True):
    
    if lenFix==True:
        #Constraints are just used to generate a file name
        for j in set1[2]:
            for k in set1[2][j]:
                set1[2][j][k] = len(set1[2][j][k])

        for j in set2[2]:
            for k in set2[2][j]:
                set2[2][j][k] = len(set2[2][j][k])

    exportDisprop(filename, set1, set2, size_set1, size_set2, constraints)

In [67]:
lenFix = True

In [142]:
saveAnalysis(FL_repair_1, FL_normal_1, 500, 1000, "Synthetic_1_new", c1, lenFix=lenFix)

In [143]:
saveAnalysis(FL_repair_2, FL_normal_2, 500, 1000, "Synthetic_2_new", c2, lenFix=lenFix)

In [144]:
saveAnalysis(FL_repair_3, FL_normal_3, 500, 1000, "Synthetic_3_new", c3, lenFix=lenFix)

In [145]:
saveAnalysis(FL_repair_4, FL_normal_4, 500, 1000, "Synthetic_4_new", c4, lenFix=lenFix)

In [146]:
saveAnalysis(FL_repair_5, FL_normal_5, 500, 1000, "Synthetic_5_new", c5, lenFix=lenFix)

In [147]:
saveAnalysis(FL_repair_6, FL_normal_6, 500, 1000, "Synthetic_6_new", c6, lenFix=lenFix)