In [None]:
from datetime import timedelta
import time
import numpy as np
from functools import partial
from sklearn.model_selection import train_test_split
import itertools
import pickle
import pandas as pd

In [None]:
def filterAlarmData(df, months=None, sources_filter=None, monmentarly_filter=None, staling_filter=None, ingore_communication_alarms=False, min_alarms_per_source=10):
    """Return the rows of ``df`` that survive the configured alarm filters.

    The filters run in this order; the source-count filter is last so that it
    counts only the alarms that are still present:

    1. keep rows whose ``TimeDelta`` is greater than ``monmentarly_filter``
    2. keep rows whose ``TimeDelta`` is less than ``staling_filter``
    3. drop rows whose ``Condition`` is ``"IOP"`` or ``"IOP-"``
    4. keep rows whose ``Year-Month`` is one of ``months``
    5. drop rows whose ``SourceName`` is listed in ``sources_filter``
    6. drop every source left with fewer than ``min_alarms_per_source`` rows

    Parameters
    ----------
    df : pandas.DataFrame
        Alarm records. ``SourceName`` is always read; ``TimeDelta``,
        ``Condition`` and ``Year-Month`` are read only by the filters that
        are switched on.
    months : iterable, optional
        ``Year-Month`` values to keep. ``None`` keeps every month.
    sources_filter : iterable, optional
        Source names to exclude. ``None`` (the default) excludes nothing.
    monmentarly_filter : number, optional
        Exclusive lower bound on ``TimeDelta``; ``None`` disables the filter.
    staling_filter : number, optional
        Exclusive upper bound on ``TimeDelta``; ``None`` disables the filter.
    ingore_communication_alarms : bool
        When truthy, rows with condition ``"IOP"`` / ``"IOP-"`` are removed.
    min_alarms_per_source : int
        Minimum number of remaining rows a source needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        A filtered frame that keeps the original index labels of the
        surviving rows. ``df`` itself is not modified.
    """
    # A fresh list per call replaces the former mutable ``[]`` default, which
    # was one shared object across all calls.
    if sources_filter is None:
        sources_filter = []

    print(f">>Preprocessing... \n   Months to include={months}\n  Ignore Sources={sources_filter}\n  Ingnore Momentarlily Alarms Filter={monmentarly_filter}seconds \n   Ignoreing Staling Alarms Filter={staling_filter} seconds, \n Ignore Communication Alarms = {ingore_communication_alarms} \n Remove sources whose count is less than {min_alarms_per_source}")

    if monmentarly_filter is not None:
        df = df[df["TimeDelta"] > monmentarly_filter]

    if staling_filter is not None:
        df = df[df["TimeDelta"] < staling_filter]

    if ingore_communication_alarms:
        df = df[~df["Condition"].isin(["IOP", "IOP-"])]

    # ``months=None`` means "all months", so there is nothing to filter then.
    if months is not None:
        df = df[df["Year-Month"].isin(months)]

    df = df[~df["SourceName"].isin(sources_filter)]

    # Count only the alarms that survived the filters above.
    alarms_per_source = df["SourceName"].value_counts()
    frequent_sources = alarms_per_source[alarms_per_source >= min_alarms_per_source].index
    return df[df["SourceName"].isin(frequent_sources)]

def _concatenateSourceNameAndCondition(sname, condition):
    """Join one source name and one condition into ``"<source>-<condition>"``."""
    return sname+"-"+condition


def updatSourceNamewithCondition(df):
    """Append each row's ``Condition`` to its ``SourceName``, in place.

    After the call every ``SourceName`` reads ``"<source>-<condition>"``.
    The concatenation is done column-wise rather than with a Python call per
    row, and it also works on a frame with no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``SourceName`` and ``Condition``. Its
        ``SourceName`` column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Same "<source>-<condition>" format as _concatenateSourceNameAndCondition,
    # applied to the whole columns at once.
    df["SourceName"] = df["SourceName"] + "-" + df["Condition"]
    return df

def convertSourceNamesToAlias(df):
    """Replace every ``SourceName`` in ``df`` with a short alias, in place.

    Aliases are ``A0``, ``A1``, ... and are handed out in the order in which
    the distinct source names first appear in the column.

    Returns
    -------
    tuple of dict
        ``(name2alias, alias2name)``: the lookup from original name to alias
        and its inverse.
    """
    alias2name = {}
    name2alias = {}
    for position, original_name in enumerate(df["SourceName"].unique()):
        alias = f"A{position}"
        alias2name[alias] = original_name
        name2alias[original_name] = alias

    # Overwrites the column of the caller's frame; nothing is copied.
    df["SourceName"] = df["SourceName"].apply(lambda original_name: name2alias[original_name])
    return name2alias, alias2name

def __removeChatteringAlarmsHelper(alarms, chattering_timedelta_threshold=60.0, chattering_count_threshold=3):
    """Drop the chattering repeats from the alarms of one source/condition.

    The alarms are processed in ``StartTime`` order. Each kept alarm acts as
    an anchor: the following alarms that start no more than
    ``chattering_timedelta_threshold`` seconds after the anchor's start are
    counted. If at least ``chattering_count_threshold`` of them exist, they
    are treated as chattering and skipped; otherwise the very next alarm
    becomes the new anchor.

    Parameters
    ----------
    alarms : list of dict
        Records with ``StartTime`` and ``EndTime`` values whose difference
        is a ``datetime.timedelta`` (or a subclass of it).
    chattering_timedelta_threshold : float
        Window length in seconds, measured from the anchor's start.
    chattering_count_threshold : int
        Minimum number of followers inside the window that makes them
        chattering.

    Returns
    -------
    list of dict
        The kept records, in ``StartTime`` order.

    Raises
    ------
    AssertionError
        If an inspected alarm starts before the anchor alarm has ended, or
        ends before the anchor's end.
    """
    ordered = sorted(alarms, key=lambda alarm: alarm["StartTime"])
    alarms_without_chattering = []
    total = len(ordered)

    i = 0
    while i < total:
        anchor = ordered[i]
        alarms_without_chattering.append(anchor)
        anchor_start = anchor["StartTime"]
        anchor_end = anchor["EndTime"]

        j = i + 1
        while j < total:
            next_start = ordered[j]["StartTime"]
            next_end = ordered[j]["EndTime"]

            # These asserts are very important: the anchor alarm has to turn
            # off before the start of every later alarm that is inspected.
            assert anchor_start <= next_start
            assert anchor_end <= next_start
            assert anchor_end <= next_end

            delta = timedelta.total_seconds(next_start - anchor_start)
            assert delta >= 0

            if delta > chattering_timedelta_threshold:
                break
            j += 1

        # Alarms i+1 .. j-1 started inside the anchor's window.
        followers_in_window = j - i - 1
        if followers_in_window >= chattering_count_threshold:
            i = j
        else:
            i += 1

    return alarms_without_chattering


def removeChatteringAlarms(df, chattering_timedelta_threshold=60.0, chattering_count_threshold=3):
    """Remove chattering alarms separately for every source and condition.

    The frame is split by ``SourceName`` and then by ``Condition`` (both in
    order of first appearance); each group is cleaned on its own and the
    kept alarms are put together again.

    Parameters
    ----------
    df : pandas.DataFrame
        Alarm records with ``SourceName``, ``Condition``, ``StartTime`` and
        ``EndTime`` columns.
    chattering_timedelta_threshold : float
        Window length in seconds passed on to the per-group clean-up.
    chattering_count_threshold : int
        Follower count passed on to the per-group clean-up.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh default index and the same columns as
        ``df``, including when no alarm is left.
    """
    kept_alarms = []
    # sort=False keeps groups in order of first appearance; dropna=False keeps
    # rows whose key is missing as a group of their own instead of dropping them.
    for _, df_source in df.groupby("SourceName", sort=False, dropna=False):
        for _, df_condition in df_source.groupby("Condition", sort=False, dropna=False):
            kept_alarms.extend(
                __removeChatteringAlarmsHelper(
                    df_condition.to_dict(orient="records"),
                    chattering_timedelta_threshold=chattering_timedelta_threshold,
                    chattering_count_threshold=chattering_count_threshold,
                )
            )

    # Passing the columns keeps the schema even when nothing was kept.
    return pd.DataFrame(kept_alarms, columns=df.columns)

def loadAlarmsData(file_path):
    """Read the alarm CSV at ``file_path`` into a DataFrame.

    Only the six alarm columns listed below are loaded; ``StartTime`` and
    ``EndTime`` are parsed as dates.
    """
    wanted_columns = ["SourceName", "Condition", "StartTime", "EndTime", "TimeDelta", "Year-Month"]
    date_columns = ["StartTime", "EndTime"]
    return pd.read_csv(
        file_path,
        low_memory=False,
        usecols=wanted_columns,
        parse_dates=date_columns,
    )

def getDFWithCommonSourcesInAllMonths(df):
    """Keep only the alarms of sources that occur in every month of ``df``.

    A source qualifies when the number of distinct ``Year-Month`` values it
    appears in equals the number of distinct ``Year-Month`` values of the
    whole frame. Missing ``Year-Month`` values are not counted as a month,
    and an empty frame is returned unchanged in content (no rows, same
    columns) instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Alarm records with ``SourceName`` and ``Year-Month`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose source is present in all months, with their
        original index labels.
    """
    total_months = df["Year-Month"].nunique()
    # One pass over the frame instead of one full scan per month.
    months_per_source = df.groupby("SourceName")["Year-Month"].nunique()
    common_sourcenames_in_all_months = months_per_source[months_per_source == total_months].index
    return df[df["SourceName"].isin(common_sourcenames_in_all_months)]

def getSequenceOfWholeData(df,seq_duration_gap,filter_short_seq):
    """Cut the alarms of ``df`` into time windows and return their source names.

    Alarms are taken in ``StartTime`` order. The first alarm opens a window;
    every later alarm that starts less than ``seq_duration_gap`` seconds
    after the window's first alarm joins it, and the first alarm that does
    not opens the next window.

    Returns
    -------
    tuple
        ``(list_of_sequences, max_seq_len)``. ``list_of_sequences`` holds, for
        every window with at least ``filter_short_seq`` alarms, the list of
        its ``SourceName`` values. ``max_seq_len`` is the size of the largest
        window, short ones included, or ``-1`` when ``df`` has no rows.
    """
    print(f">> Duration to next seq: {seq_duration_gap}, ignore seq len: {filter_short_seq}")

    chronological = sorted(df.to_dict(orient="records"), key=lambda record: record["StartTime"])

    windows = []
    window_start = None
    for alarm in chronological:
        alarm_start = alarm["StartTime"]

        joins_open_window = False
        if windows:
            delta = timedelta.total_seconds(alarm_start - window_start)
            assert delta >= 0
            joins_open_window = not (delta >= seq_duration_gap)

        if joins_open_window:
            windows[-1].append(alarm)
        else:
            # The gap is always measured from the first alarm of the window.
            windows.append([alarm])
            window_start = alarm_start

    max_seq_len = max((len(window) for window in windows), default=-1)
    list_of_sequences = [
        [alarm["SourceName"] for alarm in window]
        for window in windows
        if len(window) >= filter_short_seq
    ]
    return list_of_sequences, max_seq_len


In [None]:
"""  Lodading the Data and Preprocessing """
# Time the CSV load together with the SourceName update below.
start = time.time()

# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists at exactly this location.
df_main_alarms =loadAlarmsData(file_path="/home/waris/Github/predict-future-alarms/.data/final-all-months-alarms.csv")
# From here on every SourceName carries its Condition as a "-<condition>" suffix.
df_main_alarms = updatSourceNamewithCondition(df_main_alarms)
print("Total Time to load the data ", time.time()-start)

# Bare last expression: shows the loaded frame as the cell output.
df_main_alarms

In [None]:
# Change every SourceName to a short alias (A0, A1, ...) and keep both lookup tables.
# NOTE(review): this comment used to say the step is skipped, but the call is active.
# It rewrites df_main_alarms in place, so re-running only this cell would rebuild
# the lookup tables from the aliases instead of the original source names.
source2Alias, alias2source = convertSourceNamesToAlias(df_main_alarms)
df_main_alarms

In [None]:
""" 
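    Filter the Alarm Data
    1. Ignore the communication alarms
    2. Ignore momentary alarms (e.g. shorter than 20 seconds); momentary_alarms_f = None leaves this filter off
    3. Remove staling alarms (e.g. longer than 12 hours); staling_alarm_f = None leaves this filter off
    4. Remove sources that are triggered fewer than 20 times in the whole dataset
    5. Include all the months
    6. DO NOT SKIP ANY SOURCENAME IF IGNORING COMMUNICATION ALARMS (leave snames_f empty)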
"""
# Drop the communication alarms (conditions "IOP" / "IOP-" in filterAlarmData).
ignore_comm_alarms = True
# Exclusive lower bound on TimeDelta; None switches the momentary-alarm filter off.
momentary_alarms_f = None # seconds
# Exclusive upper bound on TimeDelta; None switches the staling-alarm filter off.
# NOTE(review): labelled "hours" here, but filterAlarmData compares the value
# directly with TimeDelta and logs it as seconds -- confirm the unit before enabling.
staling_alarm_f = None # hours
min_alarms_per_source_f = 20 # any source that is not triggered at least 20 times in the whole dataset will be removed
# Every month present in the data, i.e. no month is excluded.
months_f = df_main_alarms["Year-Month"].unique()
snames_f = [] # source names to exclude; ONLY USE IF NOT IGNORING COMM ALARMS

# filterAlarmData returns a filtered frame; df_main_alarms itself is not reassigned here.
df_alarms_new = filterAlarmData(df_main_alarms, months=months_f, sources_filter=snames_f,
                                     monmentarly_filter=momentary_alarms_f, staling_filter=staling_alarm_f, ingore_communication_alarms=ignore_comm_alarms, min_alarms_per_source=min_alarms_per_source_f)


df_alarms_new

In [None]:
# Remove chattering repeats per source and condition; the result is a new
# frame built from the kept records, so it gets a fresh default index.
df_rnn = removeChatteringAlarms(df_alarms_new)
df_rnn

In [None]:
# 15 minutes: an alarm that starts this many seconds (or more) after the first
# alarm of the current sequence opens a new sequence.
duration_from_1_seq_to_next = 60*15 # duration in seconds
filter_short_seq = 4 # drop every sequence that has fewer than 4 alarms
# max_seq_len also takes the dropped short sequences into account.
li_of_seqs,max_seq_len = getSequenceOfWholeData(df_rnn,duration_from_1_seq_to_next,filter_short_seq)
print(len(li_of_seqs))
print(li_of_seqs[:2])

In [None]:
# Write one sequence per line, its source names separated by single spaces.
# NOTE(review): absolute, machine-specific output path, and no explicit
# encoding is passed to open(), so the platform default is used.
with open("/home/waris/Github/predict-future-alarms/.data/seqs.tokens","w") as f:
    for seq in li_of_seqs:
        f.write(f"{' '.join(seq)}\n")
