In [1]:
from datetime import timedelta
import time
import numpy as np
from functools import partial
from sklearn.model_selection import train_test_split
import itertools
import pickle
import pandas as pd

In [2]:
def filterAlarmData(df, months=None, sources_filter=[], monmentarly_filter=None, staling_filter=None ,ingore_communication_alarms=False, min_alarms_per_source=10):

    print(f">>Preprocessing... \n   Months to include={months}\n  Ignore Sources={sources_filter}\n  Ingnore Momentarlily Alarms Filter={monmentarly_filter}seconds \n   Ignoreing Staling Alarms Filter={staling_filter} seconds, \n Ignore Communication Alarms = {ingore_communication_alarms} \n Remove sources whose count is less than {min_alarms_per_source}")
    
    
    if months is None:
        months = df["Year-Month"].unique()

    if monmentarly_filter is not None: 
        df = df[(df["TimeDelta"] > monmentarly_filter)]

    if staling_filter is not None:
        df =  df[(df["TimeDelta"] < staling_filter)]

    if ingore_communication_alarms==True:
        df = df[~df["Condition"].isin(["IOP", "IOP-"])]
    
    df =  df[(df["Year-Month"].isin(months))]
    df = df[(~df["SourceName"].isin(sources_filter))]

    source2count = dict(df["SourceName"].value_counts())
    select_sources = [k for k, v in source2count.items() if v >= min_alarms_per_source]
    df = df[df["SourceName"].isin(select_sources)]

    return df

def _concatenateSourceNameAndCondition(sname, condition):
    return sname+"-"+condition


def updatSourceNamewithCondition(df):
    df["SourceName"] = df[["SourceName", "Condition"]].apply(
        lambda arg: _concatenateSourceNameAndCondition(*arg), axis=1)
    return df

def convertSourceNamesToAlias(df):
    alias2name = {f"A{k}": v for k, v in enumerate(df["SourceName"].unique())}
    name2alias = {v: k for k, v in alias2name.items()}
    df["SourceName"] = df["SourceName"].apply(lambda sname: name2alias[sname])
    return name2alias, alias2name

def __removeChatteringAlarmsHelper(alarms, chattering_timedelta_threshold=60.0, chattering_count_threshold=2):
    """Find the chatterings in an alarms list from the same source.  
    """

    alarms_without_chattering = []
    alarms = [alarm for alarm in sorted(alarms, key=lambda arg: arg["StartTime"], reverse=False)]
    i = 0
    j = 0

    while i < (len(alarms)):
        alarms_without_chattering.append(alarms[i])
        prev_start = alarms[i]["StartTime"]
        prev_end = alarms[i]["EndTime"]
        count_alarms = 0
        j = i + 1
        while j < len(alarms):
            next_start = alarms[j]["StartTime"]
            next_end = alarms[j]["EndTime"]

            # this assert is very important: the prev alarm has to turn off before the start of
            # the next one
            assert(prev_start <= next_start)
            assert(prev_end <= next_start)
            assert(prev_end <= next_end)

            delta = timedelta.total_seconds(next_start - prev_start)
            assert (delta >= 0)
            
            if delta > chattering_timedelta_threshold:
                break
            count_alarms += 1
            
            j += 1
        
        if count_alarms >= chattering_count_threshold:
            i = j
        else:
            i += 1

    return alarms_without_chattering

def removeChatteringAlarms(df,chattering_timedelta_threshold=120):
    alarms_without_chatterings = []
    for sname in df["SourceName"].unique():
        df_source = df.loc[df['SourceName'].isin([sname])]

        for condition in df_source["Condition"].unique():
            df_condition = df_source.loc[df_source['Condition'].isin([condition])]
            alarms = __removeChatteringAlarmsHelper(df_condition.to_dict(orient="records"),chattering_timedelta_threshold=chattering_timedelta_threshold)
            alarms_without_chatterings = alarms_without_chatterings + alarms

    return pd.DataFrame(alarms_without_chatterings)

def loadAlarmsData(file_path):
    df = pd.read_csv(file_path, low_memory=False, usecols=["SourceName", "Condition", "StartTime","EndTime","TimeDelta","Year-Month"],parse_dates=["StartTime", "EndTime"])
    return df

def getDFWithCommonSourcesInAllMonths(df):
    each_month_source_names = [[sname for sname in df[df["Year-Month"]==month]["SourceName"].unique()] for month in df["Year-Month"].unique()]
    common_sourcenames_in_all_months = set.intersection(*[set(l) for l in each_month_source_names])
    df = df[df["SourceName"].isin(common_sourcenames_in_all_months)]
    return df

def getSequenceOfWholeData(df,seq_duration_gap,filter_short_seq):
    print(f">> Duration to next seq: {seq_duration_gap}, ignore seq len: {filter_short_seq}")

    list_of_sequences = []    
    alarms= df.to_dict(orient="records")
    alarms = [alarm for alarm in sorted(alarms, key=lambda arg: arg["StartTime"], reverse=False)] # sorting
    i =0
    j= 0

    max_seq_len = -1
    while i <len(alarms):
        prev_start = alarms[i]["StartTime"]
        seq = []
        seq.append(alarms[i])
        j = i+1
        while j < len(alarms):    
            next_start = alarms[j]["StartTime"]
            delta = timedelta.total_seconds(next_start - prev_start)
            # print(delta)
            assert delta >= 0
            if delta >= seq_duration_gap:
                break

            seq.append(alarms[j])
            j += 1
        i = j

        if len(seq) > max_seq_len:
            max_seq_len = len(seq)
        
        if len(seq)>=filter_short_seq:
            seq = [alarm for alarm in sorted(seq, key=lambda arg: arg["StartTime"], reverse=False)]
            seq = [alarm["SourceName"] for alarm in seq]
            list_of_sequences.append(seq)
    
    
    return list_of_sequences, max_seq_len


In [3]:
"""  Lodading the Data and Preprocessing """
start = time.time()

df_main_alarms =loadAlarmsData(file_path="/home/waris/Github/predict-future-alarms/.data/final-all-months-alarms.csv")
df_main_alarms = updatSourceNamewithCondition(df_main_alarms)
print("Total Time to load the data ", time.time()-start)

df_main_alarms

Total Time to load the data  95.24953818321228


Unnamed: 0,SourceName,Condition,StartTime,EndTime,TimeDelta,Year-Month
0,48TIC2026-VEL-,VEL-,2020-03-01 00:00:03,2020-03-01 00:00:05,2.0,"(2020, 3)"
1,48TIC2026-VEL-,VEL-,2020-03-01 00:00:07,2020-03-01 00:00:08,1.0,"(2020, 3)"
2,48TIC2026-VEL-,VEL-,2020-03-01 00:00:10,2020-03-01 00:00:11,1.0,"(2020, 3)"
3,48TIC2026-VEL-,VEL-,2020-03-01 00:00:18,2020-03-01 00:00:20,2.0,"(2020, 3)"
4,48TIC2026-VEL-,VEL-,2020-03-01 00:00:22,2020-03-01 00:00:23,1.0,"(2020, 3)"
...,...,...,...,...,...,...
12541434,47PI1517B-LLL,LLL,2020-01-09 02:54:19,2020-01-09 02:58:34,255.0,"(2020, 1)"
12541435,47PI2018-LO,LO,2020-01-31 15:15:28,2020-01-31 15:52:16,2208.0,"(2020, 1)"
12541436,47FI2030-HI,HI,2020-01-24 15:22:23,2020-01-24 15:25:16,173.0,"(2020, 1)"
12541437,47XL3503B-ANN-ALM,ALM,2020-01-29 09:18:04,2020-01-29 09:19:29,85.0,"(2020, 1)"


In [20]:
# """ Chaning name 2 alias for alarm data but skipping it """
source2Alias, alias2source = convertSourceNamesToAlias(df_main_alarms)
df_main_alarms

Unnamed: 0,SourceName,Condition,StartTime,EndTime,TimeDelta,Year-Month
0,A0,VEL-,2020-03-01 00:00:03,2020-03-01 00:00:05,2.0,"(2020, 3)"
1,A0,VEL-,2020-03-01 00:00:07,2020-03-01 00:00:08,1.0,"(2020, 3)"
2,A0,VEL-,2020-03-01 00:00:10,2020-03-01 00:00:11,1.0,"(2020, 3)"
3,A0,VEL-,2020-03-01 00:00:18,2020-03-01 00:00:20,2.0,"(2020, 3)"
4,A0,VEL-,2020-03-01 00:00:22,2020-03-01 00:00:23,1.0,"(2020, 3)"
...,...,...,...,...,...,...
12541434,A2260,LLL,2020-01-09 02:54:19,2020-01-09 02:58:34,255.0,"(2020, 1)"
12541435,A565,LO,2020-01-31 15:15:28,2020-01-31 15:52:16,2208.0,"(2020, 1)"
12541436,A807,HI,2020-01-24 15:22:23,2020-01-24 15:25:16,173.0,"(2020, 1)"
12541437,A1001,ALM,2020-01-29 09:18:04,2020-01-29 09:19:29,85.0,"(2020, 1)"


In [21]:
source2count = dict(df_main_alarms["SourceName"].value_counts())
# source2count


In [22]:
""" 
    Filter the Alarm Data
    1. Ignore the communication Alarms
    2. Ignore the momentary alarms => 20 seconds
    3. Remove Staling Alarms => 12 hours    
    4. Remove sources which are triggered less 20 in whole dataset
    5. Include all the months
    6. DO SKIP ANY SOURCENAME IF IGNORING COMMUNICATION ALARMS
"""
ignore_comm_alarms = True
momentary_alarms_f = None # seconds
staling_alarm_f = None # hours
min_alarms_per_source_f = 500 # any source which is not triggered atleast 20 times in whole dataset will be removed
months_f = df_main_alarms["Year-Month"].unique()
snames_f = [] # ONLY USE IF NOT IGNORING COMM ALRMS

df_alarms_new = filterAlarmData(df_main_alarms, months=months_f, sources_filter=snames_f,
                                     monmentarly_filter=momentary_alarms_f, staling_filter=staling_alarm_f, ingore_communication_alarms=ignore_comm_alarms, min_alarms_per_source=min_alarms_per_source_f)


df_alarms_new

>>Preprocessing... 
   Months to include=['(2020, 3)' '(2019, 7)' '(2019, 8)' '(2019, 3)' '(2019, 10)' '(2019, 11)'
 '(2019, 9)' '(2020, 2)' '(2019, 5)' '(2019, 6)' '(2019, 12)' '(2019, 4)'
 '(2020, 1)']
  Ignore Sources=[]
  Ingnore Momentarlily Alarms Filter=Noneseconds 
   Ignoreing Staling Alarms Filter=None seconds, 
 Ignore Communication Alarms = True 
 Remove sources whose count is less than 500


Unnamed: 0,SourceName,Condition,StartTime,EndTime,TimeDelta,Year-Month
0,A0,VEL-,2020-03-01 00:00:03,2020-03-01 00:00:05,2.0,"(2020, 3)"
1,A0,VEL-,2020-03-01 00:00:07,2020-03-01 00:00:08,1.0,"(2020, 3)"
2,A0,VEL-,2020-03-01 00:00:10,2020-03-01 00:00:11,1.0,"(2020, 3)"
3,A0,VEL-,2020-03-01 00:00:18,2020-03-01 00:00:20,2.0,"(2020, 3)"
4,A0,VEL-,2020-03-01 00:00:22,2020-03-01 00:00:23,1.0,"(2020, 3)"
...,...,...,...,...,...,...
12541106,A91,LLL,2020-01-31 14:40:33,2020-01-31 14:41:04,31.0,"(2020, 1)"
12541107,A91,LLL,2020-01-31 14:42:00,2020-01-31 14:42:19,19.0,"(2020, 1)"
12541108,A91,LLL,2020-01-31 15:08:44,2020-01-31 15:09:07,23.0,"(2020, 1)"
12541109,A91,LLL,2020-01-31 15:55:10,2020-01-31 15:57:42,152.0,"(2020, 1)"


In [23]:
source2count = dict(df_alarms_new["SourceName"].value_counts())
print(len(source2count))

124


In [24]:
df_rnn = removeChatteringAlarms(df_alarms_new,chattering_timedelta_threshold=60*3)
df_rnn

Unnamed: 0,SourceName,Condition,StartTime,EndTime,TimeDelta,Year-Month
0,A0,VEL-,2019-04-26 18:51:22,2019-04-26 18:51:29,7.0,"(2019, 4)"
1,A0,VEL-,2019-04-26 18:54:31,2019-04-26 18:54:33,2.0,"(2019, 4)"
2,A0,VEL-,2019-04-26 18:57:36,2019-04-26 18:57:37,1.0,"(2019, 4)"
3,A0,VEL-,2019-04-26 19:00:37,2019-04-26 19:00:38,1.0,"(2019, 4)"
4,A0,VEL-,2019-04-26 19:03:48,2019-04-26 19:03:49,1.0,"(2019, 4)"
...,...,...,...,...,...,...
559848,A2478,ALM,2019-12-07 08:45:57,2019-12-07 08:46:41,44.0,"(2019, 12)"
559849,A2478,ALM,2019-12-07 08:49:36,2019-12-07 08:49:38,2.0,"(2019, 12)"
559850,A2478,ALM,2019-12-07 11:40:58,2019-12-07 11:40:59,1.0,"(2019, 12)"
559851,A2478,ALM,2019-12-07 12:52:26,2019-12-07 12:56:36,250.0,"(2019, 12)"


In [28]:
source2count = dict(df_rnn["SourceName"].value_counts())
print(source2count)

{'A1': 135596, 'A0': 135210, 'A4': 57380, 'A727': 14471, 'A726': 13478, 'A42': 12542, 'A723': 12503, 'A725': 11648, 'A1471': 8868, 'A734': 8374, 'A733': 8372, 'A724': 8367, 'A9': 7940, 'A156': 6701, 'A43': 6582, 'A17': 6275, 'A6': 4838, 'A8': 4562, 'A243': 3819, 'A50': 3762, 'A59': 3548, 'A60': 3366, 'A64': 3038, 'A475': 2935, 'A1358': 2739, 'A15': 2252, 'A154': 2184, 'A77': 1991, 'A57': 1983, 'A75': 1973, 'A56': 1959, 'A99': 1947, 'A98': 1946, 'A40': 1698, 'A69': 1601, 'A89': 1542, 'A7': 1362, 'A138': 1341, 'A55': 1298, 'A49': 1206, 'A113': 1160, 'A18': 1143, 'A66': 1121, 'A1723': 1030, 'A165': 981, 'A67': 922, 'A730': 902, 'A729': 888, 'A27': 870, 'A760': 868, 'A229': 867, 'A29': 860, 'A74': 858, 'A19': 858, 'A36': 840, 'A114': 805, 'A1067': 803, 'A91': 776, 'A78': 773, 'A788': 767, 'A88': 764, 'A26': 752, 'A22': 738, 'A266': 737, 'A102': 711, 'A13': 710, 'A287': 703, 'A23': 701, 'A95': 698, 'A115': 692, 'A170': 682, 'A83': 678, 'A327': 661, 'A71': 649, 'A35': 644, 'A103': 644, 'A163

In [31]:
df_rnn = (lambda df, sources_filter: df[(~df["SourceName"].isin(sources_filter))])(df_rnn, ['A1','A0','A4'])
source2count = dict(df_rnn["SourceName"].value_counts())
print(source2count)

{'A727': 14471, 'A726': 13478, 'A42': 12542, 'A723': 12503, 'A725': 11648, 'A1471': 8868, 'A734': 8374, 'A733': 8372, 'A724': 8367, 'A9': 7940, 'A156': 6701, 'A43': 6582, 'A17': 6275, 'A6': 4838, 'A8': 4562, 'A243': 3819, 'A50': 3762, 'A59': 3548, 'A60': 3366, 'A64': 3038, 'A475': 2935, 'A1358': 2739, 'A15': 2252, 'A154': 2184, 'A77': 1991, 'A57': 1983, 'A75': 1973, 'A56': 1959, 'A99': 1947, 'A98': 1946, 'A40': 1698, 'A69': 1601, 'A89': 1542, 'A7': 1362, 'A138': 1341, 'A55': 1298, 'A49': 1206, 'A113': 1160, 'A18': 1143, 'A66': 1121, 'A1723': 1030, 'A165': 981, 'A67': 922, 'A730': 902, 'A729': 888, 'A27': 870, 'A760': 868, 'A229': 867, 'A29': 860, 'A74': 858, 'A19': 858, 'A36': 840, 'A114': 805, 'A1067': 803, 'A91': 776, 'A78': 773, 'A788': 767, 'A88': 764, 'A26': 752, 'A22': 738, 'A266': 737, 'A102': 711, 'A13': 710, 'A287': 703, 'A23': 701, 'A95': 698, 'A115': 692, 'A170': 682, 'A83': 678, 'A327': 661, 'A71': 649, 'A35': 644, 'A103': 644, 'A163': 639, 'A51': 629, 'A1675': 627, 'A265':

In [42]:
duration_from_1_seq_to_next = 60*15 # duration in seconds
filter_short_seq = 5 # remove the sequence whose size is less than 4
li_of_seqs,max_seq_len = getSequenceOfWholeData(df_rnn,duration_from_1_seq_to_next,filter_short_seq)
print(len(li_of_seqs))
print(li_of_seqs[:2])

>> Duration to next seq: 900, ignore seq len: 5
18724
[['A17', 'A75', 'A17', 'A57', 'A17', 'A98', 'A99', 'A56'], ['A50', 'A59', 'A60', 'A64', 'A726', 'A9', 'A725', 'A726', 'A726', 'A243', 'A725']]


In [43]:
def removeSameAlarms(seq):
    new_seq = []

    new_seq.append(seq[0])

    for a in seq:

        if a == new_seq[-1]:
            continue
        new_seq.append(a)
    
    return new_seq

print(removeSameAlarms(['A37', 'A10', 'A28', 'A10', 'A10', 'A48', 'A49', 'A28']))

seq_len_2_count = {}

with open("/home/waris/Github/predict-future-alarms/.data/seqs.tokens","w") as f:
    for seq in li_of_seqs:
        # seq = removeSameAlarms(seq)
        l = len(seq)
        seq_len_2_count[l] = 1+seq_len_2_count.get(l,0)
        
        if l >filter_short_seq:
            f.write(f"{' '.join(seq)}\n")

seq_len_2_count = {k:v for k,v in sorted(seq_len_2_count.items(), key=lambda t: t[1] )}
seq_len_2_count

['A37', 'A10', 'A28', 'A10', 'A48', 'A49', 'A28']


{71: 1,
 68: 1,
 70: 1,
 60: 1,
 55: 1,
 45: 1,
 57: 1,
 44: 1,
 38: 1,
 42: 2,
 54: 2,
 58: 3,
 46: 3,
 49: 3,
 51: 3,
 47: 3,
 48: 3,
 40: 4,
 50: 4,
 53: 5,
 52: 6,
 36: 7,
 39: 7,
 37: 11,
 35: 12,
 34: 14,
 33: 21,
 32: 30,
 31: 37,
 30: 45,
 29: 63,
 28: 86,
 27: 114,
 26: 117,
 25: 190,
 24: 196,
 23: 263,
 22: 305,
 21: 337,
 19: 341,
 20: 382,
 17: 407,
 18: 421,
 16: 482,
 15: 562,
 14: 639,
 13: 738,
 12: 875,
 11: 1095,
 10: 1289,
 9: 1446,
 8: 1732,
 5: 1951,
 7: 2179,
 6: 2280}

In [57]:
seq_len_2_count = {}
avg_seq_len = 14
ignore_seq_len = 22
with open("/home/waris/Github/predict-future-alarms/.data/seqs.tokens","w") as f:
    for seq in li_of_seqs:
        # seq = removeSameAlarms(seq)
        l = len(seq)
        # print(seq)
        # break
        
        if l <=ignore_seq_len:
            seq_len_2_count[l] = 1+seq_len_2_count.get(l,0)
            if l<avg_seq_len:
                seq = (avg_seq_len-l)*['<pad>'] + seq
            elif l>avg_seq_len:
                seq = seq[:avg_seq_len]

            f.write(f"{' '.join(seq)}\n")

seq_len_2_count = {k:v for k,v in sorted(seq_len_2_count.items(), key=lambda t: t[1] )}
seq_len_2_count

{22: 305,
 21: 337,
 19: 341,
 20: 382,
 17: 407,
 18: 421,
 16: 482,
 15: 562,
 14: 639,
 13: 738,
 12: 875,
 11: 1095,
 10: 1289,
 9: 1446,
 8: 1732,
 5: 1951,
 7: 2179,
 6: 2280}

In [52]:
sum(seq_len_2_count.keys())/len(seq_len_2_count.keys())

13.5

In [54]:
l = 5*['5']

In [55]:
l

['5', '5', '5', '5', '5']