In [1]:
import json
import pandas as pd
from datetime import datetime, timedelta
from dateutil.parser import parse
from pathlib import Path
import glob

In [6]:
# format csvs

class CSV2Alarms:
    """
        Summary:
        Record mean 1 row of the csv file. 
        This class converts records into alarms. 
    """
    def __init__(self,config):
        self.config = config
        print(f'>>Input file: {self.config["dir"]+self.config["in_fname"]}')
        self.df = pd.read_csv(self.config["dir"]+self.config["in_fname"], usecols=self.config["cols"], delimiter= "," ,encoding = "ISO-8859-1")
        # self.df.columns = [''.join(e for e in col if e.isalnum()) for col in self.df.columns]
        # self.df.columns = [col.replace("ï","") for col in self.df.columns]
        
    def __changeDate(self,d):
        d = d.replace(".000000000","")
        d = d.replace("/","-")
        return parse(d)

    def __getMessageType(self,message):
        if message.find("Recover") != -1:
            return "Recover"
        elif message.find("NR") != -1:
            return "NR"
        else:
            return "Activation"
    
    def __getAlarmsFromDFs(self,df_start, df_end):
        alarms = []
        start_records = [v for v in sorted(df_start.to_dict(
            orient="records"), key=lambda arg: arg["EventTime"], reverse=False)]
        end_records = [v for v in sorted(df_end.to_dict(
            orient="records"), key=lambda arg: arg["EventTime"], reverse=False)]
        i = 0
        j = 0
        # print("End len",len(end_records), "Start len", len(start_records))
        while j < len(end_records):
            # print(i,j)
            if len(start_records)>0 and end_records[j]["EventTime"] < start_records[i]["EventTime"]:
                j += 1
            else:
                break

        while i < len(start_records):
            
            if j <len(end_records) and start_records[i]["EventTime"] <= end_records[j]["EventTime"]:
                if i+1 < len(start_records) and start_records[i+1]["EventTime"] < end_records[j]["EventTime"]: # check for the next record
                    i += 1
                    continue
                alarm = {k: v for k, v in start_records[i].items()}
                alarm["StartTime"] = alarm["EventTime"]
                alarm["EndTime"] = end_records[j]["EventTime"]
                alarm["EndMessage"] = end_records[j]["Message"]
                del alarm["EventTime"]
                alarms.append(alarm)
                j += 1
            elif j <len(end_records) and start_records[i]["EventTime"] > end_records[j]["EventTime"]:
                j +=1
                continue   
                
            i += 1

        return alarms
    
    def __convertRecordsToAlarmsV1(self,df_source):
        alarms = []
        for condition in df_source["Condition"].unique():
            df_condition = df_source.loc[df_source['Condition'].isin([condition])]
            df_start = df_condition.loc[df_condition['MessageType'].isin([
                                                                        "Activation"])]
            end_types = [t for t in df_condition["MessageType"].unique() if t !=
                    "Activation"]
            # print(types)
            df_end = df_condition.loc[df_condition['MessageType'].isin(end_types)]
            alarms += self.__getAlarmsFromDFs(df_start, df_end)
        return alarms
    
    def __convertRecordsToAlarmsOld(self,records):
        """ Convert records from the same source to proper alarms with start and end time.   

            The record which contains "Recover" or "NR" in the Message column shows the deactivations. 

        Parameters
        ----------
        records : list of dict
            Each dict represent either activation of an alarm or deactivation of an alarm.  

        Returns
        -------
        alarms : list of dict
            Each dict in the list is an alarm with the StartTime and EndTime of an alarm. 
        """
        alarms = []  # conainsts alarms with start and end time.
        # for enqueue and deque of records., Needed dictionary because there can be multiple types of alarms from the same source.
        conditions_queues = {}
        alarm = None  # dictionary
        records = [v for v in sorted(
            records, key=lambda arg: arg["EventTime"], reverse=False)]
        for record in records:

            # initiazlize the queue
            if conditions_queues.get(record["Condition"]) == None:
                conditions_queues[record["Condition"]] = []

            # Enqueue the record
            if record["Message"].find("Recover") == -1 and record["Message"].find("NR") == -1:
                conditions_queues[record["Condition"]].append(record)
            else:
                if len(conditions_queues[record["Condition"]]) == 0:
                    continue

                alarm = conditions_queues[record["Condition"]].pop(
                    0)  # Dqueue the record
                alarm = {k: v for k, v in alarm.items()}
                alarm["StartTime"] = alarm["EventTime"]
                alarm["EndTime"] = record["EventTime"]
                alarm["EndMessage"] = record["Message"]
                del alarm["EventTime"]
                alarms.append(alarm)

        return alarms
    
    def formatCSV(self):
        print(">>Column  Types: ", end="")
        for col in self.df.columns:
            print(col, type(self.df[col][0]), end=", ")
            if isinstance(self.df[col][0],str):
                try:
                    self.df[col] = self.df[col].apply(lambda s: " ".join(s.split()))
                except Exception as e:
                    print(f"\n\n !!!!!!!!!!!!!!!!!! Excetpion {e} !!!!!!!!!!!!!!!!!!!!!!!!!!")

        print(type(self.df["EventTime"][0]))
        self.df["EventTime"] = self.df["EventTime"].apply(self.__changeDate)
        self.df["MessageType"] = self.df["Message"].apply(self.__getMessageType)
        # self.df["Month"] = self.df["EventTime"].apply(lambda arg: arg.month)
        print(f">>Before Filtering ACKS: {self.df.shape}")
        self.df = self.df.loc[self.df['Message'].map(lambda arg: arg.find(self.config["ack-filter"])) == -1] 
        print(f">>After Filtering ACKS: {self.df.shape}") 
        

        fpath = self.config["dir"] + self.config["formated_fname"]
        
        self.df.to_csv(fpath, index=False)
        self.df = pd.read_csv(fpath, low_memory=False, parse_dates=["EventTime"])
        print(f">> Formating is complete. Outfile: {fpath}")
        return self.df

    def convertRecords2Alarms(self,df):
        # df = pd.read_csv(p, low_memory=False, usecols=cols,parse_dates=["EventTime"])

        assert len(df["MachineName"].unique()) == 1 # all the alarms should be related to the same unit
        
        alarms = []
        differs = []
        sources_ranks_dict = df['SourceName'].value_counts()
        id = 0 # for debugging
        for sname in sources_ranks_dict.keys():
            id += 1
            df_sname = df.loc[df['SourceName'].isin([sname])] # source DF
            types_rank_dict = df_sname["MessageType"].value_counts() # source ranks
            total = 0
            for key in types_rank_dict.keys():
                total += types_rank_dict[key]
            assert(total== sources_ranks_dict[sname]) # sum is equal to count 
            
            source_alarms = self.__convertRecordsToAlarmsV1(df_sname)
            alarms += source_alarms
        
            print(f"[{id}]Source:{sname}, Conditions:{df_sname['Condition'].unique()}, Total Alarms:{len(source_alarms)}")

        ##comparing it with older algo
            # temp_alarms2 = self.__convertRecordsToAlarmsOld(df_sname.to_dict(orient="records"))
            # if len(source_alarms)-len(temp_alarms2) != 0:
            #     print(">>[{}]Source: {},Conditions:{}".format(id,sname, df_sname["Condition"].unique()), end="=>")
            #     print("ALARMS1:{},Alarms2:{},Diff(new-old):{}".format(len(source_alarms),len(temp_alarms2), len(source_alarms)-len(temp_alarms2)),end="")
            #     print("")

        
            # if (len(source_alarms) != len(temp_alarms2)):
            #     differs.append(sname)
        

        # print(">> Difference in 2 Algos",differs,len(differs))

        # writing to ouptut alarms
        df_out = pd.DataFrame(alarms)
        df_out["TimeDelta"] = df_out[["StartTime", "EndTime"]].apply(lambda arg: timedelta.total_seconds(arg[1]-arg[0]) , axis=1)
        df_out["Year-Month"] =df_out["StartTime"].apply(lambda arg: (arg.year,arg.month))

        file_path = self.config["dir"]+self.config["alarm_out_fname"] 
        df_out.to_csv(file_path, index = False)
        print(f">>Conversion from records to alarms is complete. Outputfile : {fiel_path}, Info : {df_out.info()}")
        return df_out


In [3]:

ACK_FILTER = "ACK"
config = {
    "dir": "../.data/csvs/",
    "in_fname": "raw/processed_2018.csv",
    "formated_fname": "raw/formatted_"+ "processed_2018.csv",
    "alarm_out_fname": "alarms/alarms_"+"processed_2018.csv",
    "ack-filter":ACK_FILTER,
    'cols':["MachineName","SourceName","EventTime", "Message","Condition"]

}
alarm  = CSV2Alarms(config)
df = alarm.formatCSV()
# df = alarm.

>>Input file: ../.data/csvs/raw/processed_2018.csv
>>Column  Types: MachineName <class 'str'>, SourceName <class 'str'>, EventTime <class 'str'>, Message <class 'str'>, Condition <class 'str'>, <class 'str'>
>>Before Filtering ACKS: (8636603, 7)
>>After Filtering ACKS: (8525505, 7)
>> Formating is complete. Outfile: ../.data/csvs/raw/formatted_processed_2018.csv


In [4]:
df2 = df

In [7]:
alarm2 = CSV2Alarms(config)
df_alarms = alarm2.convertRecords2Alarms(df2)
df_alarms

I'], Total Alarms:1
[3980]Source:47BA1501F-ANN, Conditions:['ALM'], Total Alarms:0
[3981]Source:47TI948K-BYP, Conditions:['OVR'], Total Alarms:1
[3982]Source:47VSHH601-BYP, Conditions:['PWON'], Total Alarms:1
[3983]Source:47TSHH720D-BYP, Conditions:['PWON'], Total Alarms:1
[3984]Source:47TI938D-BYP, Conditions:['OVR'], Total Alarms:1
[3985]Source:47PSL850-BYP, Conditions:['PWON'], Total Alarms:1
[3986]Source:48PALL2004-ANN, Conditions:['ALM'], Total Alarms:0
[3987]Source:47TI2080B-BYP, Conditions:['OVR'], Total Alarms:1
[3988]Source:47TI1576, Conditions:['IOP'], Total Alarms:1
[3989]Source:47BA1501G-ANN, Conditions:['ALM'], Total Alarms:0
[3990]Source:01UA003-ANN, Conditions:['ALM'], Total Alarms:0
[3991]Source:47ZAHH500-ANN, Conditions:['ALM'], Total Alarms:0
[3992]Source:47XA035-ANN, Conditions:['ALM'], Total Alarms:0
[3993]Source:47BA001B-ANN, Conditions:['ALM'], Total Alarms:0
[3994]Source:47TI1550, Conditions:['IOP'], Total Alarms:1
[3995]Source:47HSD-057HH-ANN, Conditions:['ALM']

NameError: name 'timedelta' is not defined