In [1]:
import pandas as pd
import re

We will clean our new samples of 'bad login' and 'good login' in 3 simple steps:
1. Open the new samples.
2. Extract our wanted features, as we learnt from the previous samples.
3. Save them as CSV files.

In [2]:
LogName = []
Date = []
Time = []
ComputerName = []
SourceName = []
TaskCategory = []
EventType = []
Message = []
Keywords = []
OpCode = []
EventCode = []
Type = []
RecordNumber = []

for log_name in ['bad_login', 'good_login']:
    df = pd.read_csv(log_name + '.csv')
    for i in range(0,len(df)):
        current_log = df.iloc[i]['_raw']
        Date.append(current_log[0:10])
        Time.append(current_log[11:22])
        LogName.append(re.search('LogName=(.*)\n', current_log).group(1))
        ComputerName.append(re.search('ComputerName=(.*)\n', current_log).group(1))
        SourceName.append(re.search('SourceName=(.*)\n', current_log).group(1))
        TaskCategory.append(re.search('TaskCategory=(.*)\n', current_log).group(1))
        EventType.append(re.search('EventType=(.*)\n', current_log).group(1))
        Keywords.append(re.search('Keywords=(.*)\n', current_log).group(1))
        OpCode.append(re.search('OpCode=(.*)\n', current_log).group(1))
        EventCode.append(re.search('EventCode=(.*)\n', current_log).group(1))
        Type.append(re.search('Type=(.*)\n', current_log).group(1))
        RecordNumber.append(re.search('RecordNumber=(.*)\n', current_log).group(1))
        Message.append(re.search('Message=(.*)', current_log).group(1))

    df_cleaned = pd.DataFrame({ 'LogName' : LogName,
                            'Date' : Date,
                            'Time' : Time,
                            'ComputerName' : ComputerName,
                            'SourceName' : SourceName,
                            'TaskCategory' : TaskCategory,
                            'EventType' : EventType,
                            'Keywords' : Keywords,
                            'OpCode' : OpCode,
                            'EventCode' : EventCode,
                            'Type' : Type,
                            'RecordNumber' : RecordNumber,
                            'Message' : Message })
    
    df_cleaned['EventType'] = df_cleaned['EventType'].astype(int)
    df_cleaned['EventCode'] = df_cleaned['EventCode'].astype(int)
    df_cleaned['Type'] = df_cleaned['Type'].astype(int)
    df_cleaned['RecordNumber'] = df_cleaned['RecordNumber'].astype(int)
    df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], format='%m/%d/%Y')
    df_cleaned['Time'] = pd.to_datetime(df_cleaned['Time'], format='%I:%M:%S %p').dt.time

    df_cleaned.to_csv(log_name + '_cleaned.csv', index=False)

#### Let's check how a <b>'bad'</b> and a <b>'good'</b> login should look like:

We know from reading on the internet that the logs should look like this:<br>
<u>EventCode 4624:</u> An account was successfully logged on.<br>
<u>EventCode 4625:</u> An account failed to log on.
<br><br>
but let's confirm that it's the case:

In [3]:
bad_login_cleaned = pd.read_csv('bad_login_cleaned.csv')
good_login_cleaned = pd.read_csv('good_login_cleaned.csv')

In [4]:
bad_login_cleaned[bad_login_cleaned['EventCode'] == 4625].iloc[0:1]

Unnamed: 0,LogName,Date,Time,ComputerName,SourceName,TaskCategory,EventType,Keywords,OpCode,EventCode,Type,RecordNumber,Message
0,Security,2023-09-26,18:01:50,CLIENT-PC,Microsoft Windows security auditing.,Logon,0,Audit Failure,Info,4625,0,324891,An account failed to log on.\r


In [5]:
good_login_cleaned[good_login_cleaned['EventCode'] == 4624].iloc[0:1]

Unnamed: 0,LogName,Date,Time,ComputerName,SourceName,TaskCategory,EventType,Keywords,OpCode,EventCode,Type,RecordNumber,Message
22,Security,2023-09-26,18:16:11,CLIENT-PC,Microsoft Windows security auditing.,Logon,0,Audit Success,Info,4624,0,336576,An account was successfully logged on.


#### <u>Conclution:</u> As we can see, we got the codes right and now we can safely continue to create our alerts system.