**Import modules**

In [1]:
import pandas as pd
import datetime
from pandas.io import json
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
spark.conf.set("spark.sql.caseSensitive", "true")

**Import dataset**

In [2]:
path = "dataset/aptsimulator_cobaltstrike_2021-06-11T21081492.json"
df = json.read_json(path, lines=True)
df2 = spark.read.json(path)

**Initial view of the the dataset**

In [3]:
df.head()

Unnamed: 0,SourceName,ProviderGuid,Level,Keywords,Channel,Hostname,TimeCreated,@timestamp,EventID,Message,...,ServiceType,ServiceStartType,ServiceAccount,ClientProcessStartKey,ClientProcessId,ImagePath,StartType,AccountName,param1,param2
0,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.635Z,2021-06-11T09:07:15.635Z,1,Process Create:\r\nRuleName: -\r\nUtcTime: 202...,...,,,,,,,,,,
1,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,10,Process accessed:\r\nRuleName: -\r\nUtcTime: 2...,...,,,,,,,,,,
2,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,10,Process accessed:\r\nRuleName: -\r\nUtcTime: 2...,...,,,,,,,,,,
3,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,7,Image loaded:\r\nRuleName: -\r\nUtcTime: 2021-...,...,,,,,,,,,,
4,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,7,Image loaded:\r\nRuleName: -\r\nUtcTime: 2021-...,...,,,,,,,,,,


In [27]:
def timeconvert(t):
    result = datetime.datetime.strptime(t, "%Y-%m-%dT%H:%M:%S.%fZ")
    return result.strftime("%d %b %Y")

In [28]:
df['TimeCreated'] = df['TimeCreated'].apply(timeconvert)

In [29]:
df['TimeCreated']

0       11 Jun 2021
1       11 Jun 2021
2       11 Jun 2021
3       11 Jun 2021
4       11 Jun 2021
           ...     
2606    11 Jun 2021
2607    11 Jun 2021
2608    11 Jun 2021
2609    11 Jun 2021
2610    11 Jun 2021
Name: TimeCreated, Length: 2611, dtype: object

**View all columns/features in raw dataset

In [30]:
df.columns

Index(['SourceName', 'ProviderGuid', 'Level', 'Keywords', 'Channel',
       'Hostname', 'TimeCreated', '@timestamp', 'EventID', 'Message',
       ...
       'ServiceStartType', 'ServiceAccount', 'ClientProcessStartKey',
       'ClientProcessId', 'ImagePath', 'StartType', 'AccountName', 'param1',
       'param2', 'unix'],
      dtype='object', length=127)

In [10]:
print(df.columns.tolist())

['SourceName', 'ProviderGuid', 'Level', 'Keywords', 'Channel', 'Hostname', 'TimeCreated', '@timestamp', 'EventID', 'Message', 'Task', 'RuleName', 'UtcTime', 'ProcessGuid', 'ProcessId', 'Image', 'FileVersion', 'Description', 'Product', 'Company', 'OriginalFileName', 'CommandLine', 'CurrentDirectory', 'User', 'LogonGuid', 'LogonId', 'TerminalSessionId', 'IntegrityLevel', 'Hashes', 'ParentProcessGuid', 'ParentProcessId', 'ParentImage', 'ParentCommandLine', 'SourceProcessGUID', 'SourceProcessId', 'SourceThreadId', 'SourceImage', 'TargetProcessGUID', 'TargetProcessId', 'TargetImage', 'GrantedAccess', 'CallTrace', 'ImageLoaded', 'Signed', 'Signature', 'SignatureStatus', 'TargetFilename', 'IsExecutable', 'CreationUtcTime', 'EventType', 'TargetObject', 'Details', 'PipeName', 'Device', 'Protocol', 'Initiated', 'SourceIsIpv6', 'SourceIp', 'SourceHostname', 'SourcePort', 'SourcePortName', 'DestinationIsIpv6', 'DestinationIp', 'DestinationHostname', 'DestinationPort', 'DestinationPortName', 'Proce

#### Notes:

- Work on discretizing, normalizing and think of how to create features from the columns and have questions about the data

#### Findings:

- Trying to find columns similar to LANL Denign AD Logs & PurpleSharp AD Log Playbook for Lateral Movement

'SourceName', <br>
'Hostname', <br>
'TimeCreated', <-- 'EventTime' (LANL) <br>
'DestinationIsIpv6', 'DestinationIp', 'DestinationHostname', 'DestinationPort', 'DestinationPortName'; <-- (Look into these) <br>
'SubjectDomainName', <br>
'EventID', <br>
'EventType', <br>
'LogonId', <br>
'ParentProcessId', <br>
'ParentProcessName', <br>
'ProcessID', <br>
'ProcessName', <br>
'NewProcessId', <br>
'NewProcessName', <br>
'ServiceName', <br>
'SubjectUserSid', <br>
'SubjectUserName', <br>
'SubjectDomainName', <br>
'SubjectLogonId', <br>
'Status' <br>

'AccessReason', 'Task', 'Message', 'Description', 'Details' <-- (Look into these)

In [7]:
df_cobalt = df[['EventID', 'EventType', 'LogonId', 'Hostname', 'SubjectUserName', 'SubjectDomainName', 'SubjectLogonId', 
                'Status', 'SourceName', 'ServiceName', 'DestinationHostname', 'ProcessName', 'ProcessID', 'ParentProcessName',
                'ParentProcessId']]

In [8]:
df_cobalt.head()

Unnamed: 0,EventID,EventType,LogonId,Hostname,SubjectUserName,SubjectDomainName,SubjectLogonId,Status,SourceName,ServiceName,DestinationHostname,ProcessName,ProcessID,ParentProcessName,ParentProcessId
0,1,,0x3719fb,WORKSTATION5,,,,,Microsoft-Windows-Sysmon,,,,,,5632.0
1,10,,,WORKSTATION5,,,,,Microsoft-Windows-Sysmon,,,,,,
2,10,,,WORKSTATION5,,,,,Microsoft-Windows-Sysmon,,,,,,
3,7,,,WORKSTATION5,,,,,Microsoft-Windows-Sysmon,,,,,,
4,7,,,WORKSTATION5,,,,,Microsoft-Windows-Sysmon,,,,,,


In [9]:
df_cobalt.isnull().all(0)

EventID                False
EventType              False
LogonId                False
Hostname               False
SubjectUserName        False
SubjectDomainName      False
SubjectLogonId         False
Status                 False
SourceName             False
ServiceName            False
DestinationHostname    False
ProcessName            False
ProcessID              False
ParentProcessName      False
ParentProcessId        False
dtype: bool

In [10]:
for col in list(df_cobalt):
    print(col)
    print(df_cobalt[col].unique())

EventID
[   1   10    7    5   26   11   13   12   17    9   18    3 5156 5158
 4689 4688 4690 4658 4656 4663 4703 4670 5140 5145 4697 7045 7009 7000]
EventType
[nan 'SetValue' 'CreateKey' 'CreatePipe' 'DeleteKey' 'ConnectPipe']
LogonId
['0x3719fb' nan '0x3e7']
Hostname
['WORKSTATION5']
SubjectUserName
[nan 'APT-Simulator' 'WORKSTATION5$']
SubjectDomainName
[nan 'WORKSTATION5' 'WORKGROUP']
SubjectLogonId
[nan '0x3719fb' '0x3e7']
Status
[nan '0x0' '0x1' '0x2' '0x41d' '0x426' '0x80' '0x7']
SourceName
['Microsoft-Windows-Sysmon' 'Microsoft-Windows-Security-Auditing'
 'Service Control Manager']
ServiceName
[nan 'tbbd05']
DestinationHostname
[nan '-']
ProcessName
[nan 'C:\\Windows\\System32\\PING.EXE'
 'C:\\Users\\APT-Simulator\\Documents\\APTSimulator-master\\helpers\\7z.exe'
 'C:\\Windows\\System32\\conhost.exe' 'C:\\Windows\\System32\\timeout.exe'
 'C:\\TMP\\CreateNamedPipe.exe' 'C:\\Windows\\System32\\taskkill.exe'
 'C:\\Windows\\System32\\svchost.exe'
 'C:\\Windows\\System32\\wbem\\Wmi

In [11]:
df_cobalt.groupby(by=['ServiceName']).get_group('tbbd05')

Unnamed: 0,EventID,EventType,LogonId,Hostname,SubjectUserName,SubjectDomainName,SubjectLogonId,Status,SourceName,ServiceName,DestinationHostname,ProcessName,ProcessID,ParentProcessName,ParentProcessId
2500,4697,,,WORKSTATION5,APT-Simulator,WORKSTATION5,0x3719fb,,Microsoft-Windows-Security-Auditing,tbbd05,,,,,5632.0
2608,7045,,,WORKSTATION5,,,,,Service Control Manager,tbbd05,,,,,


In [12]:
df2.groupby('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|   0x0|   26|
|  null| 2575|
| 0x426|    1|
| 0x41d|    1|
|   0x1|    4|
|   0x2|    2|
|  0x80|    1|
|   0x7|    1|
+------+-----+



In [13]:
df2.groupby('Task').count().show()

+-----+-----+
| Task|count|
+-----+-----+
|12811|   14|
|    7|  911|
|   11|   13|
|12808|    1|
|12289|    1|
|13312|   36|
|    3|    2|
|13313|   36|
|    0|    3|
|12810|   35|
|    5|   36|
|12802|   20|
|   18|    1|
|13317|   22|
|   17|    5|
|   26|   18|
|13570|    1|
|    9|    1|
|    1|   37|
|   10| 1094|
+-----+-----+
only showing top 20 rows



In [15]:
df2.groupby('SourceName').count().show()

+--------------------+-----+
|          SourceName|count|
+--------------------+-----+
|Service Control M...|    3|
|Microsoft-Windows...| 2264|
|Microsoft-Windows...|  344|
+--------------------+-----+



In [17]:
df2.groupby('Hostname').count().show()

+------------+-----+
|    Hostname|count|
+------------+-----+
|WORKSTATION5| 2611|
+------------+-----+



In [25]:
df2.groupby('EventID').count().show()

+-------+-----+
|EventID|count|
+-------+-----+
|     26|   18|
|      7|  911|
|   4656|   41|
|   7009|    1|
|   7000|    1|
|      9|    1|
|     17|    5|
|      5|   36|
|      1|   37|
|   4697|    1|
|     10| 1094|
|   4663|   34|
|      3|    2|
|     12|   56|
|   7045|    1|
|     11|   13|
|   5158|    9|
|   4688|   36|
|   5140|    1|
|     13|   90|
+-------+-----+
only showing top 20 rows



In [31]:
df2.groupby('LogonId').count().show()                

+--------+-----+
| LogonId|count|
+--------+-----+
|    null| 2574|
|   0x3e7|    5|
|0x3719fb|   32|
+--------+-----+



In [19]:
df2.groupby('Channel').count().show()

+--------------------+-----+
|             Channel|count|
+--------------------+-----+
|            Security|  344|
|Microsoft-Windows...| 2264|
|              System|    3|
+--------------------+-----+



In [20]:
df2.groupby('Keywords').count().show()

+------------------+-----+
|          Keywords|count|
+------------------+-----+
|0x8020000000000000|  344|
|0x8080000000000000|    3|
|0x8000000000000000| 2264|
+------------------+-----+



In [21]:
df2.groupby('Message').count().show()

+--------------------+-----+
|             Message|count|
+--------------------+-----+
|Process accessed:...|    1|
|Process accessed:...|    1|
|Process accessed:...|    1|
|Process terminate...|    1|
|Process accessed:...|    1|
|Image loaded:
Ru...|    1|
|Image loaded:
Ru...|    1|
|Process accessed:...|    1|
|Process accessed:...|    1|
|A handle to an ob...|    1|
|An attempt was ma...|    1|
|A handle to an ob...|    1|
|Process terminate...|    1|
|Process accessed:...|    1|
|Process accessed:...|    1|
|Process accessed:...|    1|
|Process accessed:...|    1|
|Registry object a...|    1|
|Image loaded:
Ru...|    1|
|Image loaded:
Ru...|    1|
+--------------------+-----+
only showing top 20 rows



In [29]:
df2.groupby('SourceName','EventID').count().show()

+--------------------+-------+-----+
|          SourceName|EventID|count|
+--------------------+-------+-----+
|Microsoft-Windows...|      5|   36|
|Microsoft-Windows...|   4663|   34|
|Microsoft-Windows...|     12|   56|
|Microsoft-Windows...|      9|    1|
|Microsoft-Windows...|     13|   90|
|Microsoft-Windows...|     26|   18|
|Microsoft-Windows...|   4689|   36|
|Microsoft-Windows...|   4688|   36|
|Microsoft-Windows...|   5140|    1|
|Microsoft-Windows...|   5156|   26|
|Microsoft-Windows...|      1|   37|
|Microsoft-Windows...|   4670|    1|
|Microsoft-Windows...|   4690|   41|
|Service Control M...|   7000|    1|
|Service Control M...|   7045|    1|
|Service Control M...|   7009|    1|
|Microsoft-Windows...|     10| 1094|
|Microsoft-Windows...|   4656|   41|
|Microsoft-Windows...|   4658|   82|
|Microsoft-Windows...|     17|    5|
+--------------------+-------+-----+
only showing top 20 rows



In [34]:
df2.groupby('SourceName','LogonId').count().show()

+--------------------+--------+-----+
|          SourceName| LogonId|count|
+--------------------+--------+-----+
|Microsoft-Windows...|   0x3e7|    5|
|Service Control M...|    null|    3|
|Microsoft-Windows...|    null| 2227|
|Microsoft-Windows...|0x3719fb|   32|
|Microsoft-Windows...|    null|  344|
+--------------------+--------+-----+

