**Import modules**

In [1]:
import pandas as pd
import datetime
import time
from pandas.io import json
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import *
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
spark.conf.set("spark.sql.caseSensitive", "true")
sqlContext = SQLContext(SparkContext.getOrCreate())
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

**Import dataset**

In [2]:
path = "../data/raw/aptsimulator_cobaltstrike_2021-06-11T21081492.json"
df = pd.read_json(path,lines=True)
df2 = spark.read.json(path)

**Initial view of the the dataset**

In [3]:
df.head()

Unnamed: 0,SourceName,ProviderGuid,Level,Keywords,Channel,Hostname,TimeCreated,@timestamp,EventID,Message,...,ServiceType,ServiceStartType,ServiceAccount,ClientProcessStartKey,ClientProcessId,ImagePath,StartType,AccountName,param1,param2
0,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.635Z,2021-06-11T09:07:15.635Z,1,Process Create:\r\nRuleName: -\r\nUtcTime: 202...,...,,,,,,,,,,
1,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,10,Process accessed:\r\nRuleName: -\r\nUtcTime: 2...,...,,,,,,,,,,
2,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,10,Process accessed:\r\nRuleName: -\r\nUtcTime: 2...,...,,,,,,,,,,
3,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,7,Image loaded:\r\nRuleName: -\r\nUtcTime: 2021-...,...,,,,,,,,,,
4,Microsoft-Windows-Sysmon,{5770385f-c22a-43e0-bf4c-06f5698ffbd9},4,0x8000000000000000,Microsoft-Windows-Sysmon/Operational,WORKSTATION5,2021-06-11T09:07:15.636Z,2021-06-11T09:07:15.636Z,7,Image loaded:\r\nRuleName: -\r\nUtcTime: 2021-...,...,,,,,,,,,,


In [4]:
def timetounix(t):
    return (time.mktime(datetime.datetime.strptime(t, "%Y-%m-%dT%H:%M:%S.%fZ").timetuple()))

df['Time'] = df['TimeCreated'].apply(timetounix)

In [5]:
df['Time']

0       1.623417e+09
1       1.623417e+09
2       1.623417e+09
3       1.623417e+09
4       1.623417e+09
            ...     
2606    1.623417e+09
2607    1.623417e+09
2608    1.623417e+09
2609    1.623417e+09
2610    1.623417e+09
Name: Time, Length: 2611, dtype: float64

In [6]:
df = df[['EventID',"SubjectUserName","Time"]]

In [7]:
df.head()
file_name = 'apt.csv'
df.to_csv(file_name, sep='\t')

#### Notes:

- Work on discretizing, normalizing and think of how to create features from the columns and have questions about the data

#### Findings:

- Trying to find columns similar to LANL Denign AD Logs & PurpleSharp AD Log Playbook for Lateral Movement

'SourceName', <br>
'Hostname', <br>
'TimeCreated', <-- 'EventTime' (LANL) <br>
'DestinationIsIpv6', 'DestinationIp', 'DestinationHostname', 'DestinationPort', 'DestinationPortName'; <-- (Look into these) <br>
'SubjectDomainName', <br>
'EventID', <br>
'EventType', <br>
'LogonId', <br>
'ParentProcessId', <br>
'ParentProcessName', <br>
'ProcessID', <br>
'ProcessName', <br>
'NewProcessId', <br>
'NewProcessName', <br>
'ServiceName', <br>
'SubjectUserSid', <br>
'SubjectUserName', <br>
'SubjectDomainName', <br>
'SubjectLogonId', <br>
'Status' <br>

'AccessReason', 'Task', 'Message', 'Description', 'Details' <-- (Look into these)

In [8]:
df_cobalt = df[['EventID', 'EventType', 'LogonId', 'Hostname', 'SubjectUserName', 'SubjectDomainName', 'SubjectLogonId', 
                'Status', 'SourceName', 'ServiceName', 'DestinationHostname', 'ProcessName', 'ProcessID', 'ParentProcessName',
                'ParentProcessId']]

KeyError: "['LogonId', 'Status', 'ServiceName', 'ProcessID', 'SourceName', 'EventType', 'SubjectLogonId', 'Hostname', 'DestinationHostname', 'ParentProcessId', 'ProcessName', 'ParentProcessName', 'SubjectDomainName'] not in index"

In [None]:
df_cobalt.head()

In [None]:
df_cobalt.isnull().all(0)

In [None]:
for col in list(df_cobalt):
    print(col)
    print(df_cobalt[col].unique())

In [None]:
df_cobalt.groupby(by=['ServiceName']).get_group('tbbd05')

In [None]:
df2.groupby('Status').count().show()

In [None]:
df2.groupby('Task').count().show()

In [None]:
df2.groupby('SourceName').count().show()

In [None]:
df2.groupby('Hostname').count().show()

In [None]:
df2.groupby('EventID').count().show()

In [None]:
df2.groupby('LogonId').count().show()                

In [None]:
df2.groupby('Channel').count().show()

In [None]:
df2.groupby('Keywords').count().show()

In [None]:
df2.groupby('Message').count().show()

In [None]:
df2.groupby('SourceName','EventID').count().show()

In [None]:
df2.groupby('SourceName','LogonId').count().show()