# **Feature Selection**: RF and Chi Squared
------------------

In [1]:
# https://infosecjupyterthon.com/notebooks/2_Jupyterthon_Cyb3rPandaH_2020.html
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.types import *
import pyspark.sql.functions as func
from pyspark.sql.functions import length, concat ,concat_ws

import pandas as pd, numpy as np, networkx as nx
import matplotlib.pyplot as plt
import requests
from openhunt import ossem, descriptiveStatistics as ds, visualizations as vis
import warnings
warnings.filterwarnings('ignore')

# Import packages
import os
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
%matplotlib inline

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.caseSensitive", "true")
spark.sparkContext.setCheckpointDir("./tmp/checkpoints")
spark.sparkContext.version
spark.conf.set("spark.driver.memory", "3g")
spark.conf.set("spark.executor.memory", "3g")

In [4]:
df_index = spark.read.json('df_labelled_coalesce.json')

In [5]:
df_index = df_index.select("*").withColumn("id", func.monotonically_increasing_id()).drop("tags")

In [6]:
df_index = df_index.withColumn("TP_Binary", func.when((df_index["TP_Label"]=='0'), 0).otherwise(1))

In [6]:
df_index = df_index.checkpoint()

In [7]:
df_index = df_index.cache()

In [8]:
# Try to Pands thru Spark -> post checkpoint + cache -> : java.lang.OutOfMemoryError: Java heap space -> No change
# pd_index = df_index.toPandas()

In [7]:
df_index.createTempView('apt29Host')

In [8]:
target_host_day1 = spark.sql(
'''
SELECT *
FROM apt29Host
WHERE NOT TP_Label = '0'
''')

In [11]:
pd_target_distict_count = target_host_day1.agg(*[
    func.countDistinct(c).alias(c)    # vertical (column-wise) operations in SQL ignore NULLs
    for c in target_host_day1.columns
]).toPandas().transpose()

In [59]:
pd.options.display.max_rows = 100
pd_target_distict_count.sort_values(by = 0, ascending = False).head(30)

Unnamed: 0,0
id,3438
RecordNumber,759
Message,756
ActivityID,638
ContextInfo,623
@timestamp,475
Payload,416
EventReceivedTime,87
EventTime,78
UtcTime,59


In [57]:
# Select subset of columns with more than 1 AND less than 52 unique values
columnList = list(pd_target_distict_count.loc[(pd_target_distict_count[0] > 1) & (pd_target_distict_count[0] < 52)].sort_values(by = 0, ascending = False).iloc[:3].index.values)

In [58]:
columnList

['ProcessId', 'ProcessGuid', 'ThreadID']

In [23]:
#columnList = ["EventID", "ProcessGuid"]

In [14]:
# Class Balance
df_index.groupBy(df_index["TP_Binary"]).count().orderBy('count', ascending=False).toPandas()

Unnamed: 0,TP_Binary,count
0,0,195322
1,1,3438


In [15]:
# Categorical Freq Count by Columns with more than one distinct value in the target subset
#for i in columnList:
#    display(df_index.groupBy(df_index[i]).count().orderBy('count', ascending=False).toPandas())

Unnamed: 0,@timestamp,count
0,2020-05-02T03:00:36.941Z,121
1,2020-05-02T03:00:37.248Z,117
2,2020-05-02T03:20:38.217Z,99
3,2020-05-02T03:20:46.921Z,99
4,2020-05-02T03:20:53.888Z,98
...,...,...
50318,2020-05-02T03:08:18.613Z,1
50319,2020-05-02T03:01:35.720Z,1
50320,2020-05-02T02:56:17.115Z,1
50321,2020-05-02T03:03:07.031Z,1


Unnamed: 0,AccessList,count
0,,187719
1,%%4432\r\n\t\t\t\t,6773
2,%%1538\r\n\t\t\t\t%%4432\r\n\t\t\t\t%%4435\r\n...,3821
3,%%1538\r\n\t\t\t\t%%1541\r\n\t\t\t\t%%4416\r\n...,44
4,%%1537\r\n\t\t\t\t%%1538\r\n\t\t\t\t%%4432\r\n...,44
5,%%4416\r\n\t\t\t\t,42
6,%%1538\r\n\t\t\t\t%%1541\r\n\t\t\t\t%%4416\r\n...,36
7,%%1537\r\n\t\t\t\t,30
8,%%1541\r\n\t\t\t\t%%4416\r\n\t\t\t\t%%4423\r\n...,30
9,%%1537\r\n\t\t\t\t%%1538\r\n\t\t\t\t%%1539\r\n...,26


Unnamed: 0,AccessMask,count
0,,187664
1,0x1,6815
2,0x20019,3821
3,0x3001f,44
4,0x120089,44
5,0x12019f,36
6,0xf003f,36
7,0x2,36
8,0x100081,30
9,0x10000,30


Unnamed: 0,AccountName,count
0,SYSTEM,145230
1,,46308
2,pbeesly,7158
3,NETWORK SERVICE,41
4,LOCAL SERVICE,23


Unnamed: 0,ActivityID,count
0,,186806
1,{BCCD0404-4290-0001-60ED-CE673020D601},3177
2,{A725FCD6-CBDF-0000-6337-C31E1D20D601},202
3,{55C748F1-AE9F-0001-EE1C-BFD51F20D601},107
4,{7AEC1283-BA78-47DB-AB99-27A339F7971E},70
...,...,...
5218,{BCCD0404-4290-0000-CE1E-14D71F20D601},1
5219,{BCCD0404-4290-0001-E44A-13D71F20D601},1
5220,{A725FCD6-CBDF-0001-F27D-D81E1D20D601},1
5221,{BCCD0404-4290-0000-B1FC-10D71F20D601},1


Unnamed: 0,Application,count
0,,193299
1,\device\harddiskvolume2\windows\system32\svcho...,1518
2,\device\harddiskvolume2\packages\plugins\micro...,721
3,\device\harddiskvolume2\windows\temp\python.exe,696
4,\device\harddiskvolume4\windows\system32\lsass...,445
5,System,378
6,\device\harddiskvolume4\windows\system32\svcho...,375
7,\device\harddiskvolume2\windows\system32\lsass...,334
8,\device\harddiskvolume4\windows\system32\dns.exe,291
9,\device\harddiskvolume4\windows\adws\microsoft...,86


Unnamed: 0,Archived,count
0,,198338
1,true,418
2,false - shredded file with pattern 0x00,4


Unnamed: 0,Category,count
0,,144192
1,Registry,21638
2,Executing Pipeline,7795
3,Handle Manipulation,5471
4,Filtering Platform Connection,5461
5,Pipeline Execution Details,5113
6,Other Policy Change Events,2588
7,Token Right Adjusted Events,1802
8,Sensitive Privilege Use,989
9,Process Creation,461


Unnamed: 0,Channel,count
0,Microsoft-Windows-Sysmon/Operational,143891
1,Security,28629
2,security,12375
3,Microsoft-Windows-PowerShell/Operational,8364
4,Windows PowerShell,5285
5,System,91
6,Microsoft-Windows-WMI-Activity/Operational,90
7,Microsoft-Windows-TerminalServices-RemoteConne...,15
8,Microsoft-Windows-Windows Firewall With Advanc...,10
9,Microsoft-Windows-TerminalServices-LocalSessio...,9


Unnamed: 0,CommandLine,count
0,,197867
1,\??\C:\windows\system32\conhost.exe 0xffffffff...,126
2,C:\Windows\System32\RuntimeBroker.exe -Embedding,26
3,"""C:\windows\system32\backgroundTaskHost.exe"" -...",24
4,C:\windows\system32\wbem\wmiprvse.exe -Embedding,20
...,...,...
268,%SystemRoot%\system32\csrss.exe ObjectDirector...,1
269,\??\C:\windows\system32\autochk.exe *,1
270,wininit.exe,1
271,\SystemRoot\System32\smss.exe,1


Unnamed: 0,Company,count
0,,178055
1,Microsoft Corporation,20298
2,-,331
3,The ICU Project,20
4,Sysinternals - www.sysinternals.com,20
5,Microsoft CoreXT,9
6,Google LLC,8
7,Sysinternals,8
8,SQLite Development Team,4
9,Alexander Roshal,2


Unnamed: 0,ContextInfo,count
0,,190965
1,Severity = Informational\r\n Ho...,16
2,Severity = Informational\r\n Ho...,16
3,Severity = Informational\r\n Ho...,16
4,Severity = Informational\r\n Ho...,16
...,...,...
5120,Severity = Informational\r\n Ho...,1
5121,Severity = Informational\r\n Ho...,1
5122,Severity = Informational\r\n Ho...,1
5123,Severity = Informational\r\n Ho...,1


Unnamed: 0,CreationUtcTime,count
0,,196884
1,2016-04-18 20:53:44.000,41
2,2020-05-02 01:22:02.247,17
3,2020-05-02 01:21:27.463,17
4,2020-05-02 01:02:09.939,17
...,...,...
966,2020-05-02 03:19:16.837,1
967,2020-05-02 03:00:27.751,1
968,2020-05-02 03:08:30.895,1
969,2020-05-02 03:19:28.009,1


Unnamed: 0,CurrentDirectory,count
0,,198314
1,C:\windows\system32\,292
2,C:\windows,70
3,C:\Windows\SystemApps\Microsoft.Windows.Conten...,17
4,C:\ProgramData\victim\,10
5,C:\WindowsAzure\CollectGuestLogsTemp\,8
6,C:\Program Files\SysinternalsSuite\,8
7,C:\WindowsAzure\Packages\GuestAgent\,5
8,C:\Windows\SystemApps\Microsoft.Windows.Cortan...,4
9,C:\Packages\Plugins\Microsoft.Azure.NetworkWat...,4


Unnamed: 0,Description,count
0,,178094
1,Microsoft® C Runtime Library,844
2,Windows NT BASE API Client DLL,844
3,GDI Client DLL,753
4,Windows Cryptographic Primitives Library,621
...,...,...
1062,Network Diagnostic Framework HC Discovery API,1
1063,Kerberos Client Shared Functionality,1
1064,Home Networking Configuration API Client,1
1065,Window Management,1


Unnamed: 0,DestAddress,count
0,,195557
1,10.0.0.4,831
2,169.254.169.254,733
3,10.0.1.10,556
4,192.168.0.4,370
...,...,...
101,192.54.112.30,1
102,95.100.175.64,1
103,23.35.79.143,1
104,84.53.139.128,1


Unnamed: 0,DestPort,count
0,,195557
1,80.0,792
2,5985.0,568
3,53.0,437
4,8443.0,348
5,389.0,325
6,445.0,167
7,88.0,114
8,135.0,91
9,49667.0,73


Unnamed: 0,DestinationIp,count
0,,197531
1,192.168.0.4,356
2,10.0.0.4,323
3,fe80:0:0:0:e40c:95b6:b0a7:6429,116
4,0:0:0:0:0:0:0:1,111
...,...,...
88,192.175.48.1,1
89,192.54.112.30,1
90,95.100.175.64,1
91,84.53.139.128,1


Unnamed: 0,DestinationPort,count
0,,197531
1,8443,348
2,389,222
3,88,95
4,53,80
...,...,...
196,58458,1
197,64543,1
198,63518,1
199,61440,1


Unnamed: 0,Details,count
0,,181219
1,Binary Data,4500
2,DWORD (0x00000000),1799
3,DWORD (0x00000001),1136
4,DWORD (0x00000002),630
...,...,...
3125,2020/5/2 3:18:35'110,1
3126,C:\windows\system32\config\systemprofile\AppDa...,1
3127,S-1-5-19,1
3128,NarratorQuickStart.App,1


Unnamed: 0,Domain,count
0,NT AUTHORITY,145294
1,,46308
2,DMEVALS,7158


Unnamed: 0,EventID,count
0,12,61158
1,10,39286
2,7,20259
3,13,17542
4,4658,10973
...,...,...
160,51047,1
161,261,1
162,6038,1
163,5024,1


Unnamed: 0,EventReceivedTime,count
0,2020-05-01 23:19:52,2266
1,2020-05-01 23:20:17,2094
2,2020-05-01 23:20:18,2029
3,2020-05-01 23:20:16,1705
4,2020-05-01 23:01:44,1592
...,...,...
1791,2020-05-01 23:23:24,1
1792,2020-05-01 23:13:32,1
1793,2020-05-01 23:26:44,1
1794,2020-05-01 23:11:54,1


Unnamed: 0,EventTime,count
0,2020-05-01 23:19:30,8724
1,2020-05-01 23:19:31,5301
2,2020-05-01 23:08:26,3255
3,2020-05-01 23:08:27,3144
4,2020-05-01 23:19:20,3115
...,...,...
1955,2020-05-01 23:27:07,1
1956,2020-05-01 23:26:41,1
1957,2020-05-01 23:25:56,1
1958,2020-05-01 23:23:23,1


Unnamed: 0,EventType,count
0,INFO,157221
1,AUDIT_SUCCESS,40078
2,AUDIT_FAILURE,925
3,WARNING,261
4,VERBOSE,225
5,ERROR,50


Unnamed: 0,ExecutionProcessID,count
0,3368,50981
1,3484,46478
2,4,36831
3,3428,24789
4,3968,11240
...,...,...
68,1148,1
69,5624,1
70,8000,1
71,4184,1


Unnamed: 0,FileVersion,count
0,,178055
1,10.0.18362.1 (WinBuild.160101.0800),6513
2,10.0.18362.719 (WinBuild.160101.0800),2454
3,10.0.18362.387 (WinBuild.160101.0800),2147
4,10.0.18362.693 (WinBuild.160101.0800),2055
...,...,...
136,1.5.5.9,1
137,4.8.4110.0,1
138,9.00.30729.9619,1
139,2.8.6.69,1


Unnamed: 0,Hashes,count
0,,177633
1,"SHA1=C67C3C415EBDFF8C51C80E09838EAD29F79CD57D,...",413
2,"SHA1=D2AD4D4A061147CA2102FD1198A1AB5E81089687,...",401
3,"SHA1=3CB1C63FEA955C4A987F9B3140E2194536B3D533,...",401
4,"SHA1=6F760E5583E17EE8ACF5941811F98F357E329CD3,...",401
...,...,...
1737,"SHA1=A356444BEBF9DE3B3E2BF0A8D94FCE149A4F956A,...",1
1738,"SHA1=5170DDD2109F93EF587058F0CECB52B02159A68C,...",1
1739,"SHA1=8BFFA768F7C9A4344A33E5CEDD701099E29245D0,...",1
1740,"SHA1=2182F5048F1F3F56092295A9291CC052F830E42F,...",1


Unnamed: 0,Hostname,count
0,SCRANTON.dmevals.local,133796
1,NASHUA.dmevals.local,29058
2,NEWYORK.dmevals.local,23935
3,UTICA.dmevals.local,11971


Unnamed: 0,Image,count
0,,94250
1,C:\windows\system32\svchost.exe,25502
2,C:\windows\system32\WindowsPowerShell\v1.0\pow...,17059
3,C:\windows\system32\wbem\wmiprvse.exe,7332
4,C:\Windows\System32\svchost.exe,5506
...,...,...
184,C:\windows\Sysmon.exe,1
185,C:\Program Files\SysinternalsSuite\accesschk.exe,1
186,C:\Program Files\WindowsApps\Microsoft.Windows...,1
187,C:\windows\system32\dfssvc.exe,1


Unnamed: 0,ImageLoaded,count
0,,178501
1,C:\Windows\System32\ntdll.dll,430
2,C:\Windows\System32\KernelBase.dll,418
3,C:\Windows\System32\kernel32.dll,418
4,C:\Windows\System32\rpcrt4.dll,418
...,...,...
1241,C:\Windows\System32\cryptsvc.dll,1
1242,C:\Windows\System32\userinitext.dll,1
1243,C:\Windows\System32\winbio.dll,1
1244,C:\Windows\Temp\Rar.exe,1


Unnamed: 0,IntegrityLevel,count
0,,198314
1,System,296
2,Medium,78
3,High,39
4,AppContainer,33


Unnamed: 0,IpPort,count
0,,198241
1,-,129
2,59997,29
3,49669,17
4,0,15
...,...,...
204,57174,1
205,49763,1
206,57177,1
207,57342,1


Unnamed: 0,IsExecutable,count
0,,198338
1,False,286
2,True,136


Unnamed: 0,Keywords,count
0,-9223372036854775808,143932
1,-9214364837600034816,40078
2,0,8362
3,36028797018963968,5293
4,-9218868437227405312,925
5,4611686018427387904,93
6,1152921504606846976,24
7,-9187343239835811840,22
8,2305843009213693952,9
9,-9223369837831520256,8


Unnamed: 0,LogonGuid,count
0,,197972
1,{47ab858c-e6ad-5eac-e703-000000000000},171
2,{00000000-0000-0000-0000-000000000000},86
3,{8e9952b9-76c2-4180-9723-806b9f40f806},62
4,{2ae0a698-7c53-02af-08c2-81dfeb66f781},49
...,...,...
78,{6fdaf3e9-3bd5-fda8-bc54-4d4175959ba0},1
79,{8dc4ffd7-bafa-e1c0-c76c-9579058078aa},1
80,{fc1fd21a-c818-423c-69a9-b8a74ad3132b},1
81,{c2614bad-598d-ef57-01ff-f8373fe0331d},1


Unnamed: 0,LogonId,count
0,,198314
1,0x3e7,243
2,0x13069a,42
3,0x372e81,31
4,0x3e5,30
5,0x3731f3,29
6,0x3e4,27
7,0x30ba2c,17
8,0x89177d,9
9,0x1305dc,4


Unnamed: 0,MandatoryLabel,count
0,,198300
1,S-1-16-16384,307
2,S-1-16-8192,79
3,S-1-16-12288,39
4,S-1-16-4096,35


Unnamed: 0,NewProcessId,count
0,,198300
1,0x14f4,3
2,0x16a0,3
3,0x1284,2
4,0x16b0,2
...,...,...
416,0xf4c,1
417,0x14bc,1
418,0x13ac,1
419,0x1ba8,1


Unnamed: 0,NewProcessName,count
0,,198300
1,C:\Windows\System32\svchost.exe,100
2,C:\Windows\System32\conhost.exe,65
3,C:\Windows\System32\backgroundTaskHost.exe,18
4,C:\Windows\System32\wbem\WmiPrvSE.exe,17
...,...,...
110,C:\Windows\System32\VSSVC.exe,1
111,C:\Program Files\SysinternalsSuite\accessChk.exe,1
112,C:\Windows\SysWOW64\dllhost.exe,1
113,C:\Windows\Temp\Rar.exe,1


Unnamed: 0,ObjectType,count
0,,187521
1,Key,10761
2,File,197
3,Token,143
4,SAM_DOMAIN,38
5,Process,19
6,-,18
7,SERVICE OBJECT,14
8,SAM_SERVER,10
9,SC_MANAGER OBJECT,8


Unnamed: 0,Opcode,count
0,,143915
1,Info,46470
2,To be used when operation is just executing a ...,7842
3,On create calls,414
4,To be used when an exception is raised,47
5,Open (async),31
6,Start,15
7,Stop,11
8,to be used when an object is constructed,6
9,ServiceShutdown,4


Unnamed: 0,OpcodeValue,count
0,0.0,185095
1,20.0,7842
2,,5290
3,15.0,414
4,19.0,47
5,10.0,31
6,1.0,15
7,2.0,11
8,16.0,6
9,129.0,4


Unnamed: 0,OriginalFileName,count
0,,178055
1,gdi32,753
2,ntdll.dll,434
3,rpcrt4.dll,422
4,Kernelbase.dll,422
...,...,...
1130,SecurityHealthSSO.dll,1
1131,FECLIENT.DLL,1
1132,provdatastore.dll,1
1133,pcasvc.dll,1


Unnamed: 0,ParentCommandLine,count
0,,198314
1,C:\windows\system32\services.exe,127
2,C:\windows\system32\svchost.exe -k DcomLaunch -p,75
3,C:\windows\system32\svchost.exe -k netsvcs -p ...,31
4,C:\WindowsAzure\Packages\WaAppAgent.exe,21
...,...,...
78,?,1
79,"""sc.exe"" config WindowsAzureGuestAgent error= ...",1
80,C:\windows\system32\dispdiag.exe -out dispdiag...,1
81,"""powershell.exe"" -nop -w hidden -c &amp;([scri...",1


Unnamed: 0,ParentImage,count
0,,198314
1,C:\Windows\System32\services.exe,127
2,C:\Windows\System32\svchost.exe,121
3,C:\Windows\System32\cmd.exe,25
4,C:\Windows\System32\WindowsPowerShell\v1.0\pow...,24
5,C:\WindowsAzure\Packages\WaAppAgent.exe,21
6,C:\Windows\System32\smss.exe,10
7,C:\Windows\System32\SearchIndexer.exe,9
8,C:\Windows\System32\winlogon.exe,9
9,C:\WindowsAzure\Packages\GuestAgent\WindowsAzu...,9


Unnamed: 0,ParentProcessGuid,count
0,,198314
1,{47ab858c-e6ad-5eac-0b00-000000000500},102
2,{47ab858c-e6ae-5eac-1000-000000000500},24
3,{47ab858c-cada-5eac-1000-000000000400},21
4,{5aa8ec29-cad8-5eac-1000-000000000400},20
...,...,...
129,{6bbf237a-cb01-5eac-4c00-000000000400},1
130,{47ab858c-e379-5eac-dc03-000000000400},1
131,{47ab858c-e3d3-5eac-e603-000000000400},1
132,{47ab858c-e6af-5eac-1600-000000000500},1


Unnamed: 0,ParentProcessId,count
0,,198314
1,736,102
2,884,41
3,944,31
4,720,20
...,...,...
122,3324,1
123,760,1
124,5220,1
125,5636,1


Unnamed: 0,ParentProcessName,count
0,,198303
1,C:\Windows\System32\services.exe,132
2,C:\Windows\System32\svchost.exe,124
3,C:\Windows\System32\WindowsPowerShell\v1.0\pow...,25
4,C:\Windows\System32\cmd.exe,25
5,C:\WindowsAzure\Packages\WaAppAgent.exe,21
6,C:\Windows\System32\smss.exe,10
7,C:\Windows\System32\SearchIndexer.exe,9
8,C:\Windows\System32\winlogon.exe,9
9,C:\WindowsAzure\Packages\GuestAgent\WindowsAzu...,9


Unnamed: 0,Path,count
0,,198729
1,C:\Program Files\SysinternalsSuite\psversion.ps1,6
2,C:\Program Files\SysinternalsSuite\readme.ps1,4
3,C:\Program Files\WindowsPowerShell\Modules\xNe...,4
4,C:\Program Files\WindowsPowerShell\Modules\xAc...,2
5,C:\Program Files\WindowsPowerShell\Modules\xSt...,2
6,C:\Program Files\WindowsPowerShell\Modules\xSt...,2
7,C:\Program Files\WindowsPowerShell\Modules\xNe...,2
8,C:\Program Files\WindowsPowerShell\Modules\xSt...,2
9,C:\Program Files\WindowsPowerShell\Modules\xAc...,2


Unnamed: 0,Payload,count
0,,190965
1,"CommandInvocation(Start-Sleep): ""Start-Sleep""\...",978
2,"CommandInvocation(Test-Path): ""Test-Path""\r\nP...",154
3,"CommandInvocation(Test-Path): ""Test-Path""\r\nP...",153
4,"CommandInvocation(Set-StrictMode): ""Set-Strict...",121
...,...,...
2304,"CommandInvocation(ProgressBarHelper): ""Progres...",1
2305,"CommandInvocation(Join-Path): ""Join-Path""\r\nP...",1
2306,"CommandInvocation(Move-Item): ""Move-Item""\r\nP...",1
2307,"CommandInvocation(Write-Verbose): ""Write-Verbo...",1


Unnamed: 0,ProcessGuid,count
0,,94250
1,{47ab858c-e374-5eac-d803-000000000400},15802
2,{47ab858c-e6c0-5eac-7600-000000000500},13694
3,{32aa854b-e288-5eac-5203-000000000300},3773
4,{32aa854b-e60c-5eac-6103-000000000300},3369
...,...,...
646,{47ab858c-caf2-5eac-9600-000000000400},1
647,{47ab858c-cadf-5eac-3b00-000000000400},1
648,{47ab858c-dabb-5eac-df02-000000000400},1
649,{5aa8ec29-cae0-5eac-4f00-000000000400},1


Unnamed: 0,ProcessId,count
0,3852,15850
1,,15230
2,5044,14215
3,900,8689
4,884,8411
...,...,...
1103,5544,1
1104,3380,1
1105,0x74c,1
1106,0x1128,1


Unnamed: 0,ProcessName,count
0,,173222
1,C:\Windows\System32\svchost.exe,8002
2,C:\Windows\System32\WindowsPowerShell\v1.0\pow...,7134
3,C:\Windows\System32\wbem\WmiPrvSE.exe,5447
4,C:\Windows\System32\services.exe,770
...,...,...
111,\Device\HarddiskVolume2\Windows\System32\svcho...,1
112,C:\Program Files\WindowsApps\Microsoft.Windows...,1
113,C:\WindowsAzure\SecAgent\WaSecAgentProv.exe,1
114,C:\Windows\System32\hostui.exe,1


Unnamed: 0,Product,count
0,,178055
1,Microsoft® Windows® Operating System,19283
2,-,340
3,Microsoft® .NET Framework,330
4,Internet Explorer,188
5,Windows® Search,177
6,Microsoft (R) Windows (R) Operating System,151
7,Microsoft® Visual Studio® 2017,47
8,International Components for Unicode,20
9,Microsoft ® Windows Script Host,18


Unnamed: 0,Protocol,count
0,,192066
1,6,4230
2,17,1216
3,tcp,921
4,udp,308
5,58,10
6,2,5
7,256,4


Unnamed: 0,ProviderGuid,count
0,{5770385F-C22A-43E0-BF4C-06F5698FFBD9},143891
1,{54849625-5478-4994-A5BA-3E3B0328C30D},41003
2,{A0C1853B-5C40-4B15-8766-3CF1C58F985A},8364
3,,5290
4,{1418EF04-B0B4-4623-BF7E-D74AB47BBDAA},90
5,{555908D1-A6D7-4695-8E1E-26931D2012F4},18
6,{C76BAA63-AE81-421C-B425-340B4B24157F},15
7,{F3C5E28E-63F6-49C7-A204-E48A1BC4B09D},12
8,{D1BC9AFF-2ABF-4D71-9146-ECB2A986EB85},10
9,{152FBE4B-C7AD-4F68-BADA-A4FCC1464F6C},9


Unnamed: 0,RecordNumber,count
0,8859,17
1,8853,17
2,10239,17
3,9555,17
4,9321,17
...,...,...
179565,398601,1
179566,70765,1
179567,141829,1
179568,412492,1


Unnamed: 0,RelativeTargetName,count
0,,198639
1,PSEXESVC.exe,12
2,samr,11
3,dmevals.local\Policies\{31B2F340-016D-11D2-945...,10
4,lsarpc,10
5,svcctl,8
6,dmevals.local\Policies\{6AC1786C-016F-11D2-945...,7
7,\,7
8,PSEXESVC,4
9,dmevals.local\Policies,3


Unnamed: 0,ScriptBlockId,count
0,,198503
1,6547700f-6538-4b23-8e76-0b42fbd3cfef,3
2,9642527f-caf4-4da9-b0a9-2b1990f5adac,1
3,d7770e1c-9706-45f7-b821-a58d7d7b71d6,1
4,52213e0b-f743-4395-a6b5-4e85061ce056,1
...,...,...
251,de86d413-a00c-4619-ae4e-9d11c4274ba0,1
252,0943f7fe-e808-492e-b955-ca275c8ca2b5,1
253,e3f1b192-9d7b-4a9a-8409-20beb000fca7,1
254,e854ca8e-4867-45d4-b4b5-8220a3c45bc5,1


Unnamed: 0,ScriptBlockText,count
0,,198503
1,prompt,52
2,function __cmdletization_BindCommonParameters\...,8
3,exit,5
4,".\PsExec64.exe -accepteula \\NASHUA -u ""dmeval...",4
...,...,...
138,{ $_.MainWindowHandle -eq $TopWindow },1
139,"Remove-Job -Name ""Screenshot"" -Force",1
140,{ $_.GlobalAssemblyCache -And $_.Location.Spli...,1
141,"{ $_.UserName -like ""*\$env:USERNAME"" }",1


Unnamed: 0,Severity,count
0,INFO,197299
1,ERROR,975
2,WARNING,261
3,DEBUG,225


Unnamed: 0,SeverityValue,count
0,2,197299
1,4,975
2,3,261
3,1,225


Unnamed: 0,ShareName,count
0,,198593
1,\\*\SYSVOL,69
2,\\*\IPC$,66
3,\\*\ADMIN$,30
4,\\*\C$,1
5,\\*\D$,1


Unnamed: 0,Signature,count
0,,178501
1,Microsoft Windows,19568
2,-,365
3,Microsoft Corporation,193
4,Microsoft Windows Publisher,117
5,Windows Phone,6
6,Google LLC,4
7,Microsoft Windows 3rd party Component,4
8,win.rar GmbH,1
9,Microsoft Windows Third Party Application Comp...,1


Unnamed: 0,SignatureStatus,count
0,,178501
1,Valid,19894
2,Unavailable,365


Unnamed: 0,Signed,count
0,,178501
1,True,19894
2,False,365


Unnamed: 0,SourceName,count
0,Microsoft-Windows-Sysmon,143891
1,Microsoft-Windows-Security-Auditing,41003
2,Microsoft-Windows-PowerShell,8364
3,PowerShell,5285
4,Microsoft-Windows-WMI-Activity,90
5,Service Control Manager,18
6,Microsoft-Windows-TerminalServices-RemoteConne...,15
7,Microsoft-Windows-FilterManager,12
8,Microsoft-Windows-Windows Firewall With Advanc...,10
9,Microsoft-Windows-Hyper-V-Netvsc,9


Unnamed: 0,SourcePort,count
0,,192070
1,49706,403
2,49702,312
3,53,152
4,49669,102
...,...,...
1417,51296,1
1418,61183,1
1419,50493,1
1420,63518,1


Unnamed: 0,SubjectDomainName,count
0,,166440
1,DMEVALS,26202
2,NT AUTHORITY,5619
3,-,476
4,Window Manager,19
5,Font Driver Host,4


Unnamed: 0,SubjectLogonId,count
0,,166440
1,0x3e7,14050
2,0x372e81,7930
3,0x3e5,5558
4,0x3731f3,1130
...,...,...
182,0x62e176,1
183,0x121a19,1
184,0x3c45c,1
185,0x612c31,1


Unnamed: 0,SubjectUserName,count
0,,166440
1,pbeesly,11532
2,NEWYORK$,7766
3,SCRANTON$,5804
4,LOCAL SERVICE,5558
5,NASHUA$,832
6,-,476
7,UTICA$,253
8,SYSTEM,59
9,WEC$,15


Unnamed: 0,SubjectUserSid,count
0,,166440
1,S-1-5-18,14207
2,S-1-5-21-1830255721-3727074217-2423397540-1107,11532
3,S-1-5-19,5558
4,S-1-5-20,446
5,S-1-0-0,439
6,S-1-5-21-1830255721-3727074217-2423397540-1113,49
7,S-1-5-21-1830255721-3727074217-2423397540-1112,35
8,S-1-5-21-1830255721-3727074217-2423397540-1114,15
9,S-1-5-21-1830255721-3727074217-2423397540-1111,15


Unnamed: 0,TP_Label,count
0,0,195322
1,"0, 4.A.3. Deobfuscate/Decode Files or Informat...",3291
2,"0, 8.C.2. Windows Admin Shares",16
3,"0, 8.C.3. Service Execution",8
4,"0, 8.A.2. Remote System Discovery",8
5,"0, 4.B.2. File Deletion",7
6,"0, 5.A.1. New Service",7
7,"0, 4.B.3. File Deletion",7
8,"0, 4.B.4. File Deletion",7
9,"0, 1.B.2. PowerShell",6


Unnamed: 0,TargetDomainName,count
0,,195558
1,DMEVALS,2178
2,DMEVALS.LOCAL,472
3,-,312
4,NT AUTHORITY,155
5,Builtin,24
6,Font Driver Host,13
7,Window Manager,12
8,SCRANTON,11
9,dmevals,8


Unnamed: 0,TargetFilename,count
0,,196462
1,C:\Windows\ServiceState\EventLog\Data\lastaliv...,68
2,C:\Windows\ServiceState\EventLog\Data\lastaliv...,65
3,C:\Users\pbeesly\AppData\Local\Packages\Micros...,22
4,C:\Windows\System32\Configuration\DSCEngineCac...,11
...,...,...
1451,C:\Users\pbeesly\Downloads\__MACOSX\Sysinterna...,1
1452,C:\Users\pbeesly\Downloads\SysinternalsSuite\A...,1
1453,C:\Windows\Prefetch\LOGONUI.EXE-09140401.pf,1
1454,C:\Users\pbeesly\Downloads\SysinternalsSuite\d...,1


Unnamed: 0,TargetLogonId,count
0,,195664
1,0x3e7,1902
2,0x0,311
3,0x13069a,35
4,0x3e5,33
...,...,...
256,0x30c6a1,1
257,0x30bee8,1
258,0x30c776,1
259,0x30b3ee,1


Unnamed: 0,TargetObject,count
0,,120062
1,HKU\S-1-5-21-1830255721-3727074217-2423397540-...,4250
2,HKLM\SOFTWARE\Microsoft\SystemCertificates\Dis...,1928
3,HKLM\SOFTWARE\Microsoft\EnterpriseCertificates...,1928
4,HKLM\SOFTWARE,1746
...,...,...
20981,HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion...,1
20982,HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion...,1
20983,HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion...,1
20984,HKLM\System\CurrentControlSet\Control\Storage\...,1


Unnamed: 0,TargetUserName,count
0,,195557
1,SCRANTON$,1143
2,NEWYORK$,838
3,-,312
4,NASHUA$,262
5,pbeesly,217
6,UTICA$,144
7,SYSTEM,120
8,LOCAL SERVICE,33
9,WEC$,26


Unnamed: 0,TargetUserSid,count
0,,195664
1,S-1-0-0,1834
2,S-1-5-18,793
3,S-1-5-21-1830255721-3727074217-2423397540-1107,197
4,S-1-5-21-1830255721-3727074217-2423397540-1113,101
5,S-1-5-21-1830255721-3727074217-2423397540-1112,74
6,S-1-5-19,31
7,S-1-5-21-1830255721-3727074217-2423397540-1111,26
8,S-1-5-21-1830255721-3727074217-2423397540-1114,17
9,S-1-5-90-0-2,5


Unnamed: 0,Task,count
0,12,61157
1,10,39283
2,12801,21638
3,7,20259
4,13,17541
...,...,...
65,1001,1
66,13568,1
67,14336,1
68,1101,1


Unnamed: 0,TerminalSessionId,count
0,,198314
1,0.0,290
2,2.0,150
3,1.0,6


Unnamed: 0,ThreadID,count
0,4612,45958
1,4588,45787
2,4224,24010
3,4236,10573
4,4396,10377
...,...,...
253,6868,1
254,1268,1
255,3788,1
256,9152,1


Unnamed: 0,TokenElevationType,count
0,,198300
1,%%1936,315
2,%%1938,108
3,%%1937,37


Unnamed: 0,User,count
0,,196663
1,NT AUTHORITY\SYSTEM,1174
2,DMEVALS\pbeesly,790
3,NT AUTHORITY\NETWORK SERVICE,75
4,NT AUTHORITY\LOCAL SERVICE,53
5,Window Manager\DWM-2,1
6,Window Manager\DWM-1,1
7,Font Driver Host\UMFD-1,1
8,Font Driver Host\UMFD-0,1
9,Font Driver Host\UMFD-2,1


Unnamed: 0,UserID,count
0,S-1-5-18,145230
1,,46308
2,S-1-5-21-1830255721-3727074217-2423397540-1107,7158
3,S-1-5-20,41
4,S-1-5-19,23


Unnamed: 0,UtcTime,count
0,,54870
1,2020-05-02 03:19:31.447,397
2,2020-05-02 03:19:30.337,396
3,2020-05-02 03:19:30.165,384
4,2020-05-02 03:19:30.259,383
...,...,...
16376,2020-05-02 03:15:32.261,1
16377,2020-05-02 03:23:09.379,1
16378,2020-05-02 03:11:00.001,1
16379,2020-05-02 03:02:46.460,1


Unnamed: 0,Version,count
0,2.0,81873
1,3.0,59944
2,0.0,26316
3,1.0,22949
4,,5290
5,5.0,2179
6,4.0,209


Unnamed: 0,id,count
0,26,1
1,29,1
2,474,1
3,964,1
4,1677,1
...,...,...
198755,42949703422,1
198756,42949703509,1
198757,42949704151,1
198758,42949704368,1


In [60]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# one hot encoding and assembling
encoding_var = columnList
#num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'TP_Binary', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = ['OHE_' + c for c in encoding_var], outputCol = "features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

In [61]:
pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, rf])

In [62]:
mod = pipe.fit(df_index)

In [63]:
df2 = mod.transform(df_index)

In [64]:
# Failed -> : org.apache.spark.SparkException: Job aborted due to stage failure: Task serialization failed: java.lang.OutOfMemoryError: Java heap space
# Worked on columnList = ["EventID", "ProcessGuid"] feature list
# Worked on columnList = ['ProcessId', 'ProcessGuid', 'ThreadID']
df2 = df2.checkpoint()

In [65]:
# Worked
df2 = df2.cache()

In [67]:
df2.printSchema()

root
 |-- @timestamp: string (nullable = true)
 |-- @version: string (nullable = true)
 |-- AccessList: string (nullable = true)
 |-- AccessMask: string (nullable = true)
 |-- AccessReason: string (nullable = true)
 |-- AccountName: string (nullable = true)
 |-- AccountType: string (nullable = true)
 |-- Action: string (nullable = true)
 |-- Active: string (nullable = true)
 |-- ActiveProfile: string (nullable = true)
 |-- ActivityID: string (nullable = true)
 |-- AdapterName: string (nullable = true)
 |-- AdapterSuffixName: string (nullable = true)
 |-- AdditionalInfo: string (nullable = true)
 |-- AdditionalInfo2: string (nullable = true)
 |-- AdvancedOptions: string (nullable = true)
 |-- AlgorithmName: string (nullable = true)
 |-- Application: string (nullable = true)
 |-- Archived: string (nullable = true)
 |-- AuthenticationPackageName: string (nullable = true)
 |-- BitlockerUserInputTime: string (nullable = true)
 |-- BootMenuPolicy: string (nullable = true)
 |-- BootMode: stri

In [68]:
mod.stages[-1].featureImportances

SparseVector(2015, {1: 0.2435, 17: 0.1124, 24: 0.0235, 698: 0.208, 1113: 0.1131, 1777: 0.06, 1921: 0.2395})

In [69]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [70]:
ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features").head(10)

Unnamed: 0,idx,name,score
1,1,OHE_ProcessId_5044,0.243494
1921,1921,OHE_ThreadID_8416,0.23953
698,698,OHE_ProcessId_0x87c,0.207961
1113,1113,OHE_ProcessGuid_{47ab858c-dac2-5eac-e802-00000...,0.113114
17,17,OHE_ProcessId_1960,0.112398
1777,1777,OHE_ThreadID_1740,0.060042
24,24,OHE_ProcessId_5360,0.02346
1342,1342,OHE_ProcessGuid_{5aa8ec29-cae5-5eac-5d00-00000...,0.0
1343,1343,OHE_ProcessGuid_{47ab858c-e378-5eac-db03-00000...,0.0
1344,1344,OHE_ProcessGuid_{47ab858c-e6b2-5eac-5600-00000...,0.0


In [41]:
# Retry with more columns

In [71]:
# Select subset of columns with more than 1 AND less than 52 unique values -> top 5
columnList = list(pd_target_distict_count.loc[(pd_target_distict_count[0] > 1) & (pd_target_distict_count[0] < 52)].sort_values(by = 0, ascending = False).iloc[:5].index.values)

In [72]:
# one hot encoding and assembling
encoding_var = columnList
#num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'TP_Binary', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = ['OHE_' + c for c in encoding_var], outputCol = "features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

In [73]:
pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, rf])

In [74]:
mod = pipe.fit(df_index)

In [75]:
df2 = mod.transform(df_index)

In [76]:
mod.stages[-1].featureImportances

SparseVector(4176, {1: 0.0981, 5: 0.0106, 11: 0.0119, 20: 0.0107, 45: 0.0047, 72: 0.0044, 84: 0.0024, 208: 0.034, 1112: 0.0156, 1758: 0.0777, 1761: 0.0282, 1763: 0.0289, 1835: 0.0061, 1921: 0.0291, 2075: 0.0065, 2291: 0.0199, 2300: 0.1385, 2337: 0.0505, 2380: 0.0901, 2414: 0.0505, 2422: 0.1088, 2428: 0.1429, 3455: 0.03})

In [77]:
ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features").head(10)

Unnamed: 0,idx,name,score
2428,2428,OHE_NewProcessId_0x2290,0.142857
2300,2300,OHE_NewProcessId_0xef8,0.138478
2422,2422,OHE_NewProcessId_0x102c,0.10882
1,1,OHE_ProcessId_5044,0.09814
2380,2380,OHE_NewProcessId_0x195c,0.090089
1758,1758,OHE_ThreadID_4588,0.077664
2414,2414,OHE_NewProcessId_0xf0c,0.050453
2337,2337,OHE_NewProcessId_0xbc,0.050451
208,208,OHE_ProcessId_8524,0.034037
3455,3455,OHE_Hashes_SHA1=61A5FFA57DFE0D9B3E4938439FF345...,0.03003


In [78]:
# Select subset of columns with more than 1 AND less than 52 unique values -> top 10
columnList = list(pd_target_distict_count.loc[(pd_target_distict_count[0] > 1) & (pd_target_distict_count[0] < 52)].sort_values(by = 0, ascending = False).iloc[:10].index.values)

# one hot encoding and assembling
encoding_var = columnList
#num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'TP_Binary', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = ['OHE_' + c for c in encoding_var], outputCol = "features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, rf])
mod = pipe.fit(df_index)
df2 = mod.transform(df_index)
ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features").head(20)

Unnamed: 0,idx,name,score
4628,4628,OHE_Task_106,0.123079
1,1,OHE_ProcessId_5044,0.120104
4623,4623,OHE_Task_12,0.094825
2558,2558,OHE_Hashes_SHA1=36C5D12033B2EAF251BAE61C00690F...,0.086222
2711,2711,OHE_Hashes_SHA1=585EB59D12A111E9291518C5CF5D3F...,0.075562
4176,4176,OHE_Image_C:\windows\system32\svchost.exe,0.067394
4381,4381,OHE_ParentProcessGuid_{47ab858c-e43f-5eac-eb03...,0.06347
4626,4626,OHE_Task_7,0.059324
2897,2897,OHE_Hashes_SHA1=388B289E2FD96234E2C1E8AE777248...,0.042146
4508,4508,OHE_ParentProcessId_3480,0.038778


In [80]:
# Select subset of columns with more than 1 AND less than 52 unique values -> top 20
columnList = list(pd_target_distict_count.loc[(pd_target_distict_count[0] > 1) & (pd_target_distict_count[0] < 52)].sort_values(by = 0, ascending = False).iloc[:20].index.values)

# one hot encoding and assembling
encoding_var = columnList
#num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'TP_Binary', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = ['OHE_' + c for c in encoding_var], outputCol = "features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, rf])
mod = pipe.fit(df_index)
df2 = mod.transform(df_index)
ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features").head(20)

Unnamed: 0,idx,name,score
5067,5067,"OHE_CommandLine_""PowerShell.exe"" -noni -noexit...",0.137528
7621,7621,OHE_TargetObject_HKU\S-1-5-21-1830255721-37270...,0.12086
4865,4865,"OHE_CommandLine_""C:\Windows\Temp\python.exe""",0.1034
4875,4875,OHE_CommandLine_powershell,0.085716
4897,4897,"OHE_CommandLine_""C:\windows\system32\sdclt.exe""",0.081018
4432,4432,OHE_ParentProcessGuid_{47ab858c-e1e3-5eac-b603...,0.065404
1921,1921,OHE_ThreadID_8416,0.061839
5013,5013,"OHE_CommandLine_""C:\ProgramData\victim\â€®cod....",0.057142
5307,5307,OHE_ScriptBlockId_63fc6cf4-cd9f-4134-9231-51cc...,0.056115
2558,2558,OHE_Hashes_SHA1=36C5D12033B2EAF251BAE61C00690F...,0.055759


In [82]:
#pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100
pd.get_option("display.max_colwidth")

100

In [83]:
ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features").head(20)

Unnamed: 0,idx,name,score
5067,5067,"OHE_CommandLine_""PowerShell.exe"" -noni -noexit -ep bypass -window hidden -c ""sal a New-Object;Ad...",0.137528
7621,7621,OHE_TargetObject_HKU\S-1-5-21-1830255721-3727074217-2423397540-1107\Software\Sysinternals\SDelet...,0.12086
4865,4865,"OHE_CommandLine_""C:\Windows\Temp\python.exe""",0.1034
4875,4875,OHE_CommandLine_powershell,0.085716
4897,4897,"OHE_CommandLine_""C:\windows\system32\sdclt.exe""",0.081018
4432,4432,OHE_ParentProcessGuid_{47ab858c-e1e3-5eac-b603-000000000400},0.065404
1921,1921,OHE_ThreadID_8416,0.061839
5013,5013,"OHE_CommandLine_""C:\ProgramData\victim\â€®cod.3aka3.scr"" /S",0.057142
5307,5307,OHE_ScriptBlockId_63fc6cf4-cd9f-4134-9231-51ccb5c7d247,0.056115
2558,2558,"OHE_Hashes_SHA1=36C5D12033B2EAF251BAE61C00690FFB17FDDC87,MD5=CDA48FC75952AD12D99E526D0B6BF70A,SH...",0.055759


In [84]:
columnList

['ProcessId',
 'ProcessGuid',
 'ThreadID',
 'NewProcessId',
 'Hashes',
 'Image',
 'ParentProcessGuid',
 'ParentProcessId',
 'Task',
 'EventID',
 'CommandLine',
 'ScriptBlockId',
 'RelativeTargetName',
 'TargetFilename',
 'ScriptBlockText',
 'TargetObject',
 'ExecutionProcessID',
 'ParentCommandLine',
 'SubjectLogonId',
 'Description']

In [101]:
# Select subset of columns with more than 1 AND less than 52 unique values -> top 20
columnList = list(pd_target_distict_count.loc[(pd_target_distict_count[0] > 1) & (pd_target_distict_count[0] < 52)].sort_values(by = 0, ascending = False).iloc[:20].index.values)
#columnList = ["EventID", "ProcessGuid"]

# one hot encoding and assembling
encoding_var = columnList
#num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'TP_Binary', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = ['OHE_' + c for c in encoding_var], outputCol = "features")

In [102]:
pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes])
mod = pipe.fit(df_index)
df2 = mod.transform(df_index)

In [103]:
from pyspark.ml.feature import ChiSqSelector
selector = ChiSqSelector(numTopFeatures=10, featuresCol='features', outputCol="selectedFeatures", labelCol='label', selectorType='numTopFeatures', percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05)

In [104]:
mod = selector.fit(df2)

In [105]:
result = mod.transform(df2)

In [108]:
mod.selectedFeatures

[0, 1, 2, 3, 4, 5, 1107, 1108, 1757, 1758]

In [112]:
result.head()

Row(@timestamp='2020-05-02T03:20:47.897Z', @version='1', AccessList=None, AccessMask=None, AccessReason=None, AccountName=None, AccountType=None, Action=None, Active=None, ActiveProfile=None, ActivityID=None, AdapterName=None, AdapterSuffixName=None, AdditionalInfo=None, AdditionalInfo2=None, AdvancedOptions=None, AlgorithmName=None, Application=None, Archived=None, AuthenticationPackageName=None, BitlockerUserInputTime=None, BootMenuPolicy=None, BootMode=None, BootStatusPolicy=None, BootType=None, BuildVersion=None, CallTrace=None, CallerProcessId=None, CallerProcessName=None, CalloutId=None, CalloutKey=None, CalloutName=None, CalloutType=None, Category='Kerberos Service Ticket Operations', ChangeType=None, Channel='Security', ClassId=None, ClassName=None, ClientCreationTime=None, ClientProcessId=None, CommandLine=None, Company=None, CompatibleIds=None, Conditions=None, Config=None, ConfigAccessPolicy=None, ContextInfo=None, CorruptionActionState=None, CountNew=None, CountOfCredential

In [110]:
result.head().selectedFeatures

SparseVector(10, {})

In [111]:
result.printSchema()

root
 |-- @timestamp: string (nullable = true)
 |-- @version: string (nullable = true)
 |-- AccessList: string (nullable = true)
 |-- AccessMask: string (nullable = true)
 |-- AccessReason: string (nullable = true)
 |-- AccountName: string (nullable = true)
 |-- AccountType: string (nullable = true)
 |-- Action: string (nullable = true)
 |-- Active: string (nullable = true)
 |-- ActiveProfile: string (nullable = true)
 |-- ActivityID: string (nullable = true)
 |-- AdapterName: string (nullable = true)
 |-- AdapterSuffixName: string (nullable = true)
 |-- AdditionalInfo: string (nullable = true)
 |-- AdditionalInfo2: string (nullable = true)
 |-- AdvancedOptions: string (nullable = true)
 |-- AlgorithmName: string (nullable = true)
 |-- Application: string (nullable = true)
 |-- Archived: string (nullable = true)
 |-- AuthenticationPackageName: string (nullable = true)
 |-- BitlockerUserInputTime: string (nullable = true)
 |-- BootMenuPolicy: string (nullable = true)
 |-- BootMode: stri

In [113]:
r = ChiSquareTest.test(df2, "features", "label")

In [114]:
r

DataFrame[pValues: vector, degreesOfFreedom: array<int>, statistics: vector]

In [116]:
ChiSquareResults = []
for idx, result in enumerate(r):
    row = {
        'feature_index': idx,
        'p_value': result.pValue,
        'statistic': result.statistic,
        'degrees_of_freedom': result.degreesOfFreedom
    }
    ChiSquareResults.append(row)

In [117]:
ChiSquareResults

[{'feature_index': 0,
  'p_value': Column<b'pValues[pValue]'>,
  'statistic': Column<b'pValues[statistic]'>,
  'degrees_of_freedom': Column<b'pValues[degreesOfFreedom]'>},
 {'feature_index': 1,
  'p_value': Column<b'degreesOfFreedom[pValue]'>,
  'statistic': Column<b'degreesOfFreedom[statistic]'>,
  'degrees_of_freedom': Column<b'degreesOfFreedom[degreesOfFreedom]'>},
 {'feature_index': 2,
  'p_value': Column<b'statistics[pValue]'>,
  'statistic': Column<b'statistics[statistic]'>,
  'degrees_of_freedom': Column<b'statistics[degreesOfFreedom]'>}]

In [121]:
# Select subset of columns with more than 1 AND less than 52 unique values -> top 20
#columnList = list(pd_target_distict_count.loc[(pd_target_distict_count[0] > 1) & (pd_target_distict_count[0] < 52)].sort_values(by = 0, ascending = False).iloc[:20].index.values)
columnList = ["EventID", "ProcessGuid"]

# one hot encoding and assembling
encoding_var = columnList
#num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'TP_Binary', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = ['OHE_' + c for c in encoding_var], outputCol = "features")
selector = ChiSqSelector(numTopFeatures=50, featuresCol='features', outputCol="selectedFeatures", labelCol='label', selectorType='numTopFeatures', percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05)

In [122]:
pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, selector])

In [123]:
mod = pipe.fit(df_index)

In [124]:
df2 = mod.transform(df_index)

In [164]:
def ExtractFeatureSelection(featuresSelected, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
      list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(featuresSelected).rename(columns={0:"idx"})
    pd_list_extract = pd.DataFrame(list_extract)
    varlist['name'] = varlist['idx'].apply(lambda x: pd_list_extract.iloc[x, 1])
    return(varlist)

In [165]:
ExtractFeatureSelection(mod.stages[-1].selectedFeatures, df2, "features")

Unnamed: 0,idx,name
0,0,OHE_EventID_12
1,1,OHE_EventID_10
2,2,OHE_EventID_7
3,3,OHE_EventID_13
4,4,OHE_EventID_4658
5,5,OHE_EventID_4103
6,6,OHE_EventID_4656
7,7,OHE_EventID_4690
8,8,OHE_EventID_4663
9,9,OHE_EventID_800


In [166]:
# Try again with more columns

In [167]:
# Select subset of columns with more than 1 AND less than 52 unique values -> top 20
columnList = list(pd_target_distict_count.loc[(pd_target_distict_count[0] > 1) & (pd_target_distict_count[0] < 52)].sort_values(by = 0, ascending = False).iloc[:20].index.values)
#columnList = ["EventID", "ProcessGuid"]

# one hot encoding and assembling
encoding_var = columnList
#num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'TP_Binary', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = ['OHE_' + c for c in encoding_var], outputCol = "features")
selector = ChiSqSelector(numTopFeatures=50, featuresCol='features', outputCol="selectedFeatures", labelCol='label', selectorType='numTopFeatures', percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05)

In [168]:
pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, selector])

In [169]:
mod = pipe.fit(df_index)

In [170]:
df2 = mod.transform(df_index)

In [171]:
ExtractFeatureSelection(mod.stages[-1].selectedFeatures, df2, "features")

Unnamed: 0,idx,name
0,0,OHE_ProcessId_3852
1,1,OHE_ProcessId_5044
2,2,OHE_ProcessId_900
3,3,OHE_ProcessId_884
4,4,OHE_ProcessId_0xf0c
5,5,OHE_ProcessId_944
6,1107,OHE_ProcessGuid_{47ab858c-e374-5eac-d803-000000000400}
7,1108,OHE_ProcessGuid_{47ab858c-e6c0-5eac-7600-000000000500}
8,1757,OHE_ThreadID_4612
9,1758,OHE_ThreadID_4588


In [1]:
df_index

NameError: name 'df_index' is not defined