**Feature Selection**: Random Forest (RF), Chi-Squared, and Logistic Regression Revisited -> with pandas
------------------

In [1]:
import csv

import numpy as np
import pandas as pd
import pyspark
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
# Raise pandas' display limits so wide frames and long cell values are shown in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)

In [3]:
# Load the labelled dataset; lines=True reads the file as one JSON record per line.
df = pd.read_json(
    './df_labelled_coalesce.json',
    lines=True,
)

In [12]:
# Binary target: True for every row whose TP_Label is anything other than the string '0'.
df['TP_Binary'] = df['TP_Label'].ne('0')

In [13]:
# Sanity check: list the distinct values present in the binary target.
df['TP_Binary'].unique()

array([False,  True])

In [16]:
# Column whose distinct values are one-hot encoded and ranked in the cells that follow.
colName = "EventID"

In [17]:
# One-hot encode the selected column: one indicator feature per distinct value.
x = pd.get_dummies(df[colName])

# RandomForestClassifier is already imported in the notebook's import cell.
# The seed is fixed so the fitted forest is reproducible between runs.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

rf.fit(x, df.TP_Binary)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=30, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [18]:
# Pair every dummy column (one per distinct value) with its random-forest importance.
importance_rows = list(zip(rf.feature_importances_, x.columns))
rfi = pd.DataFrame(importance_rows, columns=['importance', colName])

In [20]:
# Show the 20 values with the highest importance, largest first.
rfi.sort_values(by='importance', ascending=False).head(n=20)

Unnamed: 0,importance,EventID
57,0.829773,4103
11,0.035353,12
9,0.033548,10
12,0.018251,13
72,0.012022,4658
6,0.010894,7
76,0.010019,4663
84,0.007128,4690
70,0.006562,4656
45,0.006382,800


In [21]:
# chi2 returns (statistics, p-values); element 1 holds the p-values, one per dummy column.
chi2_pvalues = chi2(x, df.TP_Binary)[1]
c2i = pd.DataFrame(
    list(zip(chi2_pvalues, x.columns)),
    columns=['probability', colName],
)

In [22]:
# Show the 20 values with the smallest p-value, smallest first.
c2i.sort_values(by='probability', ascending=True).head(n=20)

Unnamed: 0,probability,EventID
57,0.0,4103
11,6.050097e-230,12
9,2.108551e-152,10
6,8.997044e-78,7
12,2.491794e-64,13
72,6.549415e-44,4658
110,4.450441e-32,5145
70,7.839717000000001e-23,4656
84,9.878382000000001e-23,4690
76,3.251865e-22,4663


In [23]:
# LogisticRegression is taken from the public ``sklearn.linear_model`` package.
# The private ``sklearn.linear_model.logistic`` module path used previously was
# deprecated in scikit-learn 0.22 and later removed, so importing from it
# raises ImportError on current releases.
# Fit a default-configured logistic regression on the one-hot features.
lr = LogisticRegression()
lr.fit(x, df.TP_Binary)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# lr.coef_ has one row for this binary problem; pair each coefficient with its dummy column.
coef_rows = list(zip(lr.coef_[0], x.columns))
lri = pd.DataFrame(coef_rows, columns=['coef', colName])

In [25]:
# Magnitude of each coefficient, so strong negative and strong positive weights rank together.
lri['coef_abs'] = lri['coef'].abs()

In [26]:
# Show the 20 values with the largest absolute coefficient, largest first.
lri.sort_values(by='coef_abs', ascending=False).head(n=20)

Unnamed: 0,coef,EventID,coef_abs
57,6.027801,4103,6.027801
110,4.351354,5145,4.351354
0,3.252657,1,3.252657
82,3.12543,4688,3.12543
9,-3.108962,10,3.108962
58,2.754523,4104,2.754523
72,-2.186635,4658,2.186635
11,-1.919571,12,1.919571
6,-1.821199,7,1.821199
21,1.743758,23,1.743758


In [27]:
# Switch the column under analysis; the cells below repeat the three rankings for it.
colName = "Channel"

In [28]:
# One-hot encode the selected column: one indicator feature per distinct value.
x = pd.get_dummies(df[colName])

# RandomForestClassifier is already imported in the notebook's import cell.
# The seed is fixed so the fitted forest is reproducible between runs.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf.fit(x, df.TP_Binary)

# Pair every dummy column with its importance and show the 20 largest.
importance_rows = list(zip(rf.feature_importances_, x.columns))
rfi = pd.DataFrame(importance_rows, columns=['importance', colName])

rfi.sort_values(by='importance', ascending=False).head(n=20)

Unnamed: 0,importance,Channel
1,0.804941,Microsoft-Windows-PowerShell/Operational
2,0.096542,Microsoft-Windows-Sysmon/Operational
10,0.054763,security
7,0.029262,Security
9,0.013875,Windows PowerShell
5,0.000242,Microsoft-Windows-WMI-Activity/Operational
8,0.000227,System
4,7.9e-05,Microsoft-Windows-TerminalServices-RemoteConnectionManager/Operational
3,3.4e-05,Microsoft-Windows-TerminalServices-LocalSessionManager/Operational
6,3.2e-05,Microsoft-Windows-Windows Firewall With Advanced Security/Firewall


In [29]:
# chi2 returns (statistics, p-values); element 1 holds the p-values, one per dummy column.
chi2_pvalues = chi2(x, df.TP_Binary)[1]
c2i = pd.DataFrame(
    list(zip(chi2_pvalues, x.columns)),
    columns=['probability', colName],
)
# Smallest p-values first.
c2i.sort_values(by='probability', ascending=True).head(n=20)

Unnamed: 0,probability,Channel
1,0.0,Microsoft-Windows-PowerShell/Operational
2,0.0,Microsoft-Windows-Sysmon/Operational
7,8.860335999999999e-90,Security
10,7.491915e-49,security
9,5.163780000000001e-22,Windows PowerShell
8,0.2056547,System
5,0.2081635,Microsoft-Windows-WMI-Activity/Operational
4,0.6073679,Microsoft-Windows-TerminalServices-RemoteConnectionManager/Operational
6,0.6748187,Microsoft-Windows-Windows Firewall With Advanced Security/Firewall
3,0.6906197,Microsoft-Windows-TerminalServices-LocalSessionManager/Operational


In [30]:
# LogisticRegression is taken from the public ``sklearn.linear_model`` package.
# The private ``sklearn.linear_model.logistic`` module path used previously was
# deprecated in scikit-learn 0.22 and later removed, so importing from it
# raises ImportError on current releases.
# Fit a default-configured logistic regression on the one-hot features.
lr = LogisticRegression()
lr.fit(x, df.TP_Binary)

# lr.coef_ has one row for this binary problem; pair each coefficient with its dummy column.
lri = pd.DataFrame(list(zip(lr.coef_[0], x.columns)),
                   columns=['coef', colName])

# Rank by magnitude so strong negative and strong positive weights sort together.
lri['coef_abs'] = lri.coef.abs()
lri.sort_values('coef_abs', ascending=False).head(20)

Unnamed: 0,coef,Channel,coef_abs
1,5.752134,Microsoft-Windows-PowerShell/Operational,5.752134
10,-2.109059,security,2.109059
9,-1.803022,Windows PowerShell,1.803022
2,-1.322964,Microsoft-Windows-Sysmon/Operational,1.322964
8,-0.159667,System,0.159667
5,-0.158323,Microsoft-Windows-WMI-Activity/Operational,0.158323
7,-0.126954,Security,0.126954
4,-0.033106,Microsoft-Windows-TerminalServices-RemoteConnectionManager/Operational,0.033106
6,-0.022469,Microsoft-Windows-Windows Firewall With Advanced Security/Firewall,0.022469
3,-0.020295,Microsoft-Windows-TerminalServices-LocalSessionManager/Operational,0.020295


In [31]:
# Switch the column under analysis; the cells below repeat the three rankings for it.
colName = "SourceName"

In [32]:
# One-hot encode the selected column: one indicator feature per distinct value.
x = pd.get_dummies(df[colName])

# RandomForestClassifier is already imported in the notebook's import cell.
# The seed is fixed so the fitted forest is reproducible between runs.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf.fit(x, df.TP_Binary)

# Pair every dummy column with its importance and show the 20 largest.
importance_rows = list(zip(rf.feature_importances_, x.columns))
rfi = pd.DataFrame(importance_rows, columns=['importance', colName])

rfi.sort_values(by='importance', ascending=False).head(n=20)

Unnamed: 0,importance,SourceName
17,0.714968,Microsoft-Windows-PowerShell
19,0.162618,Microsoft-Windows-Sysmon
18,0.070895,Microsoft-Windows-Security-Auditing
28,0.04855,PowerShell
23,0.001374,Microsoft-Windows-WMI-Activity
29,0.000319,Service Control Manager
13,0.000174,Microsoft-Windows-Kernel-General
9,0.000166,Microsoft-Windows-FilterManager
21,0.000156,Microsoft-Windows-TerminalServices-RemoteConnectionManager
25,0.00015,Microsoft-Windows-Windows Firewall With Advanced Security


In [33]:
# chi2 returns (statistics, p-values); element 1 holds the p-values, one per dummy column.
chi2_pvalues = chi2(x, df.TP_Binary)[1]
c2i = pd.DataFrame(
    list(zip(chi2_pvalues, x.columns)),
    columns=['probability', colName],
)
# Smallest p-values first.
c2i.sort_values(by='probability', ascending=True).head(n=20)

Unnamed: 0,probability,SourceName
17,0.0,Microsoft-Windows-PowerShell
19,0.0,Microsoft-Windows-Sysmon
18,2.154216e-136,Microsoft-Windows-Security-Auditing
28,5.163780000000001e-22,PowerShell
23,0.2081635,Microsoft-Windows-WMI-Activity
29,0.5735186,Service Control Manager
21,0.6073679,Microsoft-Windows-TerminalServices-RemoteConnectionManager
9,0.6458125,Microsoft-Windows-FilterManager
25,0.6748187,Microsoft-Windows-Windows Firewall With Advanced Security
13,0.6906197,Microsoft-Windows-Kernel-General


In [34]:
# LogisticRegression is taken from the public ``sklearn.linear_model`` package.
# The private ``sklearn.linear_model.logistic`` module path used previously was
# deprecated in scikit-learn 0.22 and later removed, so importing from it
# raises ImportError on current releases.
# Fit a default-configured logistic regression on the one-hot features.
lr = LogisticRegression()
lr.fit(x, df.TP_Binary)

# lr.coef_ has one row for this binary problem; pair each coefficient with its dummy column.
lri = pd.DataFrame(list(zip(lr.coef_[0], x.columns)),
                   columns=['coef', colName])

# Rank by magnitude so strong negative and strong positive weights sort together.
lri['coef_abs'] = lri.coef.abs()
lri.sort_values('coef_abs', ascending=False).head(20)

Unnamed: 0,coef,SourceName,coef_abs
17,5.325727,Microsoft-Windows-PowerShell,5.325727
28,-2.084512,PowerShell,2.084512
19,-1.744249,Microsoft-Windows-Sysmon,1.744249
18,-0.880299,Microsoft-Windows-Security-Auditing,0.880299
23,-0.219857,Microsoft-Windows-WMI-Activity,0.219857
29,-0.064114,Service Control Manager,0.064114
21,-0.054448,Microsoft-Windows-TerminalServices-RemoteConnectionManager,0.054448
9,-0.044404,Microsoft-Windows-FilterManager,0.044404
25,-0.037487,Microsoft-Windows-Windows Firewall With Advanced Security,0.037487
20,-0.03396,Microsoft-Windows-TerminalServices-LocalSessionManager,0.03396


In [36]:
def FeatureImportance_RF_Chi2_LR(colName, data=None, target='TP_Binary', top_n=20):
    """Rank the distinct values of one column by three relevance measures.

    The column is one-hot encoded with ``pd.get_dummies`` and each resulting
    indicator feature is scored against the binary target by:

    1. random-forest feature importance (largest first),
    2. the chi-squared test p-value (smallest first),
    3. the absolute logistic-regression coefficient (largest first).

    The three ranked tables are rendered with ``display``; nothing is returned.

    Note: ``pd.get_dummies`` does not create an indicator for missing values
    by default, so rows where ``colName`` is null stay in the data as all-zero
    feature rows and still take part in every fit.

    Parameters
    ----------
    colName : str
        Name of the column to encode and rank.
    data : pandas.DataFrame, optional
        Frame holding ``colName`` and ``target``. Defaults to the notebook's
        global ``df`` so existing single-argument calls behave as before.
    target : str, default 'TP_Binary'
        Name of the target column in ``data``.
    top_n : int, default 20
        Number of rows shown from each ranked table.

    Raises
    ------
    KeyError
        If ``colName`` or ``target`` is not a column of the frame.
    """
    frame = df if data is None else data
    x = pd.get_dummies(frame[colName])
    y = frame[target]  # read once and share across the three scorers
    levels = x.columns

    # Random Forest Feature Importance (seeded so the ranking is reproducible)
    rf = RandomForestClassifier(n_estimators=100,
                                n_jobs=-1,
                                oob_score=True,
                                bootstrap=True,
                                random_state=42,
                                max_depth=30)
    rf.fit(x, y)
    rfi = pd.DataFrame(list(zip(rf.feature_importances_, levels)),
                       columns=['importance', colName])
    display(rfi.sort_values('importance', ascending=False).head(top_n))

    # Chi Squared Feature Selection: chi2 returns (statistics, p-values)
    _, p_values = chi2(x, y)
    c2i = pd.DataFrame(list(zip(p_values, levels)),
                       columns=['probability', colName])
    display(c2i.sort_values('probability', ascending=True).head(top_n))

    # Logistic Regression Feature Importance
    lr = LogisticRegression()
    lr.fit(x, y)

    # coef_ has a single row for a binary target; rank by magnitude so that
    # strongly negative and strongly positive weights sort together.
    lri = pd.DataFrame(list(zip(lr.coef_[0], levels)),
                       columns=['coef', colName])
    lri['coef_abs'] = lri.coef.abs()
    display(lri.sort_values('coef_abs', ascending=False).head(top_n))

In [37]:
# Run the three rankings for the ObjectType column.
FeatureImportance_RF_Chi2_LR(colName='ObjectType')

Unnamed: 0,importance,ObjectType
4,0.719058,Key
3,0.256905,File
16,0.009484,Token
8,0.007767,SAM_DOMAIN
6,0.00132,Process
1,0.001216,-
13,0.000971,SERVICE OBJECT
10,0.00072,SAM_SERVER
12,0.000508,SC_MANAGER OBJECT
17,0.000469,Unknown


Unnamed: 0,probability,ObjectType
4,4.272616e-43,Key
3,1.58424e-17,File
16,0.1126215,Token
6,0.5630607,Process
1,0.5735186,-
13,0.6196045,SERVICE OBJECT
8,0.6698081,SAM_DOMAIN
10,0.6748187,SAM_SERVER
12,0.7074735,SC_MANAGER OBJECT
17,0.7255767,Unknown


Unnamed: 0,coef,ObjectType,coef_abs
4,-3.927714,Key,3.927714
3,1.650014,File,1.650014
16,-0.983607,Token,0.983607
6,-0.268845,Process,0.268845
1,-0.256909,-,0.256909
13,-0.207014,SERVICE OBJECT,0.207014
8,0.186505,SAM_DOMAIN,0.186505
10,-0.153394,SAM_SERVER,0.153394
12,-0.125054,SC_MANAGER OBJECT,0.125054
17,-0.110475,Unknown,0.110475


In [40]:
# Count the EventID values among rows that have a non-null ObjectType.
df.loc[df['ObjectType'].notna(), 'EventID'].value_counts()

4656    5497
4663    5337
4670     143
5145     121
4674      55
5140      42
4661      37
4662       7
Name: EventID, dtype: int64