In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the pivoted event-count table. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/merged_dataset_pivoted.csv',
)
# Preview the first five rows.
df.head(5)

FileNotFoundError: [Errno 2] File ../data/processed/merged_dataset_pivoted.csv does not exist: '../data/processed/merged_dataset_pivoted.csv'

In [None]:
# Build the model inputs.
# Everything from the fifth column onwards is treated as a feature column.
columns = list(df.columns)
features = columns[4:]

# Express each feature as a share of the row's total event count.
# The result is computed into X without writing back into `df`, so re-running
# this cell cannot divide the same columns a second time.
X = df[features].div(df["total_events"], axis=0).values

# 1-D label vector (scikit-learn / imbalanced-learn expect shape (n_samples,)).
y = df["is_malicious"].values

print(X)
print(y)

In [None]:
# Standardise every feature column of X (the scaler is fit on all rows of X).
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
pcs = pca.fit_transform(X)

# Keep the two components in a frame with one named column each.
pca_df = pd.DataFrame({'pc1': pcs[:, 0], 'pc2': pcs[:, 1]})

In [None]:
# Scatter plot of the two principal components, coloured by class label.
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('Features in 2D', fontsize = 20)

targets = [0, 1]
colors = ['r', 'b']
# Use a plain boolean array so row selection is positional and does not
# depend on df and pca_df sharing the same index.
is_malicious = df['is_malicious'].to_numpy()
for target, color in zip(targets, colors):
    indicesToKeep = is_malicious == target
    ax.scatter(pca_df.loc[indicesToKeep, 'pc1'],
               pca_df.loc[indicesToKeep, 'pc2'],
               c = color,
               s = 50,
               label = f"is_malicious = {target}")

# Without a legend the two colours cannot be told apart by the reader.
ax.legend()
ax.grid()

In [None]:
# Hold out the test set BEFORE oversampling.
# SMOTE interpolates new minority samples from existing rows; resampling the
# full data set first would put synthetic points in the test set and let
# training samples be derived from test rows, inflating every score below.
y_flat = np.ravel(y)  # accept either an (n, 1) column or a 1-D label vector
X_train, X_test_os, y_train, y_test_os = train_test_split(
    X, y_flat, test_size = 0.2, random_state = 2,
    stratify = y_flat,  # keep the class ratio the same in both partitions
)

# Oversample the minority class in the training partition only.
smote = SMOTE(random_state=0, sampling_strategy="minority")
X_os, y_os = smote.fit_resample(X_train, y_train)
X_train_os, y_train_os = X_os, y_os

In [None]:
# Decision Tree Classifier

# Fixed seed so the fitted tree (and the scores below) are reproducible.
clf_os = DecisionTreeClassifier(random_state=0)

# Train, timing the fit the same way the other model cells do.
s = time.time()
clf_os.fit(X_train_os, y_train_os)
e = time.time()
print("Dec tree training time:", e-s)

# Predict the response for the test dataset
s = time.time()
y_pred_os = clf_os.predict(X_test_os)
e = time.time()
print("Dec tree testing time:", e-s)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# K Neighbors Classifier (k = 3)
neigh = KNeighborsClassifier(n_neighbors=3)

# Fit on the training partition and report wall-clock fit time.
tic = time.time()
neigh.fit(X_train_os, y_train_os.ravel())
toc = time.time()
print("K-neigh training time:", toc - tic)

# Predict the held-out partition and report wall-clock prediction time.
tic = time.time()
y_pred_os = neigh.predict(X_test_os)
toc = time.time()
print("K-neigh testing time:", toc - tic)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label=label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label=label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# Logistic-regression coefficients used as a rough feature-importance score.
print("Logistic Regression")

# This cell previously referenced `model_logreg`, `feature_names` and `Sort`,
# none of which are defined anywhere in the notebook, and it runs before any
# logistic regression has been fitted. Fit a dedicated model here so the cell
# works on a fresh kernel.
model_logreg = LogisticRegression()
model_logreg.fit(X_train_os, np.ravel(y_train_os))

# X was built from `features`, so coefficient i belongs to features[i].
feature_names = features
importance_logreg = model_logreg.coef_[0]

# summarize feature importance
logreg_list = []
for i, v in enumerate(importance_logreg):
    print('Feature: %0d, Score: %.5f' % (i, v))
    logreg_list.append([feature_names[i], round(v, 5)])

# plot feature importance (one bar per coefficient, in feature order)
plt.bar(range(len(importance_logreg)), importance_logreg)
plt.show()

# NOTE(review): the original called an undefined `Sort` helper; this assumes
# it ordered the [name, score] pairs by score, ascending — confirm.
logreg_list = sorted(logreg_list, key=lambda pair: pair[1])
# Skip the first six entries of the sorted list, as the original slice did.
print(logreg_list[6:])

In [None]:
# Logistic Regression (default hyper-parameters)
model = LogisticRegression()

# Fit on the training partition and report wall-clock fit time.
tic = time.time()
model.fit(X_train_os, y_train_os)
toc = time.time()
print("Log reg training time:", toc - tic)

# Predict the held-out partition and report wall-clock prediction time.
tic = time.time()
y_pred_os = model.predict(X_test_os)
toc = time.time()
print("Log reg testing time:", toc - tic)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label=label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label=label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# Show the class labels currently held in `classes` (set by the preceding
# confusion-matrix cell).
print(classes)

In [None]:
# Random Forest:

# Fixed seed: the forest bootstraps rows and samples features at random, so
# without it the scores below change from run to run.
random_model = RandomForestClassifier(random_state=0)

# Fit
s = time.time()
random_model.fit(X_train_os, y_train_os)
e = time.time()
print("Rand for training time:",e-s)

# Predict
s = time.time()
y_pred_os = random_model.predict(X_test_os)
e = time.time()
print("Rand for testing time:",e-s)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# Gradient Boosting Classifier

# Fixed seed so the fitted ensemble (and the scores below) are reproducible.
gb = GradientBoostingClassifier(random_state=0)

# Fit
s = time.time()
gb.fit(X_train_os, y_train_os)
e = time.time()
print("Gradient boost training time:",e-s)

# Predict
s = time.time()
y_pred_os = gb.predict(X_test_os)
e = time.time()
print("Gradient boost testing time:",e-s)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# NOTE(review): disabled SVC experiment. Every line in this cell is commented
# out, so nothing here executes and y_pred_os is left untouched.
# # sklearn.svm.SVC (Support Vector Classification)

# svc = SVC(gamma="auto")

# svc.fit(X_train_os, y_train_os)

# # Predict
# y_pred_os = svc.predict(X_test_os)

# # Check Prediction
# print(classification_report(y_test_os, y_pred_os))
# # print(confusion_matrix(y_test_os, y_pred_os))
# print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
# print("Label 0:")
# print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = 0))
# print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = 0))
# print("Label 1:")
# print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = 1))
# print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = 1))

In [None]:
# NOTE(review): disabled confusion-matrix plot for the SVC experiment.
# Every line in this cell is commented out, so nothing here executes.
# classes = np.unique(y_pred_os)
# fig,ax = plt.subplots()
# cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False)
# ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
# ax.set_yticklabels(labels=classes,rotation=0)
# plt.show()

### Creation of features for chain of events for attacks

In [None]:
# Reload the table from disk so the event columns used below hold the values
# stored in the CSV, regardless of what earlier cells did to `df`.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/merged_dataset_pivoted.csv',
)
# Preview the first five rows.
df.head(5)

#### Lateral Movement (Link: https://www.rapid7.com/resources/using-windows-event-logs-to-detect-lateral-movement/):
Using Windows event logs to detect lateral movement:

Authentication Events (all):
1. Event_528 <-- Successful Login
2. Event_529 <-- Unsuccessful Login
3. Event_4624 and Event_4625 <-- Two methods of Lateral Movement (Windows NT5 and NT6 Operating Systems)

- SMB: 552, 4648
- Scheduled Tasks: 602, 4698
- PS Exec: 601, 4697 <-- System admin tool used to execute code remotely
- SSH: app logs <-- Less common in windows environment

In [None]:
# Preview the lateral-movement event columns that are present in the table.
lateral_movement_cols = ["event_4624", "event_4625", "event_4648", "event_4698"]
df.loc[:, lateral_movement_cols].head()

In [None]:
# Combined authentication-event count: event 4624 plus event 4625.
# Author's original note kept for reference: ".Logon Type:[\W](3|10).*"
logon_events = df["event_4624"]
df["total_authN_events"] = logon_events.add(df["event_4625"])
df.head()

- PS Exec not there
- No columns for "event_552", "event_528", "event_529", "event_601", "event_602", and "event_4697"
- Have some of the cases for SMB, Scheduled Tasks <-- Look into regenerating dataset to get features
- Have some of the cases for authentication events <-- Same as above

#### Ransomware, malware and cobalt strike (Link: https://www.beyondtrust.com/blog/entry/windows-server-events-monitor):

Ransomware:
- event_8
or
- event_22 <-- Not Contained in the dataframe

Hacker Presence:

- event_104 <-- Event Log was Cleared
- event_1102 <-- Audit Log was Cleared
- event_4719 <-- System Audit Policy Changed

In [None]:
# List the current feature columns: everything from the fifth column onwards,
# including any columns added to `df` by earlier cells.
columns = df.columns.tolist()
features = columns[4:]
print(features)

#### APT

- event_4674 <-- Account Name, Service, Process, Object
- event_4688 <-- Account Name, Process

Type Ratio:
- event_4624 <-- Logon
- event_4627 <-- Group Membership
- event_4658 <-- Handle to an object
- event_4768 <-- Kerberos AuthN
- event_4769 <-- Kerberos AuthN - Services
- event_4672 <-- Assignment of Admin Rights
- event_4776 <-- Credential Validation (NTLM)

No event ids found.

#### Detect Pass the Hash Attacks (Link: https://stealthbits.com/blog/how-to-detect-pass-the-hash-attacks/)

Workstation Logs (source host):
- event_4648
- event_4624
- event_4672

- sysmon event 10

Target Server Logs (target host):
- event_4624
- event_4672

Domain Controller:
- event 4768
- event 4769
- event 4776

In [None]:
# Preview the event columns used for the pass-the-hash feature.
pass_the_hash_cols = ["event_4624", "event_4672", "event_4648", "event_10"]
df.loc[:, pass_the_hash_cols].head()

In [None]:
# Pass-the-hash indicator: per-row sum of events 4648, 4624, 4672 and 10,
# added in the same left-to-right order as a plain `+` chain.
df["hash_attack"] = (
    df["event_4648"]
    .add(df["event_4624"])
    .add(df["event_4672"])
    .add(df["event_10"])
)
df.head()

#### Common Incident Response Scenario - Phishing (Link: https://www.netscylla.com/blog/2020/02/01/Threat-hunting-with-Windows-Event-Logs.html):

- event_1 <-- Process Creation
- event_11 <-- FileCreate
- event_15 <-- FileCreateStreamHash

In [None]:
# Phishing indicator: per-row sum of event 1 and event 11.
# NOTE(review): only event_1 and event_11 are summed here; event_15 is listed
# in the notes for this scenario but is not used — confirm whether the column
# exists in the data set.
df["phishing"] = df["event_1"] + df["event_11"]

# The distinct values were previously computed and then discarded, because
# only a cell's last expression is displayed. Print them explicitly.
print(pd.unique(df["phishing"].values.ravel()))
df.head()

In [None]:
# Build the model inputs, now including the engineered columns added to `df`.
# Everything from the fifth column onwards is treated as a feature column.
columns = list(df.columns)
features = columns[4:]

# Express each feature as a share of the row's total event count.
# The result is computed into X without writing back into `df`, so re-running
# this cell cannot divide the same columns a second time.
X = df[features].div(df["total_events"], axis=0).values

# 1-D label vector (scikit-learn / imbalanced-learn expect shape (n_samples,)).
y = df["is_malicious"].values

print(X)
print(y)

In [None]:
# Standardise every feature column of X (the scaler is fit on all rows of X).
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
pcs = pca.fit_transform(X)

# Keep the two components in a frame with one named column each.
pca_df = pd.DataFrame({'pc1': pcs[:, 0], 'pc2': pcs[:, 1]})

In [None]:
# Scatter plot of the two principal components, coloured by class label.
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('Features in 2D', fontsize = 20)

targets = [0, 1]
colors = ['r', 'b']
# Use a plain boolean array so row selection is positional and does not
# depend on df and pca_df sharing the same index.
is_malicious = df['is_malicious'].to_numpy()
for target, color in zip(targets, colors):
    indicesToKeep = is_malicious == target
    ax.scatter(pca_df.loc[indicesToKeep, 'pc1'],
               pca_df.loc[indicesToKeep, 'pc2'],
               c = color,
               s = 50,
               label = f"is_malicious = {target}")

# Without a legend the two colours cannot be told apart by the reader.
ax.legend()
ax.grid()

In [None]:
# Hold out the test set BEFORE oversampling.
# SMOTE interpolates new minority samples from existing rows; resampling the
# full data set first would put synthetic points in the test set and let
# training samples be derived from test rows, inflating every score below.
y_flat = np.ravel(y)  # accept either an (n, 1) column or a 1-D label vector
X_train, X_test_os, y_train, y_test_os = train_test_split(
    X, y_flat, test_size = 0.2, random_state = 2,
    stratify = y_flat,  # keep the class ratio the same in both partitions
)

# Oversample the minority class in the training partition only.
smote = SMOTE(random_state=0, sampling_strategy="minority")
X_os, y_os = smote.fit_resample(X_train, y_train)
X_train_os, y_train_os = X_os, y_os

In [None]:
# Decision Tree Classifier

# Fixed seed so the fitted tree (and the scores below) are reproducible.
clf_os = DecisionTreeClassifier(random_state=0)

# Train Decision Tree Classifier
s = time.time()
clf_os.fit(X_train_os, y_train_os)
e = time.time()
print("Dec tree training time:",e-s)

# Predict the response for the test dataset
s = time.time()
y_pred_os = clf_os.predict(X_test_os)
e = time.time()
# Label corrected: it previously read "Dec tree boost testing time", copied
# from the gradient-boosting cell.
print("Dec tree testing time:",e-s)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# K Neighbors Classifier (k = 3)
neigh = KNeighborsClassifier(n_neighbors=3)

# Fit on the training partition and report wall-clock fit time.
tic = time.time()
neigh.fit(X_train_os, y_train_os.ravel())
toc = time.time()
print("Kneigh training time:", toc - tic)

# Predict the held-out partition and report wall-clock prediction time.
tic = time.time()
y_pred_os = neigh.predict(X_test_os)
toc = time.time()
print("Kneigh testing time:", toc - tic)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label=label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label=label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# Logistic Regression (default hyper-parameters)
model = LogisticRegression()

# Fit on the training partition and report wall-clock fit time.
tic = time.time()
model.fit(X_train_os, y_train_os)
toc = time.time()
print("Log reg training time:", toc - tic)

# Predict the held-out partition and report wall-clock prediction time.
tic = time.time()
y_pred_os = model.predict(X_test_os)
toc = time.time()
print("Log reg testing time:", toc - tic)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label=label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label=label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# Random Forest:

# Fixed seed: the forest bootstraps rows and samples features at random, so
# without it the scores below change from run to run.
random_model = RandomForestClassifier(random_state=0)

# Fit
s = time.time()
random_model.fit(X_train_os, y_train_os)
e = time.time()
print("Rand for training time:",e-s)

# Predict
s = time.time()
y_pred_os = random_model.predict(X_test_os)
e = time.time()
print("Rand for testing time:",e-s)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# Gradient Boosting Classifier

# Fixed seed so the fitted ensemble (and the scores below) are reproducible.
gb = GradientBoostingClassifier(random_state=0)

# Fit
s = time.time()
gb.fit(X_train_os, y_train_os)
e = time.time()
print("Gradient boost training time:",e-s)

# Predict
s = time.time()
y_pred_os = gb.predict(X_test_os)
e = time.time()
print("Gradient boost testing time:",e-s)

# Check Prediction: full report, overall accuracy, then per-class scores.
print(classification_report(y_test_os, y_pred_os))
print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
for label in (0, 1):
    print(f"Label {label}:")
    print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = label))
    print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = label))

In [None]:
# Confusion matrix for the most recent predictions in y_pred_os.
# Take the label set from truth AND predictions: using predictions alone
# drops a class entirely whenever the model never predicts it.
classes = np.unique(np.concatenate([np.ravel(y_test_os), np.ravel(y_pred_os)]))
fig,ax = plt.subplots()
cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# Draw on the axes created above and label both axes with the class values.
sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False,
            xticklabels=list(classes),yticklabels=list(classes),ax=ax)
ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
ax.set_yticklabels(labels=classes,rotation=0)
plt.show()

In [None]:
# NOTE(review): disabled SVC experiment. Every line in this cell is commented
# out, so nothing here executes and y_pred_os is left untouched.
# # sklearn.svm.SVC (Support Vector Classification)

# svc = SVC(gamma="auto")

# svc.fit(X_train_os, y_train_os)

# # Predict
# y_pred_os = svc.predict(X_test_os)

# # Check Prediction
# print(classification_report(y_test_os, y_pred_os))
# # print(confusion_matrix(y_test_os, y_pred_os))
# print("Accuracy:", accuracy_score(y_test_os, y_pred_os))
# print("Label 0:")
# print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = 0))
# print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = 0))
# print("Label 1:")
# print("Precision:", precision_score(y_test_os, y_pred_os, pos_label = 1))
# print("Recall:", recall_score(y_test_os, y_pred_os, pos_label = 1))

In [None]:
# NOTE(review): disabled confusion-matrix plot for the SVC experiment.
# Every line in this cell is commented out, so nothing here executes.
# classes = np.unique(y_pred_os)
# fig,ax = plt.subplots()
# cm = metrics.confusion_matrix(y_test_os,y_pred_os,labels=classes)
# sns.heatmap(cm, annot=True,fmt='d',cmap=plt.cm.Blues,cbar=False)
# ax.set(xlabel="Pred",ylabel="True",title="Confusion Matrix")
# ax.set_yticklabels(labels=classes,rotation=0)
# plt.show()