In [1]:
import pyspark
import pandas as pd

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Start (or reuse, via getOrCreate) a Spark session named 'vb-app' against the
# 'local[*]' master, requesting 8g of driver memory.
# NOTE(review): spark.driver.memory presumably only takes effect when no Spark JVM
# is already running in this kernel — confirm after a kernel restart.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "8g")
    .appName('vb-app')
    .getOrCreate()
)

In [3]:
# Read the raw day-01 event data as JSON; no schema is supplied, so Spark derives
# the columns from the records themselves.
path = "../data/raw/wls_day-01"
df = spark.read.format("json").load(path)

In [4]:
# Show the column names of the DataFrame loaded from the JSON source.
df.columns

['AuthenticationPackage',
 'Destination',
 'DomainName',
 'EventID',
 'FailureReason',
 'LogHost',
 'LogonID',
 'LogonType',
 'LogonTypeDescription',
 'ParentProcessID',
 'ParentProcessName',
 'ProcessID',
 'ProcessName',
 'ServiceName',
 'Source',
 'Status',
 'SubjectDomainName',
 'SubjectLogonID',
 'SubjectUserName',
 'Time',
 'UserName',
 '_corrupt_record']

In [5]:
# Peek at the DomainName column (first 20 rows, long values truncated).
df.select(df['DomainName']).show(n=20, truncate=True)

+------------+
|  DomainName|
+------------+
|   Domain001|
|   Domain001|
|   Domain001|
|   Domain001|
|   Domain001|
|nt authority|
|nt authority|
|nt authority|
|   Domain001|
|   Domain001|
|   Domain001|
|nt authority|
|   Domain001|
|   Domain001|
|   Domain001|
|   Domain001|
|   Domain001|
|   Domain001|
|   Domain001|
|   Domain001|
+------------+
only showing top 20 rows



In [6]:
import datetime
import time

# Export three columns to a local CSV file by converting the selection to pandas.
# NOTE(review): toPandas() collects the whole selection onto the driver, and
# to_csv() with default arguments also writes the pandas row index as the first
# column — confirm both are intended for the full dataset.
file_name = 'lanl.csv'
(
    df.select('EventID', "ProcessName", "Time")
    .toPandas()
    .to_csv(file_name)
)

In [7]:
# df.groupBy('EventID').count().show()

In [8]:
# Number of rows per UserName (first 20 groups, long values truncated).
df.groupBy('UserName').count().show(n=20, truncate=True)

+-----------+-----+
|   UserName|count|
+-----------+-----+
|Comp334881$|  260|
|Comp193344$|  217|
|Comp626532$|  173|
|Comp423822$|  201|
|Comp071603$|  184|
|Comp194392$|  198|
|Comp079982$|  272|
|Comp102246$|  178|
|Comp924592$|  169|
|Comp332130$|  194|
|Comp629929$|  117|
|Comp989334$|  193|
|Comp851861$|  246|
|Comp854936$|  221|
|Comp282520$|  358|
|Comp349824$|  232|
|Comp156468$|  188|
|Comp528070$|  163|
|Comp811288$|  184|
|Comp925148$|  183|
+-----------+-----+
only showing top 20 rows



In [9]:
# df.groupBy('LogonType').count().show()

In [10]:
# df.groupBy('Source').count().show()

## Convert time into timestamp

In [11]:
# df.groupBy('Time').count().show()

In [12]:
# from pyspark.sql.functions import from_unixtime

# df = df.withColumn('timestamp', from_unixtime((df.Time.cast('bigint')/1000)).cast('timestamp'))


In [13]:
# Bring the first 10 rows back to the driver as Row objects to inspect the fields.
df.head(10)

[Row(AuthenticationPackage=None, Destination=None, DomainName='Domain001', EventID=4688, FailureReason=None, LogHost='Comp607982', LogonID='0x3e7', LogonType=None, LogonTypeDescription=None, ParentProcessID='0x2ac', ParentProcessName='services', ProcessID='0x1418', ProcessName='svchost.exe', ServiceName=None, Source=None, Status=None, SubjectDomainName=None, SubjectLogonID=None, SubjectUserName=None, Time=1, UserName='Comp607982$', _corrupt_record=None),
 Row(AuthenticationPackage=None, Destination=None, DomainName='Domain001', EventID=4688, FailureReason=None, LogHost='Comp991643', LogonID='0x3e7', LogonType=None, LogonTypeDescription=None, ParentProcessID='0x334', ParentProcessName='services', ProcessID='0xc0c', ProcessName='rundll32.exe', ServiceName=None, Source=None, Status=None, SubjectDomainName=None, SubjectLogonID=None, SubjectUserName=None, Time=1, UserName='Comp991643$', _corrupt_record=None),
 Row(AuthenticationPackage=None, Destination=None, DomainName='Domain001', EventID

In [14]:
# df.groupBy('timestamp').count().show()

## Create Window Functions (Fix Usernames)

### Total events per user over the trailing one-hour window
Ref: https://stackoverflow.com/questions/45806194/pyspark-rolling-average-using-timeseries-data

In [15]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
# Trailing range window per UserName: rows are ordered by Time (cast to long) and
# each row's frame reaches from 3600 below its own Time up to the current row.
# That is a one-hour look-back assuming Time is measured in seconds (TODO confirm).
w = (
    Window.partitionBy('UserName')
    .orderBy(F.col("Time").cast('long'))
    .rangeBetween(-3600, Window.currentRow)
)

# Keep only the columns needed downstream and attach, for every row, the count of
# non-null Time values inside that row's window.
new_df = (
    df.select("Time", "UserName", "EventID")
    .withColumn('total_events', F.count("Time").over(w))
)

In [16]:
# First 10 rows in ascending Time order, returned to the driver as Row objects.
new_df.sort("Time").head(10)

[Row(Time=None, UserName=None, EventID=None, total_events=0),
 Row(Time=1, UserName='Comp194392$', EventID=4634, total_events=1),
 Row(Time=1, UserName='Comp071603$', EventID=4688, total_events=3),
 Row(Time=1, UserName='Comp071603$', EventID=4688, total_events=3),
 Row(Time=1, UserName='Comp079982$', EventID=4688, total_events=1),
 Row(Time=1, UserName='Comp071603$', EventID=4688, total_events=3),
 Row(Time=1, UserName='Comp102246$', EventID=4688, total_events=2),
 Row(Time=1, UserName='Comp193344$', EventID=4688, total_events=3),
 Row(Time=1, UserName='Comp193344$', EventID=4688, total_events=3),
 Row(Time=1, UserName='Comp193344$', EventID=4688, total_events=3)]

## Partition by UserName and EventID

In [17]:
# Same trailing range frame as the per-user window (3600 below the current Time up
# to the current row), but partitioned by the (UserName, EventID) pair.
w = (
    Window.partitionBy('UserName', 'EventID')
    .orderBy(F.col("Time").cast('long'))
    .rangeBetween(-3600, Window.currentRow)
)

# Per-row count of non-null Time values within the user/event window.
new_df = new_df.withColumn(
    'total_per_event',
    F.count(F.col("Time")).over(w),
)

### Total number of Event 4624 per user in an hour window interval  (Successful Logon)


In [18]:
# Spot-check a single account: rows for user 'Comp502166$' with EventID 4624,
# alongside both windowed counts (first 20 rows).
new_df.where(
    (new_df['UserName'] == 'Comp502166$') & (new_df['EventID'] == 4624)
).show(n=20, truncate=True)

+----+-----------+-------+------------+---------------+
|Time|   UserName|EventID|total_events|total_per_event|
+----+-----------+-------+------------+---------------+
| 109|Comp502166$|   4624|          10|              1|
| 123|Comp502166$|   4624|          12|              2|
|1009|Comp502166$|   4624|          25|              3|
|1562|Comp502166$|   4624|          33|              5|
|1562|Comp502166$|   4624|          33|              5|
|1563|Comp502166$|   4624|          36|              6|
|1564|Comp502166$|   4624|          38|              7|
|1858|Comp502166$|   4624|          61|             16|
|1858|Comp502166$|   4624|          61|             16|
|1858|Comp502166$|   4624|          61|             16|
|1858|Comp502166$|   4624|          61|             16|
|1858|Comp502166$|   4624|          61|             16|
|1858|Comp502166$|   4624|          61|             16|
|1858|Comp502166$|   4624|          61|             16|
|1858|Comp502166$|   4624|          61|         

### Total number of Event 4625 per user in an hour window interval  (Failed Logon)

### Total number of Event 4627 per user in an hour window interval  (Group Membership)


### Total number of Event 4648 per user in an hour window interval  (A logon was attempted using explicit credentials)

### Total number of Event 4658 per user in an hour window interval  (handle to an object was closed)

### Total number of Event 4661 per user in an hour window interval  (A handle to an object was requested)

### Total number of Event 4672 per user in an hour window interval  (Special privileges assigned to new logon)

### Total number of Event 4768 per user in an hour window interval  (Kerberos Authentication)

### Total number of Event 4697 per user in an hour window interval  (A service was installed in the system)

### Total number of Event 4698 per user in an hour window interval  (A scheduled task was created)

### Total number of Event 4799 per user in an hour window interval  (A security-enabled local group membership was enumerated)

### Total number of Event 5140 per user in an hour window interval  (A network share object was accessed)

### Total number of Event 5145 per user in an hour window interval  (A network share object was checked to see whether client can be granted desired access)

### Total number of Event 5158 per user in an hour window interval  (The Windows Filtering Platform has permitted a bind to a local port)

## Concatenate datasets 