In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import *
# Build the local Spark configuration and obtain the session/context.
# `getOrCreate` reuses an already-running context when this cell is executed
# again in the same kernel; constructing SparkContext directly would raise
# "Cannot run multiple SparkContexts at once" on a re-run.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for any later cell that expects the raw SparkContext.
sc = spark.sparkContext

In [2]:
# Read the CSV (no header option is set, so the first line is treated as data)
# and then assign names to the five positional columns.
path = "../data/processed/benign_with_rolling_window.csv"
df = spark.read.csv(path)
df = df.toDF("time", "username", "event_id", "total_events", "total_per_event")
df.show()

+----+-----------+--------+------------+---------------+
|time|   username|event_id|total_events|total_per_event|
+----+-----------+--------+------------+---------------+
|   1|Comp071603$|    4688|           3|              3|
|   1|Comp079982$|    4688|           1|              1|
|   1|Comp102246$|    4688|           2|              2|
|   1|Comp193344$|    4688|           3|              3|
|   1|Comp194392$|    4634|           1|              1|
|   1|Comp332130$|    4688|           1|              1|
|   1|Comp334881$|    4688|           2|              2|
|   1|Comp423822$|    4688|           3|              3|
|   1|Comp626532$|    4688|           3|              3|
|   1|Comp629929$|    4688|           2|              2|
|   1|Comp924592$|    4688|           1|              1|
|   1|Comp002915$|    4688|           2|              2|
|   1|Comp287324$|    4624|           4|              2|
|   1|Comp287324$|    4634|           4|              2|
|   1|Comp386851$|    4688|    

In [3]:
# Event IDs to retain; rows with any other event_id are filtered out later.
keep_event_ids = [
    4624, 4625, 4627, 4648, 4658, 4661, 4672,
    4697, 4698, 4768, 4779, 5140, 5145, 5158,
]

In [4]:
# Inspect the column types right after loading: the CSV was read without a
# schema, so every column comes back as a string.
df.printSchema()

root
 |-- time: string (nullable = true)
 |-- username: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- total_events: string (nullable = true)
 |-- total_per_event: string (nullable = true)



In [5]:
# Convert the numeric columns from string to integer, replacing each column
# in place so the column order is unchanged.
for int_col in ("event_id", "total_events", "total_per_event"):
    df = df.withColumn(int_col, df[int_col].cast(IntegerType()))

In [6]:
# Verify the casts: event_id, total_events and total_per_event should now be
# integers, while time and username remain strings.
df.printSchema()

root
 |-- time: string (nullable = true)
 |-- username: string (nullable = true)
 |-- event_id: integer (nullable = true)
 |-- total_events: integer (nullable = true)
 |-- total_per_event: integer (nullable = true)



In [7]:
# Keep only the rows whose event_id appears in keep_event_ids.
df = df.where(df["event_id"].isin(keep_event_ids))
df.show()

+----+-----------+--------+------------+---------------+
|time|   username|event_id|total_events|total_per_event|
+----+-----------+--------+------------+---------------+
|   1|Comp287324$|    4624|           4|              2|
|   1|Comp362621$|    4624|           6|              1|
|   1|Comp939275$|    4624|          17|              8|
|   1|Comp939275$|    4672|          17|              8|
|   1| User641851|    4624|          20|             11|
|   1| User641851|    4672|          20|              6|
|   1| User032516|    4624|          32|              6|
|   1| User032516|    4648|          32|              6|
|   1| User032516|    4672|          32|              6|
|   1|Comp347730$|    4624|          13|              7|
|   1| User668517|    4625|           2|              2|
|   1|Comp767914$|    4768|           3|              1|
|   1| User643724|    4624|          36|             10|
|   1|    Scanner|    4624|          46|             12|
|   1|    Scanner|    4672|    

In [8]:
# Pivot: one output row per (time, username, total_events) group, with one
# column per distinct event_id holding the summed total_per_event.
df = (
    df.groupBy("time", "username", "total_events")
    .pivot("event_id")
    .sum("total_per_event")
)
df.show()

+-----+----------------+------------+-----+----+----+-----+----+
| time|        username|total_events| 4624|4625|4648| 4672|4768|
+-----+----------------+------------+-----+----+----+-----+----+
|  122|     Comp954789$|           8|    2|null|null| null|null|
|  654|     Comp306263$|          43|   16|null|null| null|null|
| 2392|     Comp960825$|          29|    4|null|null| null|null|
| 2613|      User031784|        1180| null| 586|null| null|   5|
| 2825|      AppService|      102723|27252|1707|null|25391|null|
| 2906|     Comp916004$|      140474|42742|null|null|42729|null|
| 3575|     Comp692745$|          39|    6|null|null| null|null|
| 3855|     Comp870075$|          82|   20|null|null|    2|null|
| 5455|ActiveDirectory$|       29103|10203|null|null| 8549|null|
| 5895|      User793535|       14137| 6896|null|null|   60|null|
| 6033|      AppService|      122147|31699|null|null| null|null|
| 6112|      User974844|         655|  138| 121|null| null|null|
| 6756|      AppService| 

In [9]:
# Check the post-pivot schema: the grouping columns followed by one column
# per pivoted event_id value.
df.printSchema()

root
 |-- time: string (nullable = true)
 |-- username: string (nullable = true)
 |-- total_events: integer (nullable = true)
 |-- 4624: long (nullable = true)
 |-- 4625: long (nullable = true)
 |-- 4648: long (nullable = true)
 |-- 4672: long (nullable = true)
 |-- 4768: long (nullable = true)



In [10]:
# A group with no rows for a given event_id gets null in that pivot column;
# replace those nulls with 0 in the event-count columns.
df = df.fillna(0, subset=["4624", "4625", "4648", "4672", "4768"])
df.show()

+-----+----------------+------------+-----+----+----+-----+----+
| time|        username|total_events| 4624|4625|4648| 4672|4768|
+-----+----------------+------------+-----+----+----+-----+----+
|  122|     Comp954789$|           8|    2|   0|   0|    0|   0|
|  654|     Comp306263$|          43|   16|   0|   0|    0|   0|
| 2392|     Comp960825$|          29|    4|   0|   0|    0|   0|
| 2613|      User031784|        1180|    0| 586|   0|    0|   5|
| 2825|      AppService|      102723|27252|1707|   0|25391|   0|
| 2906|     Comp916004$|      140474|42742|   0|   0|42729|   0|
| 3575|     Comp692745$|          39|    6|   0|   0|    0|   0|
| 3855|     Comp870075$|          82|   20|   0|   0|    2|   0|
| 5455|ActiveDirectory$|       29103|10203|   0|   0| 8549|   0|
| 5895|      User793535|       14137| 6896|   0|   0|   60|   0|
| 6033|      AppService|      122147|31699|   0|   0|    0|   0|
| 6112|      User974844|         655|  138| 121|   0|    0|   0|
| 6756|      AppService| 

In [11]:
# Select the rows belonging to the four listed usernames.
split_df = df.where(
    df["username"].isin(
        ["User793535", "User031784", "User318330", "User974844"]
    )
)
split_df.show()

+-----+----------+------------+----+----+----+----+----+
| time|  username|total_events|4624|4625|4648|4672|4768|
+-----+----------+------------+----+----+----+----+----+
| 2613|User031784|        1180|   0| 586|   0|   0|   5|
| 5895|User793535|       14137|6896|   0|   0|  60|   0|
| 6112|User974844|         655| 138| 121|   0|   0|   0|
| 9461|User793535|       12502|5995|   0|   0|   0|   0|
|18277|User318330|        5192|1986|   0|   0|   0|   0|
|31588|User031784|        1644|   0| 814|   0|   0|   0|
|50140|User793535|       14757|7347|   0|   0|   0|   0|
|33765|User031784|        1926|  82|   0|   0|   0|   0|
|40373|User974844|         624| 125| 121|   0|   0|   0|
|73206|User031784|        1661|   0| 824|   0|   0|   0|
|75224|User793535|       15959|7940|   0|   0|   0|   0|
| 7402|User974844|         653| 141| 121|   0|   0|   0|
|60599|User793535|       15826|7848|   0|   0|   0|   0|
|76764|User318330|        4607|1687|   0|   0|   0|   0|
| 2716|User031784|        1222|

In [12]:
# split_df.toPandas().to_csv('mycsv.csv')
# split_df.write.option("header",True).csv("Desktop/Team Elysium/anomaly_detection_active_directory_logs/data/processed/pivotdata.csv")

In [13]:
# Min-max scaling (rescale each feature column with MinMaxScaler):
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def _first_component_rounded(vector):
    """Return the first element of ``vector`` as a float rounded to 3 decimals."""
    return round(float(list(vector)[0]), 3)


# Spark UDF that turns a vector-valued column into a plain double column by
# taking its first element.
unlist = udf(_first_component_rounded, returnType=DoubleType())

# columns_to_scale = ["total_events", "4624", "4625", "4648", "4672", "4768"]

# assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]

# scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
# pipeline = Pipeline(stages=assemblers + scalers)
# scalerModel = pipeline.fit(df)
# scaledData = scalerModel.transform(df)

In [14]:
# Min-max scale every feature column independently and expose each result as a
# plain double column named "<col>_Scaled" (rounded to 3 decimals by `unlist`).
feature_cols = ["total_events", "4624", "4625", "4648", "4672", "4768"]

for i in feature_cols:
    # Drop any "<col>_Scaled" left over from a previous run of this cell:
    # MinMaxScaler refuses to write to an output column that already exists,
    # so without this the cell fails when re-executed. Dropping a column that
    # is not present is a no-op.
    split_df = split_df.drop(i + "_Scaled")

    # MinMaxScaler works on vector columns, so wrap the single column first.
    assembler = VectorAssembler(inputCols=[i], outputCol=i + "_Vect")
    scaler = MinMaxScaler(inputCol=i + "_Vect", outputCol=i + "_Scaled")
    pipeline = Pipeline(stages=[assembler, scaler])

    scaled = pipeline.fit(split_df).transform(split_df)
    # Unwrap the one-element vector back to a double and discard the
    # intermediate vector column.
    split_df = scaled.withColumn(i + "_Scaled", unlist(i + "_Scaled")).drop(i + "_Vect")

print("After Scaling:")
split_df.show()

After Scaling:
+-----+----------+------------+----+----+----+----+----+-------------------+-----------+-----------+-----------+-----------+-----------+
| time|  username|total_events|4624|4625|4648|4672|4768|total_events_Scaled|4624_Scaled|4625_Scaled|4648_Scaled|4672_Scaled|4768_Scaled|
+-----+----------+------------+----+----+----+----+----+-------------------+-----------+-----------+-----------+-----------+-----------+
| 2613|User031784|        1180|   0| 586|   0|   0|   5|              0.062|        0.0|       0.65|        0.0|        0.0|       0.25|
| 5895|User793535|       14137|6896|   0|   0|  60|   0|               0.75|      0.761|        0.0|        0.0|      0.698|        0.0|
| 6112|User974844|         655| 138| 121|   0|   0|   0|              0.035|      0.015|      0.134|        0.0|        0.0|        0.0|
| 9461|User793535|       12502|5995|   0|   0|   0|   0|              0.663|      0.661|        0.0|        0.0|        0.0|        0.0|
|18277|User318330|        