In [1]:
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import *
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [2]:
path = "../data/processed/benign_with_rolling_window.csv"
df = spark.read.csv(path).toDF(
  "time", "username", "event_id", "total_events", "total_per_event"
)
df.show()

+----+-----------+--------+------------+---------------+
|time|   username|event_id|total_events|total_per_event|
+----+-----------+--------+------------+---------------+
|   1|Comp071603$|    4688|           3|              3|
|   1|Comp079982$|    4688|           1|              1|
|   1|Comp102246$|    4688|           2|              2|
|   1|Comp193344$|    4688|           3|              3|
|   1|Comp194392$|    4634|           1|              1|
|   1|Comp332130$|    4688|           1|              1|
|   1|Comp334881$|    4688|           2|              2|
|   1|Comp423822$|    4688|           3|              3|
|   1|Comp626532$|    4688|           3|              3|
|   1|Comp629929$|    4688|           2|              2|
|   1|Comp924592$|    4688|           1|              1|
|   1|Comp002915$|    4688|           2|              2|
|   1|Comp287324$|    4624|           4|              2|
|   1|Comp287324$|    4634|           4|              2|
|   1|Comp386851$|    4688|    

In [3]:
keep_event_ids = [4624, 4625, 4627, 4648, 4658, 4661, 4672, 4697, 4698, 4768, 4779, 5140, 5145, 5158]

In [4]:
df.printSchema()

root
 |-- time: string (nullable = true)
 |-- username: string (nullable = true)
 |-- event_id: string (nullable = true)
 |-- total_events: string (nullable = true)
 |-- total_per_event: string (nullable = true)



In [None]:
df = df.withColumn("event_id", df["event_id"].cast(IntegerType()))
df = df.withColumn("total_events", df["total_events"].cast(IntegerType()))
df = df.withColumn("total_per_event", df["total_per_event"].cast(IntegerType()))

In [None]:
df.printSchema()

In [7]:
df = df.filter(df.event_id.isin(keep_event_ids))
df.show()

+----+-----------+--------+------------+---------------+
|time|   username|event_id|total_events|total_per_event|
+----+-----------+--------+------------+---------------+
|   1|Comp287324$|    4624|           4|              2|
|   1|Comp362621$|    4624|           6|              1|
|   1|Comp939275$|    4624|          17|              8|
|   1|Comp939275$|    4672|          17|              8|
|   1| User641851|    4624|          20|             11|
|   1| User641851|    4672|          20|              6|
|   1| User032516|    4624|          32|              6|
|   1| User032516|    4648|          32|              6|
|   1| User032516|    4672|          32|              6|
|   1|Comp347730$|    4624|          13|              7|
|   1| User668517|    4625|           2|              2|
|   1|Comp767914$|    4768|           3|              1|
|   1| User643724|    4624|          36|             10|
|   1|    Scanner|    4624|          46|             12|
|   1|    Scanner|    4672|    

In [10]:
# Pivot:
df = df.groupBy("time", "username").pivot("event_id").sum("total_events")
df.show()

+-----+--------------------+------+------+------+------+------+
| time|            username|  4624|  4625|  4648|  4672|  4768|
+-----+--------------------+------+------+------+------+------+
|  798|         Comp645556$|    26|  null|  null|  null|  null|
| 1024|         Comp309142$|    49|  null|    49|    49|    49|
| 2255|         Comp916004$|106340|  null|  null|106340|  null|
| 2336|          User833985|  null|  null|  null|  null|  2472|
| 3031|          User643724| 16933|  null|  null| 16933|  null|
| 4756|         Comp897367$| 36260|  null|  null|  null|  null|
| 5766|         Comp234982$|   344|  null|  null|  null|  null|
| 8095|          User467268|   536|  null|  null|  null|  null|
| 8785|EnterpriseAppServer$| 22525|  null| 22525| 22525| 22525|
| 8870|          User860895|   244|  null|  null|  null|  null|
| 9103|          AppService| 95466| 95466|  null| 95466|  null|
|10431|          User897275|106554|  null|106554|  null|106554|
|10498|         Comp377346$|    83|  nul

In [11]:
df.printSchema()

root
 |-- time: string (nullable = true)
 |-- username: string (nullable = true)
 |-- 4624: long (nullable = true)
 |-- 4625: long (nullable = true)
 |-- 4648: long (nullable = true)
 |-- 4672: long (nullable = true)
 |-- 4768: long (nullable = true)



In [None]:
# Standardize:
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
columns_to_scale = ["4624", "4625", "4648", "4672", "4768"]
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(df)
scaledData = scalerModel.transform(df)