In [1]:
import pyspark
import numpy as np
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit, rand, row_number
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler

In [2]:
# Build (or reuse) the SparkSession shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    # memory sizing for the executors and the driver
    .config("spark.memory.fraction", 0.8)
    .config("spark.executor.memory", "14g")
    .config("spark.driver.memory", "12g")
    # number of partitions used for shuffles in joins/aggregations
    .config("spark.sql.shuffle.partitions", "800")
    # lets Spark run joins that have no join condition (Cartesian products)
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)

In [3]:
# Devices for which Mirai attack captures are loaded.
mirai_devices = [
    'danmini_doorbell',
    'ecobee_thermostat',
    'philips_B120N10_baby_monitor',
    'provision_PT_737E_security_camera',
    'provision_PT_838_security_camera',
    'simplehome_XCS_1002_WHT_security_camera',
    'simplehome_XCS_1003_WHT_security_camera',
]

# Devices for which benign traffic captures are loaded.
benign = [
    'danmini_doorbell',
    'ecobee_thermostat',
    'ennio_doorbell',
    'philips_B120N10_baby_monitor',
    'provision_PT_737E_security_camera',
    'provision_PT_838_security_camera',
    'samsung_SNH_1011_N_webcam',
    'simplehome_XCS_1002_WHT_security_camera',
    'simplehome_XCS_1003_WHT_security_camera',
]

# Mirai attack types; each name is also the CSV file stem under mirai_attacks/.
mirai_attacks = [
    'ack',       # ack flood
    'scan',      # automatic scan for vulnerable devices
    'syn',       # syn flood
    'udp',       # udp flood
    'udpplain',  # optimized udp flood
]

In [4]:
# Load every Mirai attack capture and every benign capture, label the rows
# (0 = malicious, 1 = benign), down-sample the attack traffic to roughly the
# benign volume, and merge both classes into `mirai_data`.

SAMPLE_SEED = 123  # fixed so the down-sampling below is reproducible across runs


def _read_and_union(paths):
    """Read each CSV in `paths` (header row, inferred schema) and union them by position.

    Raises:
        ValueError: if `paths` is empty, since there would be nothing to return.
    """
    combined = None
    for path in paths:
        frame = spark.read.option("inferSchema", True)\
            .option("header", True).csv(path)
        combined = frame if combined is None else combined.union(frame)
    if combined is None:
        raise ValueError("no CSV paths were given")
    return combined


# One attack file per (device, attack) pair, device-major like the original loops.
mirai_paths = [f'../data/n_balo_t/{device}/mirai_attacks/{attack}.csv'
               for device in mirai_devices
               for attack in mirai_attacks]
mirai_data = _read_and_union(mirai_paths).withColumn('label', lit(0))
malign_total = mirai_data.count()

benign_paths = [f'../data/n_balo_t/{device}/benign_traffic.csv' for device in benign]
benign_data = _read_and_union(benign_paths).withColumn('label', lit(1))
benign_total = benign_data.count()

ratio = benign_total/malign_total
print(f'total malign data = {malign_total}')
print(f'total benign data = {benign_total}')

# Sampling without replacement rejects fractions above 1, so cap the ratio for
# the case where benign rows outnumber attack rows.
mirai_data = mirai_data.sample(False, fraction=min(1.0, ratio), seed=SAMPLE_SEED)
mirai_data = mirai_data.union(benign_data)
# The label is cast to double, matching the 1.0 / 0.0 comparisons used later.
mirai_data = mirai_data.withColumn('label', mirai_data['label'].cast(DoubleType()))
print(f'total data for prediciton modeling = {mirai_data.count()}')

total malign data = 3668402
total benign data = 555932
total data for prediciton modeling = 1110992


In [5]:
# Replace every column name with its positional index ('0', '1', ...) and then
# restore the name 'label' on the label column.
# NOTE(review): presumably done because the raw CSV headers are awkward to
# reference in Spark expressions — confirm.
cols = mirai_data.columns
new_cols = [str(i) for i in range(len(cols))]
# Look the label position up instead of hard-coding '115': with a different
# number of feature columns the hard-coded rename would silently do nothing and
# leave the frame without a 'label' column. `index` raises ValueError instead.
label_position = cols.index('label')
mirai_data = mirai_data.toDF(*new_cols)
mirai_data = mirai_data.withColumnRenamed(str(label_position), 'label')

In [6]:
# Sanity check: row counts per class after balancing (1.0 = benign, 0.0 = malicious).
# NOTE(review): `benign` was the list of benign device names earlier in the
# notebook; this rebinds it to an int, so the data-loading cell cannot be re-run
# afterwards without first re-running the cell that defines the device lists.
benign = mirai_data.where(mirai_data['label'] == 1.0).count()
malicious = mirai_data.where(mirai_data['label'] == 0.0).count()
print(benign, malicious)

555932 555060


In [7]:
def std(data):
    """Assemble and standardize the feature columns of `data`.

    Every column except 'label' is packed into a 'features' vector, which is
    then scaled to zero mean and unit standard deviation as 'std_features'.

    Args:
        data: DataFrame with numeric feature columns and a 'label' column.

    Returns:
        DataFrame with columns 'features', 'std_features' and 'label', one row
        per input row.
    """
    feature_list = [col for col in data.columns if col != 'label']

    # vector assembler
    asmblr = VectorAssembler(inputCols=feature_list, outputCol='features')
    print("Assembling feature vector..")
    # Carry 'label' along with its own row. Splitting it off and re-attaching it
    # with a condition-less join produced a Cartesian product (N x N rows) in
    # which labels no longer corresponded to their features.
    data = asmblr.transform(data).select(['features', 'label'])

    # standardize the features
    standardizer = StandardScaler(withMean=True, withStd=True, inputCol='features', outputCol='std_features')
    print("Standardizing feature vector..")
    standardizer_model = standardizer.fit(data)
    data = standardizer_model.transform(data)

    # put the label last, matching the previous output column order
    print("Adding labels..")
    data = data.select(['features', 'std_features', 'label'])

    return data
    
def pca(data, num_principal_components=13):
    """Standardize the features of `data` and project them onto principal components.

    Every column except 'label' is assembled into a vector, scaled to zero mean
    and unit standard deviation, and reduced with PCA.

    Args:
        data: DataFrame with numeric feature columns and a 'label' column.
        num_principal_components: number of principal components to keep
            (defaults to 13, the value previously hard-coded).

    Returns:
        DataFrame with columns 'label' and 'features', where 'features' is the
        vector of principal components for that row.
    """
    feature_list = [col for col in data.columns if col != 'label']

    # vector assembler
    asmblr = VectorAssembler(inputCols=feature_list, outputCol='features')
    print("Assembling feature vector..")
    # Carry 'label' along with its own row. Splitting it off and re-attaching it
    # with a condition-less join produced a Cartesian product (N x N rows) in
    # which labels no longer corresponded to their features.
    data = asmblr.transform(data).select(['label', 'features'])

    # standardize the features
    standardizer = StandardScaler(withMean=True, withStd=True, inputCol='features', outputCol='std_features')
    print("Standardizing feature vector..")
    standardizer_model = standardizer.fit(data)
    data = standardizer_model.transform(data)

    # use PCA estimator directly on the standardized df, to generate pca_features
    # (named pca_estimator so it does not shadow this function inside its own body)
    pca_estimator = PCA(k=num_principal_components, inputCol='std_features', outputCol='pca_features')
    print("PCA estimation..")
    pca_model = pca_estimator.fit(data)
    data = pca_model.transform(data)

    # data with principal features and label: drop the raw and standardized vectors
    print("Replacing feature vector..")
    data = data.drop('features').drop('std_features')

    # The PCA output is already a vector column, so it is exposed directly as
    # 'features' rather than being exploded into columns and re-assembled.
    print("Preparing new dataframe with PCA features..")
    data = data.select('label', 'pca_features').withColumnRenamed('pca_features', 'features')

    return data

def data_prep(data):
    """Append a 'features' vector column built from every column except 'label'.

    Args:
        data: DataFrame whose non-'label' columns are the model inputs.

    Returns:
        The input DataFrame with an extra 'features' column; all original
        columns are kept.
    """
    input_columns = [name for name in data.columns if name != 'label']

    # set up feature and labels as input and output
    assembler = VectorAssembler(inputCols=input_columns, outputCol="features")
    return assembler.transform(data)
    
def reg_rf(data, trees, maxD, seed=None):
    """Train a random forest on a 70/30 split of `data` and score the held-out part.

    Args:
        data: DataFrame with a 'features' vector column and a 'label' column.
        trees: number of trees in the forest.
        maxD: maximum depth of each tree.
        seed: optional integer seed applied to both the train/test split and
            the forest, making the run reproducible. With the default of None
            both stay randomly seeded, as before.

    Returns:
        The test split with the model's output columns (including 'prediction')
        appended.
    """
    rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=trees, maxDepth=maxD)
    if seed is not None:
        # set only when given, so the estimator keeps its own default otherwise
        rf.setSeed(seed)

    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=seed)
    model = rf.fit(trainingData)
    preds = model.transform(testData)
    return preds

In [8]:
def printout(m):
    """Print a summary of multiclass evaluation metrics.

    Args:
        m: a metrics object exposing `accuracy`, `weightedPrecision`,
            `weightedRecall`, `weightedFMeasure()`, `falsePositiveRate(label)`
            and `truePositiveRate(label)` (e.g. pyspark.mllib MulticlassMetrics).

    Labels follow the notebook's convention: 1.0 = benign, 0.0 = malicious.

    F1, precision and recall are the label-frequency-weighted averages. The
    label-less `fMeasure()`, `precision()` and `recall()` calls used before were
    deprecated in Spark 2.0 (where they return the accuracy) and removed in
    Spark 3.0.
    """
    print(f'Accuracy: {m.accuracy}')
    print(f'F1: {m.weightedFMeasure()}')
    print(f'False Positive Rate for Benign: {m.falsePositiveRate(1.0)}')
    print(f'True Positive Rate for Benign: {m.truePositiveRate(1.0)}')
    print(f'False Positive Rate for Malicious: {m.falsePositiveRate(0.0)}')
    print(f'True Positive Rate for Malicious: {m.truePositiveRate(0.0)}')
    print(f'Precision: {m.weightedPrecision}')
    print(f'Recall: {m.weightedRecall}')

In [9]:
# Carve a ~5% working sample out of the balanced dataset. The split is seeded
# with the same value as the shuffle below so that re-running the notebook
# yields the same sample.
big_sample, small_sample = mirai_data.randomSplit([0.95, 0.05], seed=123)
# Shuffle the small sample by sorting on a seeded random column, then drop it.
small_sample = small_sample.withColumn('order', rand(seed=123)).orderBy('order').drop('order')

In [10]:
# Baseline: random forest (5 trees, max depth 10) on the raw features of the small sample.
prepared = data_prep(small_sample)
predictions = reg_rf(prepared, 5, 10)
# MulticlassMetrics expects (prediction, label) pairs in that order; passing
# (label, prediction) transposes the confusion matrix and swaps the per-class rates.
predictions_rdd = predictions.select(['prediction', 'label']).rdd.map(tuple)
metrics = MulticlassMetrics(predictions_rdd)
printout(metrics)


Accuracy: 1.0
F1: 1.0
False Positive Rate for Benign: 0.0
True Positive Rate for Benign: 1.0
False Positive Rate for Malicious: 0.0
True Positive Rate for Malicious: 1.0
Precision: 1.0
Recall: 1.0
