In [1]:
import numpy as np
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName('Exercise_1').getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/18 22:34:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def readFile(filename:str):
    rdd = sc.textFile(filename)
    return (
        rdd.map(lambda x: x.split(","))
        .map(lambda x: tuple(map(float, x)))
        .map(lambda x: (x[:-1], int(x[-1])))
    )

In [4]:
rdd = readFile("data/botnet_reduced_10k_l.csv")
TOTAL = rdd.count()
rdd.take(2)

                                                                                

[((3545.3018916840147,
   3198.0139469522546,
   80.00015469454229,
   1.0000019205086303,
   444960086.4643011,
   476.8073064086366,
   12.999999992936175,
   -7.710424743123667e-09,
   87.00000148917339,
   19099430.15956688,
   2468368394.7593513),
  0),
 ((-1.21308761436012e-06,
   7.999989079213265,
   65499.99887793755,
   0.9999997452701503,
   61.99988768910407,
   69.99980788311223,
   11.999999995904211,
   -2.35688187855132e-08,
   8.00000498340475,
   2468369617.006896,
   2468368392.7377276),
  1)]

In [5]:
def normalize(rdd_xy):
    rdd_X = rdd_xy.map(lambda x: x[0])
    _ = rdd_xy.map(lambda x: x[1])

    mu, rows = rdd_X.map(lambda x: (np.array(x), 1)).reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
    mu /= rows
    std = (rdd_X.map(lambda x: (np.array(x) - mu) ** 2).reduce(lambda x, y: x + y) / rows) ** 0.5

    return rdd_xy.map(lambda x: ((np.array(x[0]) - mu) / std, x[1]))

In [6]:
normalize(rdd).take(2)

[(array([ 1.41281668, -0.75956071, -0.41204268, -0.45940748,  1.39580934,
         -0.35386218,  0.74057161, -0.8945781 , -0.40163083, -2.91039199,
          0.1567098 ]),
  0),
 (array([-0.79867306, -0.89146231,  3.64034447, -0.45940748, -0.52296869,
         -0.35387136,  0.55184122, -0.8945781 , -1.27444226,  0.4762026 ,
          0.1567098 ]),
  1)]

In [7]:
def train(rdd_xy, iterations, learning_rate, lambda_reg):
    its_to_print = list(range(0, iterations, iterations // 10)) + [iterations]
    k = len(rdd_xy.first()[0])
    m = rdd_xy.count()
    w = np.random.rand(k)
    b = 0

    for it in range(iterations):
        # Gradients
        dw = (
            rdd_xy.map(lambda x: (predict_proba(w, b, x[0]) - x[1]) * np.array(x[0]))
            .reduce(lambda x, y: x + y) / m
        )
        db = (
            rdd_xy.map(lambda x: predict_proba(w, b, x[0]) - x[1])
            .reduce(lambda x, y: x + y) / m
        )

        # Regularization
        dw += lambda_reg / k * w

        # Update parameters
        w -= learning_rate * dw
        b -= learning_rate * db

        # Logging
        if it in its_to_print:
            c = cost(w, b, rdd_xy, lambda_reg, k, m)
            acc = accuracy(w, b, rdd_xy)
            print(f"Iteration {it}/{iterations}, Cost: {c:.4f}, Accuracy: {acc:.4f}")

    return w, b


def sigmoid(logit):
    return 1 / (1 + np.exp(-np.clip(logit, -250, 250)))


def predict_proba(w, b, X):
    X = np.array(X)
    w = np.array(w)
    return sigmoid(np.dot(w, X) + b)


def predict(w, b, X):
    return 1 if predict_proba(w, b, X) > 0.5 else 0


def accuracy(w, b, rdd_xy):
    preds = rdd_xy.map(lambda x: (int(predict(w, b, x[0]) == x[1]), 1)).reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1])
    )
    return preds[0] / preds[1]


def cost(w, b, rdd_xy, lambda_reg, k, m):
    return (
        rdd_xy.map(lambda x: (predict_proba(w, b, x[0]), x[1]))
        .map(lambda x: (x[1] * np.log(x[0]) + (1 - x[1]) * np.log(1 - x[0])))
        .reduce(lambda x, y: x + y) / (-m) + lambda_reg * (w**2).sum() / (2 * k)
    )

In [8]:
path = "data/botnet_reduced_10k_l.csv"  # botnet_tot_syn_l.csv // botnet_reduced_10k_l.csv
nIter = 20
leaning_rate =  1.5
lambda_reg = 0

# Read data
data = readFile(path).cache()
print(f"Total samples: {data.count()}")

# Standardize
data = normalize(data)
print("Data normalized.")

sums, n = data.map(lambda x: (x[0], 1)).reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
medias = sums / n
print(f"Feature means after normalization: {np.array2string(medias, precision=4, suppress_small=True)}")
desvs = (data.map(lambda x: (x[0]-medias)**2).reduce(lambda x, y: x + y) / (n-1))**0.5
print(f"Feature stds after normalization: {np.array2string(desvs, precision=4, suppress_small=True)}")

w_b = train(data, nIter, leaning_rate, lambda_reg)
acc = accuracy(w_b[0], w_b[1], data)
print(f"Accuracy: {acc:.4f}")

                                                                                

Total samples: 10000
Data normalized.
Iteration 0/20, Cost: 0.7325, Accuracy: 0.6027
Iteration 2/20, Cost: 0.3688, Accuracy: 0.8613
Iteration 4/20, Cost: 0.2881, Accuracy: 0.9007
Iteration 6/20, Cost: 0.2535, Accuracy: 0.9155
Iteration 8/20, Cost: 0.2338, Accuracy: 0.9223
Iteration 10/20, Cost: 0.2211, Accuracy: 0.9256
Iteration 12/20, Cost: 0.2121, Accuracy: 0.9282
Iteration 14/20, Cost: 0.2054, Accuracy: 0.9292
Iteration 16/20, Cost: 0.2002, Accuracy: 0.9300
Iteration 18/20, Cost: 0.1961, Accuracy: 0.9304
Accuracy: 0.9310


In [9]:
sc.stop()