In [242]:
from pyspark.sql.types import StructType, StructField, FloatType, BooleanType
from pyspark.sql.types import DoubleType, IntegerType, StringType
import pyspark
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import lit
from pyspark.sql.functions import udf, concat
 
from pyspark import SQLContext
 
conf = pyspark.SparkConf() 
 
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlcontext = SQLContext(sc)

schema = StructType([
    StructField("ts", StringType(),True),    
    StructField("uid", StringType(),True),     
    StructField("origh", StringType(),True),         
    StructField("origp", StringType(),True),     
    StructField("resph", StringType(),True),      
    StructField("respp", StringType(),True),   
    StructField("proto", StringType(),True),     
    StructField("service" , StringType(),True),        
    StructField("duration", FloatType(),True),     
    StructField("origbytes", StringType(),True),     
    StructField("respbytes", StringType(),True),       
    StructField("connstate", StringType(),True),      
    StructField("localorig", StringType(),True),   
    StructField("missedbytes", StringType(),True),      
    StructField("history", StringType(),True),     
    StructField("origpkts", IntegerType(),True),     
    StructField("origipbytes", IntegerType(),True),       
    StructField("resppkts", IntegerType(),True),      
    StructField("respipbytes", IntegerType(),True),     
    StructField("tunnelparents", StringType(),True)    
              ])
        

df = sqlcontext.read.csv(path="/home/ubuntu/Documents/forensics/bigger.log", sep="\t", schema=schema) 




In [243]:
colsInt = udf(lambda z: toInt(z), IntegerType())

spark.udf.register("colsInt", colsInt)

def toInt(s):
    if isinstance(s, str) == True:
        st = [str(ord(i)) for i in s]
        return(int(''.join(st)))
    else:
         return s


In [244]:
a = df.withColumn( 'iorigp',colsInt('origp'))
c = a.withColumn( 'irespp',colsInt('respp'))
d = c.withColumn( 'iproto',colsInt('proto'))
f = d.withColumn( 'iorigbytes',colsInt( 'origbytes'))
g = f.withColumn( 'irespbytes',colsInt('respbytes'))
h = g.withColumn(  'iorigpkts',colsInt( 'origpkts'))
i = h.withColumn( 'iorigipbytes',colsInt('origipbytes'))

columns =  ['iorigp','irespp','iproto','iorigbytes','irespbytes','iorigpkts','iorigipbytes']
    
    

In [245]:
vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
router = vecAssembler.transform(i)

In [246]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

 
kmeans = KMeans().setK(7).setSeed(1)
model = kmeans.fit(router)

In [247]:
 # Make predictions
predictions = model.transform(router)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


 

 

Silhouette with squared euclidean distance = 0.7482953608682851
Cluster Centers: 
[1.06630230e+09 4.21529639e+07 1.28108950e+07 5.78024926e+04
 2.65607199e+05 1.04445282e+00 6.47468450e+01]
[1.14502802e+09 9.60932775e+08 1.17538658e+07 3.38826753e+02
 4.50085714e+01 1.00987013e+00 4.67948052e+01]
[9.31856653e+08 9.90159420e+08 1.16991120e+07 4.50622055e+01
 4.50622055e+01 1.00000000e+00 6.00000000e+01]
[ 9.79548551e+08  6.80980843e+08  1.16991120e+07 -4.68320645e+05
  1.49027856e+05  3.12326870e+00  2.82536981e+03]
[3.30118333e+05 3.30117056e+05 2.00917452e+09 2.96344167e+04
 4.52500000e+01 1.75000000e+00 1.34166667e+02]
[8.51441469e+08 4.25112538e+07 1.30679562e+07 2.53330172e+05
 1.48135183e+04 1.03984652e+00 7.00513577e+01]
[9.58129996e+08 4.31570110e+07 1.22355585e+07 1.06560307e+04
 1.02323933e+04 1.01771173e+00 6.17966205e+01]


In [248]:
 
predictions.select("prediction").show()

+----------+
|prediction|
+----------+
|         0|
|         5|
|         3|
|         2|
|         5|
|         6|
|         6|
|         0|
|         6|
|         3|
|         5|
|         5|
|         0|
|         6|
|         1|
|         0|
|         3|
|         6|
|         6|
|         6|
+----------+
only showing top 20 rows



In [252]:

from plotly.offline import plot

import pandas as pd



p = predictions.groupby('prediction').count()
q = p.toPandas() 


import plotly.graph_objects as go
fig = go.Figure(
    data=[go.Bar(x=q.index.values,y=q['prediction'])],
    layout_title_text="xxxx"
)
fig.show()



 
 

In [250]:
 q

Unnamed: 0,prediction,count
0,1,3850
1,6,4912
2,3,1444
3,5,3388
4,4,36
5,2,1061
6,0,5309


In [255]:
 suspect = predictions.filter("prediction == 4")
 
 

In [258]:
suspect.select('iorigp','irespp','iproto','iorigbytes','irespbytes','iorigpkts','iorigipbytes').show()



+------+------+----------+----------+----------+---------+------------+
|iorigp|irespp|    iproto|iorigbytes|irespbytes|iorigpkts|iorigipbytes|
+------+------+----------+----------+----------+---------+------------+
|495152|495151|2009174520|        45|        45|        1|         104|
|495152|495151|2009174520|        45|        45|        1|         104|
|    51|    49|2009174520|        45|        45|        1|          56|
|    51|    49|2009174520|      5354|        48|        2|         112|
|495152|495151|2009174520|        45|        45|        1|         104|
|495152|495151|2009174520|        45|        45|        1|         104|
|495152|495151|2009174520|        45|        45|        1|         104|
|495152|495151|2009174520|        45|        45|        1|         104|
|495152|495151|2009174520|        45|        45|        1|         104|
|495152|495151|2009174520|        45|        45|        1|         104|
|    51|    49|2009174520|        45|        45|        1|      