In [30]:
from pyspark.sql.types import StructType, StructField, FloatType, BooleanType
from pyspark.sql.types import DoubleType, IntegerType, StringType
import pyspark
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import lit
from pyspark.sql.functions import udf, concat
 
from pyspark import SQLContext
 
conf = pyspark.SparkConf() 
 
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlcontext = SQLContext(sc)

schema = StructType([
    StructField("ts", StringType(),True),    
    StructField("uid", StringType(),True),     
    StructField("origh", StringType(),True),         
    StructField("origp", StringType(),True),     
    StructField("resph", StringType(),True),      
    StructField("respp", StringType(),True),   
    StructField("proto", StringType(),True),     
    StructField("service" , StringType(),True),        
    StructField("duration", FloatType(),True),     
    StructField("origbytes", StringType(),True),     
    StructField("respbytes", StringType(),True),       
    StructField("connstate", StringType(),True),      
    StructField("localorig", StringType(),True),   
    StructField("missedbytes", StringType(),True),      
    StructField("history", StringType(),True),     
    StructField("origpkts", IntegerType(),True),     
    StructField("origipbytes", IntegerType(),True),       
    StructField("resppkts", IntegerType(),True),      
    StructField("respipbytes", IntegerType(),True),     
    StructField("tunnelparents", StringType(),True)    
              ])
        

df = sqlcontext.read.csv(path="/home/ubuntu/Documents/forensics/bigger.log", sep="\t", schema=schema) 
df2 = df.fillna(0)



In [31]:
colsInt = udf(lambda z: toInt(z), IntegerType())

sqlcontext.udf.register("colsInt", colsInt)

def toInt(s):
    if not s:
        return 0
    if isinstance(s, str) == True:
        st = [str(ord(i)) for i in s]
        return(int(''.join(st)))
    else:
        return s


In [32]:
a = df2.withColumn( 'iorigp',colsInt('origp'))
c = a.withColumn( 'irespp',colsInt('respp'))
d = c.withColumn( 'iproto',colsInt('proto'))
e = d.withColumn('iorigh',colsInt('origh'))
f = e.withColumn( 'iorigbytes',colsInt( 'origbytes'))
g = f.withColumn( 'irespbytes',colsInt('respbytes'))
h = g.withColumn(  'iorigpkts',colsInt( 'origpkts'))
i = h.withColumn( 'iorigipbytes',colsInt('origipbytes'))

columns =  ['iorigp','irespp','iproto', 'iorigbytes','irespbytes','iorigpkts','iorigipbytes']
    
    

In [33]:
vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
router = vecAssembler.transform(i)

In [34]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

 
kmeans = KMeans().setK(7).setSeed(1)
model = kmeans.fit(router)

In [35]:

predictions = model.transform(router)


 

 

In [45]:
from plotly.offline import plot

import pandas as pd



p = predictions.groupby('prediction').count()
q = p.toPandas() 


import plotly.graph_objects as go
fig = go.Figure(
    data=[go.Bar(x=q['prediction'],y=q['count'])],
    layout_title_text="K Means Count"
)
fig.show()

In [37]:
 
q

 
 

Unnamed: 0,prediction,count
0,1,40303
1,6,17685
2,3,6321
3,5,3197
4,4,12807
5,2,171
6,0,19516


In [38]:
 suspect = predictions.filter("prediction == 2")
 
 

In [42]:
x = suspect.select('ts','uid','origh','resph').toPandas() 



In [43]:
x 

Unnamed: 0,ts,uid,origh,resph
0,1332016597.410000,CmZcN8jNeZKMZDT7,fe80::216:47ff:fe9d:f2c6,ff02::1
1,1332016531.870000,C4D90720LAlhxtEYh4,192.168.204.1,192.168.204.57
2,1332016611.240000,CbeFYr43ZvrnMRmese,fe80::216:47ff:fe9d:f2c4,ff02::1
3,1332016614.490000,Cq7Eix3vot0Uvll2jb,192.168.202.1,192.168.202.81
4,1332016617.040000,Cm7oZnWokMdxTovJ6,fe80::216:47ff:fe9d:f2c3,ff02::1
5,1332016638.250000,CO69a21Q8ofevEiez8,fe80::216:47ff:fe9d:f2c8,ff02::1
6,1332016651.850000,C4lMSQ1lo9AUpyEqp3,fe80::216:47ff:fe9d:f2d5,ff02::1
7,1332016653.960000,CEaig24DTLOU25hrH4,fe80::216:47ff:fe9d:f2d6,ff02::1
8,1332016656.870000,C9qyMu1jzGCcfmBPD1,192.168.202.1,192.168.202.62
9,1332016664.130000,CxrpXirJ1SduvlIug,fe80::216:47ff:fe9d:f2c2,fe80::5e26:aff:fe11:4a0d
