In [1]:
from pyspark.sql.types import StructType, StructField, FloatType, BooleanType
from pyspark.sql.types import DoubleType, IntegerType, StringType
import pyspark
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import lit
from pyspark.sql.functions import udf
 
from pyspark import SQLContext
 
# Obtain a SparkContext (getOrCreate reuses one that is already running) and
# wrap it in the SQLContext used below to read the log file.
conf = pyspark.SparkConf()
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlcontext = SQLContext(sc)

# Column layout applied to the tab-separated log. Every field is declared
# nullable; the order here is the column order expected in the file.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("ts", StringType()),
        ("uid", StringType()),
        ("origh", StringType()),
        ("origp", StringType()),
        ("resph", StringType()),
        ("respp", StringType()),
        ("proto", StringType()),
        ("service", StringType()),
        ("duration", FloatType()),
        ("origbytes", StringType()),
        ("respbytes", StringType()),
        ("connstate", StringType()),
        ("localorig", BooleanType()),
        ("missedbytes", StringType()),
        ("history", StringType()),
        ("origpkts", IntegerType()),
        ("origipbytes", IntegerType()),
        ("resppkts", IntegerType()),
        ("respipbytes", IntegerType()),
        ("tunnelparents", StringType()),
    )
])

# NOTE(review): absolute, machine-specific path - consider making the data
# location configurable.
df = (
    sqlcontext.read
    .schema(schema)
    .option("sep", "\t")
    .csv("/home/ubuntu/Documents/forensics/tail.log")
)





 


In [4]:
# Wrap toInt as a Spark UDF whose declared result type is IntegerType. The
# lambda is kept on purpose: it looks up `toInt` only when the UDF is invoked,
# and `toInt` is defined further down in this cell.
colsInt = udf(lambda z: toInt(z), IntegerType())

# Register the UDF under the name "colsInt" through the SQLContext this
# notebook creates. The previous code used a `spark` variable that none of the
# visible cells define, which fails on a fresh kernel.
sqlcontext.udf.register("colsInt", colsInt)

def toInt(s):
    """Convert a column value to an integer, or ``None`` when that is impossible.

    Strings are encoded by concatenating the decimal code point of every
    character and parsing the concatenation as one integer, e.g.
    ``"ab"`` -> ``"97" + "98"`` -> ``9798``. Numeric values (bool, int, float)
    are truncated with ``int()``.

    Args:
        s: The value to convert. May be ``None``.

    Returns:
        int or None: ``None`` for ``None``, the empty string, NaN or infinite
        floats, and any other unsupported type.
    """
    if isinstance(s, str):
        if not s:
            # int('') would raise ValueError; treat an empty string as missing.
            return None
        # NOTE(review): the encoded number grows with the string length and can
        # exceed the 32-bit range of the IntegerType declared for the UDF -
        # confirm how Spark treats such values.
        return int(''.join(str(ord(ch)) for ch in s))

    if isinstance(s, (bool, int, float)):
        try:
            return int(s)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(inf) raises OverflowError.
            return None

    # None and every other type: return a missing value rather than the
    # value's type object, which is what the earlier implementation returned.
    return None


 

In [2]:
# Display the column names of the loaded frame (they come from `schema`).
df.columns
    
    

['ts',
 'uid',
 'origh',
 'origp',
 'resph',
 'respp',
 'proto',
 'service',
 'duration',
 'origbytes',
 'respbytes',
 'connstate',
 'localorig',
 'missedbytes',
 'history',
 'origpkts',
 'origipbytes',
 'resppkts',
 'respipbytes',
 'tunnelparents']

In [5]:
 

# Source columns passed through the colsInt UDF. Each result is stored in a new
# column named "i" + <source column>, added in the order listed here.
encoded_sources = [
    'origp',
    'resph',
    'respp',
    'proto',
    'duration',
    'origbytes',
    'respbytes',
    'origpkts',
    'origipbytes',
]

# Start from `df` on every run so the cell is idempotent. The final frame is
# bound to `i` because that is the name the VectorAssembler cell transforms.
i = df
for source_col in encoded_sources:
    i = i.withColumn('i' + source_col, colsInt(source_col))

# Columns that will be assembled into the feature vector: the UDF outputs plus
# the raw 'origpkts' and 'origipbytes' columns.
columns = [
    'iorigp',
    'iresph',
    'irespp',
    'iproto',
    'iduration',
    'iorigbytes',
    'irespbytes',
    'iorigpkts',
    'origpkts',
    'iorigipbytes',
    'origipbytes',
]
 

 

In [6]:
# Input columns for the feature vector: the UDF outputs plus the raw
# 'origpkts' and 'origipbytes' columns. This repeats the list assigned in the
# previous cell.
columns = [
    'iorigp', 'iresph', 'irespp', 'iproto', 'iduration',
    'iorigbytes', 'irespbytes',
    'iorigpkts', 'origpkts',
    'iorigipbytes', 'origipbytes',
]

In [7]:

 
# Combine the columns named in `columns` into one vector column, "features",
# and apply the assembler to the frame produced by the withColumn cell.
vecAssembler = VectorAssembler().setInputCols(columns).setOutputCol("features")
router = vecAssembler.transform(i)


 
 

In [8]:
# Display the column names after assembly; "features" is the assembler output.
router.columns

['ts',
 'uid',
 'origh',
 'origp',
 'resph',
 'respp',
 'proto',
 'service',
 'duration',
 'origbytes',
 'respbytes',
 'connstate',
 'localorig',
 'missedbytes',
 'history',
 'origpkts',
 'origipbytes',
 'resppkts',
 'respipbytes',
 'tunnelparents',
 'iorigp',
 'iresph',
 'irespp',
 'iproto',
 'iduration',
 'iorigbytes',
 'irespbytes',
 'iorigpkts',
 'iorigipbytes',
 'features']

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

 
# Configure k-means for 7 clusters with a fixed seed, then fit it on the
# assembled frame.
kmeans = KMeans(k=7, seed=1)
model = kmeans.fit(router)



In [None]:
# Apply the fitted model to the same frame it was trained on to obtain the
# cluster assignments.
predictions = model.transform(router)

# Score the clustering with a ClusteringEvaluator left at its default settings.
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Print the fitted cluster centers, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
