In [None]:
from pyspark.sql.types import StructType, StructField, FloatType, BooleanType
from pyspark.sql.types import DoubleType, IntegerType, StringType
import pyspark
 
from pyspark import SQLContext
 
# Start from a default Spark configuration; no settings are overridden here.
conf = pyspark.SparkConf() 
 
# getOrCreate returns the already-running SparkContext when there is one, so
# re-running this cell in the same kernel does not try to start a second one.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
# SQLContext wraps the SparkContext and exposes the `read` entry point that is
# used to load the log file.
sqlcontext = SQLContext(sc)

# Explicit schema for the tab-separated log: 20 fields, all nullable.
# Each entry is (field name, Spark type class); the type is instantiated while
# the StructField is built.  Field order defines the column order of `df`.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("ts", StringType),
        ("uid", StringType),
        ("id.orig_h", StringType),
        ("id.orig_p", StringType),
        ("id.resp_h", StringType),
        ("id.resp_p", StringType),
        ("proto", StringType),
        ("service", StringType),
        ("duration", FloatType),
        ("orig_bytes", StringType),
        ("resp_bytes", StringType),
        ("conn_state", StringType),
        ("local_orig", BooleanType),
        ("missed_bytes", StringType),
        ("history", StringType),
        ("orig_pkts", IntegerType),
        ("orig_ip_bytes", IntegerType),
        ("resp_pkts", IntegerType),
        ("resp_ip_bytes", IntegerType),
        ("tunnel_parents", StringType),
    )
])
        

# Load the tab-separated log using the explicit schema defined above, so Spark
# does not have to infer column types from the data.
log_path = "/home/ubuntu/Documents/forensics/tail.log"
df = sqlcontext.read.csv(log_path, schema=schema, sep="\t")


# Maps each field name to its zero-based position within a row.
# Positions 0-19 follow the field order of `schema`; "orig_cc" and "resp_cc"
# (20, 21) have no matching schema field and are not selected in `columns`.
origColumns = {
    field_name: position
    for position, field_name in enumerate([
        "ts",
        "uid",
        "id.orig_h",
        "id.orig_p",
        "id.resp_h",
        "id.resp_p",
        "proto",
        "service",
        "duration",
        "orig_bytes",
        "resp_bytes",
        "conn_state",
        "local_orig",
        "missed_bytes",
        "history",
        "orig_pkts",
        "orig_ip_bytes",
        "resp_pkts",
        "resp_ip_bytes",
        "tunnel_parents",
        "orig_cc",
        "resp_cc",
    ])
}

# Fields kept for clustering, in output-column order.  Every name must be a
# key of `origColumns`, which supplies its position in the raw row.
columns = [
    # connection endpoints
    "id.orig_h",
    "id.orig_p",
    "id.resp_h",
    "id.resp_p",
    # protocol and duration
    "proto",
    "duration",
    # payload byte counts
    "orig_bytes",
    "resp_bytes",
    # packet / IP-level counters
    "orig_pkts",
    "orig_ip_bytes",
    "resp_pkts",
    "resp_ip_bytes",
]


In [None]:
def filterFormat(l):
    """Project one raw row onto the selected fields, numerically encoded.

    For every name in the module-level ``columns`` list (in that order), the
    value at position ``origColumns[name]`` of ``l`` is passed through
    ``toInt`` and collected.

    Args:
        l: An indexable row (e.g. a Spark ``Row``) holding the raw fields.

    Returns:
        list: One converted value per entry of ``columns``.
    """
    converted = []
    for field_name in columns:
        position = origColumns[field_name]
        converted.append(toInt(l[position]))
    return converted

def toInt(s):
    """Encode a string as an integer built from its characters' code points.

    Each character is replaced by the decimal text of ``ord(char)`` and the
    pieces are concatenated before being parsed as one integer, e.g.
    ``"ab"`` -> ``"97" + "98"`` -> ``9798``.

    NOTE(review): digit strings are *encoded*, not parsed -- ``"80"`` becomes
    ``5648`` rather than ``80`` -- and long strings yield integers far larger
    than 64 bits.  Confirm this is the intended feature encoding.

    Args:
        s: The value to convert.

    Returns:
        The encoded ``int`` when ``s`` is a non-empty ``str``; ``0`` when ``s``
        is the empty string; otherwise ``s`` itself, unchanged (this includes
        ``None`` and values that are already numeric).
    """
    if not isinstance(s, str):
        return s
    if not s:
        # An empty string produces no digits, and int("") raises ValueError;
        # treat the empty concatenation as zero instead of crashing the map.
        return 0
    return int(''.join(str(ord(ch)) for ch in s))


In [None]:
# Convert every row of `df` with filterFormat, then rebuild a DataFrame whose
# columns are named after the selected fields in `columns`.
formatted_rows = df.rdd.map(filterFormat)
router = formatted_rows.toDF(columns)

In [None]:
# Print a preview of the formatted DataFrame to check the conversion visually.
router.show()

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler

# KMeans (and ClusteringEvaluator) read their input from a single vector column,
# named "features" by default.  `router` only holds one scalar column per field,
# so the columns have to be assembled into that vector first.
# Names containing "." are replaced beforehand: when a column is looked up by
# name, Spark interprets the dot as struct-field access and fails to resolve it.
feature_cols = [name.replace(".", "_") for name in router.columns]
# toDF(*names) renames positionally, so it works even for the dotted names.
# VectorAssembler rejects null inputs by default, hence incomplete rows are dropped.
complete_rows = router.toDF(*feature_cols).na.drop()
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
features_df = assembler.transform(complete_rows)

# Trains a k-means model (k=7; fixed seed so repeated runs give the same result).
kmeans = KMeans().setK(7).setSeed(1)
model = kmeans.fit(features_df)

# Make predictions: adds the assigned cluster index to every row.
predictions = model.transform(features_df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result: one center per cluster, with coordinates in the order of
# `feature_cols`.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


In [None]:
# Fetch the first two rows of `router`; as the last expression of the cell the
# returned list is displayed as the cell output.
router.take(2)