In [95]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import re
import csv
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,precision_recall_fscore_support
from IPython.display import Image  
from sklearn.tree import export_graphviz
from IPython.display import SVG
# from graphviz import Source
from IPython.display import display

In [96]:
def split_len(x):
    return len(x.split('.'))

In [97]:
df_train_benin = pd.read_csv('Data/Train_Benign_Traffic.csv',encoding="latin1")
df_train_mal = pd.read_csv('Data/Train_Detections.csv')
df_test = pd.read_csv('Data/Test_Set_Hackathon.csv')
df_train_ips = pd.read_csv('Data/Train_IPS.csv')
df_ips = pd.read_csv('Data/IPTypes.csv').set_index('ip')

In [98]:
# easter_eggs = df[df.dst_ip.map(split_len) > 4 ].dst_ip
df_train_benin.dst_port = df_train_benin.dst_port.fillna(value=-1)
df_train_benin['label'] = 'benign'
df_train = pd.concat([df_train_benin,df_train_mal]).reset_index()
df_train = df_train.drop(['index','Unnamed: 0','app_name','app_risk'],axis=1)
df_train = df_train[df_train.dst_ip.map(split_len) < 5 ].reset_index()
df_train = df_train[df_train.src_ip.map(split_len) < 5 ].reset_index().drop(['index','level_0'],axis=1)
df_train.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,client,dst_ip,dst_port,epoch_time,label,protocol,received_bytes,sent_bytes,src_ip
0,2,222.58.210.46,443.0,1547242292,benign,TCP,44096,67712,10.197.88.33
1,5,110.244.46.16,443.0,1547468725,benign,TCP,7712,1990,10.0.11.69
2,2,23.130.21.5,443.0,1547415821,benign,TCP,12928,8464,192.168.174.137
3,2,71.55.163.78,443.0,1547152973,benign,TCP,19808,4376,192.168.190.183
4,5,230.0.0.1,-1.0,1547070189,benign,TCP,0,252,10.0.11.2


In [99]:
df_train.protocol = df_train.protocol.replace({'UDP':2,'TCP':1})
df_train = df_train.join(df_ips,on='src_ip',rsuffix='_src').drop('Unnamed: 0',axis=1)
df_train = df_train.join(df_ips,on='src_ip',rsuffix='_src').drop('Unnamed: 0',axis=1)
df_train = df_train.join(df_ips,on='dst_ip',lsuffix='_dst').drop(['type', 'subtype', 'Unnamed: 0'],axis=1)
df_train['log_sent_bytes'] = np.log1p(df_train.sent_bytes).astype(int)
df_train['log_received_bytes'] = np.log1p(df_train.received_bytes).astype(int)
df_train =  pd.concat([pd.DataFrame(df_train.dst_ip.apply(lambda x: x.split('.')).values.tolist(),
             columns=['dst_int_1','dst_int_2','dst_int_3','dst_int_4']),df_train],axis=1)
df_train =  pd.concat([pd.DataFrame(df_train.src_ip.apply(lambda x: x.split('.')).values.tolist(),
             columns=['src_int_1','src_int_2','src_int_3','src_int_4']),df_train],axis=1)


df_train.label = df_train.label.replace({'benign':False,'Malware':True,'Adware':True})

df_train.type_dst = df_train.type_dst.replace({'Internal':0,'External':1,'Multicast':2,'Broadcast address':3})
df_train.type_src = df_train.type_src.replace({'Internal':0,'External':1,'Multicast':2,'Broadcast address':3})
df_train.subtype_dst = df_train.subtype_dst.replace(
    {'Class A':0, 'Class C':1, 'External':2,'Multicast':3,'Class B':4,'Broadcast address':5})
df_train.subtype_src = df_train.subtype_src.replace(
    {'Class A':0, 'Class C':1, 'External':2,'Multicast':3,'Class B':4,'Broadcast address':5})

In [102]:
df_train.head()

Unnamed: 0,src_int_1,src_int_2,src_int_3,src_int_4,dst_int_1,dst_int_2,dst_int_3,dst_int_4,client,dst_ip,...,protocol,received_bytes,sent_bytes,src_ip,type_dst,subtype_dst,type_src,subtype_src,log_sent_bytes,log_received_bytes
0,10,197,88,33,222,58,210,46,2,222.58.210.46,...,1,44096,67712,10.197.88.33,0,0,0,0,11,10
1,10,0,11,69,110,244,46,16,5,110.244.46.16,...,1,7712,1990,10.0.11.69,0,0,0,0,7,8
2,192,168,174,137,23,130,21,5,2,23.130.21.5,...,1,12928,8464,192.168.174.137,0,1,0,1,9,9
3,192,168,190,183,71,55,163,78,2,71.55.163.78,...,1,19808,4376,192.168.190.183,0,1,0,1,8,9
4,10,0,11,2,230,0,0,1,5,230.0.0.1,...,1,0,252,10.0.11.2,0,0,0,0,5,0


In [103]:
def plot_count(series):
    counts = series.value_counts()
    plt.figure(figsize=(5,2))
    sns.barplot(counts.index,counts)

In [104]:
def plot_pca(df):
    pca = PCA(n_components=2)
    to_pca = df.drop(['dst_ip','src_ip'],axis=1)
    to_pca = to_pca.dropna()
    labels_pca = to_pca.label
    proj = pca.fit_transform(to_pca.drop(['label'],axis=1))
    plt.figure(figsize=(10,6))
    sns.scatterplot(proj[:,0],proj[:,1],hue=labels_pca)

In [105]:
#separate df for client
df_train = pd.concat([df_train,pd.get_dummies(df_train.dst_port,prefix='port')],axis=1)
df_1 = df_train[df_train.client == 1].drop('client',axis=1)
df_2 = df_train[df_train.client == 2].drop('client',axis=1)
df_3 = df_train[df_train.client == 3].drop('client',axis=1)
df_4 = df_train[df_train.client == 4].drop('client',axis=1)
df_5 = df_train[df_train.client == 5].drop('client',axis=1)

In [106]:
list_of_df = [df_1,df_2,df_3,df_4,df_5]

# for i,df in enumerate(list_of_df):
#     plot_count(df.label)
#     plt.title(f'client {i+1}')
#     plt.show()

In [107]:
# for i, df in enumerate(list_of_df):
#     plot_pca(df)
#     plt.title(f'client {i+1}')
#     plt.show()

In [108]:
prediction = []
models = []
scores = []
for df in list_of_df:
    X = df.drop(['label','src_ip','dst_ip'],axis=1)
    y = df.label
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    rf = RandomForestClassifier(class_weight='balanced')
    rf.fit(X_train,y_train)
    p = rf.predict(X_test)
    prediction.append(p)
    scores.append(precision_recall_fscore_support(y_test,p))
    models.append(rf)
    

  'precision', 'predicted', average, warn_for)


In [117]:
def preprocessing(df):
    df.protocol = df.protocol.replace({'UDP':2,'TCP':1})
    df = df.join(df_ips,on='src_ip',rsuffix='_src').drop('Unnamed: 0',axis=1)
    df = df.join(df_ips,on='src_ip',rsuffix='_src').drop('Unnamed: 0',axis=1)
    df = df.join(df_ips,on='dst_ip',lsuffix='_dst').drop(['type', 'subtype', 'Unnamed: 0'],axis=1)
    df['log_sent_bytes'] = np.log1p(df.sent_bytes).astype(int)
    df['log_received_bytes'] = np.log1p(df.received_bytes).astype(int)
    df =  pd.concat([pd.DataFrame(df.dst_ip.apply(lambda x: x.split('.')).values.tolist(),
                 columns=['dst_int_1','dst_int_2','dst_int_3','dst_int_4']),df],axis=1)
    df =  pd.concat([pd.DataFrame(df.src_ip.apply(lambda x: x.split('.')).values.tolist(),
                 columns=['src_int_1','src_int_2','src_int_3','src_int_4']),df],axis=1)

    df.type_dst = df.type_dst.replace({'Internal':0,'External':1,'Multicast':2,'Broadcast address':3})
    df.type_src = df.type_src.replace({'Internal':0,'External':1,'Multicast':2,'Broadcast address':3})
    df.subtype_dst = df.subtype_dst.replace(
        {'Class A':0, 'Class C':1, 'External':2,'Multicast':3,'Class B':4,'Broadcast address':5})
    df.subtype_src = df.subtype_src.replace(
        {'Class A':0, 'Class C':1, 'External':2,'Multicast':3,'Class B':4,'Broadcast address':5})
    df = pd.concat([df,pd.get_dummies(df.dst_port,prefix='port')],axis=1)
    
    df_1 = df[df.client == 1].drop('client',axis=1)
    df_2 = df[df.client == 2].drop('client',axis=1)
    df_3 = df[df.client == 3].drop('client',axis=1)
    df_4 = df[df.client == 4].drop('client',axis=1)
    df_5 = df[df.client == 5].drop('client',axis=1)
    return df, [df_1,df_2,df_3,df_4,df_5]

In [122]:
df_test = pd.read_csv('Data/Test_Set_Hackathon.csv')
df_test = df_test.drop('Unnamed: 0',axis=1)
df_test.head()

Unnamed: 0,client,dst_ip,dst_port,protocol,received_bytes,sent_bytes,src_ip
0,2,230.0.0.251,5353.0,UDP,0,816,10.197.208.76
1,2,24.234.32.175,443.0,TCP,15200,374,10.197.208.78
2,2,10.200.105.183,443.0,TCP,4712,5760,10.200.44.127
3,5,178.217.10.142,443.0,TCP,52352,114560,10.0.11.80
4,5,44.67.14.232,80.0,TCP,92,132,10.0.11.94


In [123]:
df_test,list_test = preprocessing(df_test)
df_test = df_test.drop(['src_ip','dst_ip'],axis=1)

Index(['src_int_1', 'src_int_2', 'src_int_3', 'src_int_4', 'dst_int_1',
       'dst_int_2', 'dst_int_3', 'dst_int_4', 'client', 'dst_port', 'protocol',
       'received_bytes', 'sent_bytes', 'type_dst', 'subtype_dst', 'type_src',
       'subtype_src', 'log_sent_bytes', 'log_received_bytes', 'port_22.0',
       'port_23.0', 'port_53.0', 'port_80.0', 'port_135.0', 'port_137.0',
       'port_138.0', 'port_192.0', 'port_264.0', 'port_389.0', 'port_443.0',
       'port_445.0', 'port_500.0', 'port_623.0', 'port_993.0', 'port_1812.0',
       'port_1900.0', 'port_2152.0', 'port_2380.0', 'port_3268.0',
       'port_3389.0', 'port_3478.0', 'port_3544.0', 'port_3702.0',
       'port_3911.0', 'port_4500.0', 'port_5004.0', 'port_5060.0',
       'port_5061.0', 'port_5065.0', 'port_5070.0', 'port_5080.0',
       'port_5190.0', 'port_5223.0', 'port_5228.0', 'port_5242.0',
       'port_5353.0', 'port_5355.0', 'port_5440.0', 'port_5601.0',
       'port_5938.0', 'port_7003.0', 'port_7275.0', 'port_8080.0

In [158]:
def my_pred(row):
    idx = row['client']-1
    model = models[idx]
    arr = row.drop('client').values[:,np.newaxis]
    return model.predict(arr)

In [None]:
preds = df_test.apply(lambda x: my_pred(x),axis=1)

In [None]:
preds = df_test.dst_ip.apply(lambda x: True if x == '8.8.8.8').astype(bool)

In [81]:
ips = df_train.src_ip
ips.append(df_train.dst_ip)
ips.append(df_test.src_ip)
ips.append(df_test.dst_ip)
pd.DataFrame(ips.unique()).to_csv('ips.csv',index=False)

In [None]:
preds = {v:bool(k) == True for v,k in dict(preds).items()}

In [None]:
r = requests.post('http://192.168.1.24:30000/submission',
                   json={ 'team':'DeepZuchinis',
                          'password':'4lbertKmus',
                          'submission': preds
                         },
                   verify=False)

In [None]:
r.text