In [4]:
DATA_DIR = "/home/user/newnew/"

In [5]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.externals import joblib

import socket, struct

# Загрузка данных

In [6]:
packets_1 = pd.read_csv(DATA_DIR + "1")
packets_2 = pd.read_csv(DATA_DIR + "2")
packets_3 = pd.read_csv(DATA_DIR + "3")
packets_4 = pd.read_csv(DATA_DIR + "4")
packets_1['uid'] = 1
packets_2['uid'] = 2
packets_3['uid'] = 3
packets_4['uid'] = 4
packets = packets_1.append([packets_2, packets_3, packets_4])

In [7]:
packets.head()

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info,uid
0,1,"02:54:43,303578",ZyxelCom_9f:c2:bc,Broadcast,ARP,42,Who has 192.168.1.34? Tell 192.168.1.1,1
1,2,"02:54:46,539389",192.168.1.37,192.168.1.1,DNS,67,Standard query 0x3a69 A dnzl.ru,1
2,3,"02:54:46,544422",192.168.1.37,192.168.1.1,DNS,80,Standard query 0x978b A adservice.google.com,1
3,4,"02:54:46,547250",192.168.1.37,192.168.1.1,DNS,79,Standard query 0x81fc A adservice.google.ru,1
4,5,"02:54:46,548343",192.168.1.1,192.168.1.37,DNS,296,Standard query response 0x3a69 A dnzl.ru A 77....,1


In [8]:
def ip2int(addr):                                                               
    try:
        return struct.unpack("!I", socket.inet_aton(addr))[0]                       
    except OSError:
        return 0

In [9]:
def encode(x, enc):
    try:
        return enc.transform(x)
    except ValueError:
        return -1

In [10]:
def preprocessing(packets, label_encoder):
    pd.options.mode.chained_assignment = None
    packets["Source"] = packets["Source"].apply(lambda ip : ip2int(ip)).astype(np.int32)
    packets["Destination"] = packets["Destination"].apply(lambda ip : ip2int(ip)).astype(np.int32)
    packets["Protocol"] = packets["Protocol"].apply(lambda x: encode(x, label_encoder))

# Подготовка данных

In [11]:
%%time
le = LabelEncoder().fit(packets["Protocol"])
X = packets[["Source", "Destination", "Protocol", "Length"]]
preprocessing(X, le)
y = packets["uid"]

CPU times: user 1.03 s, sys: 52 ms, total: 1.08 s
Wall time: 1.02 s


# Разделение на тренировочную и тестовую выборки

In [16]:
%%time
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=17)

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 12.6 ms


# Обучение

In [17]:
%%time
forest = RandomForestClassifier(n_jobs=-1, random_state=17)
forest.fit(X_train, y_train)
print(f1_score(y_holdout, forest.predict(X_holdout), average='macro'))

0.933995715999
CPU times: user 108 ms, sys: 8 ms, total: 116 ms
Wall time: 233 ms


# Проверка на случайных данных

In [18]:
p1 = packets_1.sample(n=100)[["Source", "Destination", "Protocol", "Length"]]
p2 = packets_2.sample(n=100)[["Source", "Destination", "Protocol", "Length"]]
p3 = packets_3.sample(n=100)[["Source", "Destination", "Protocol", "Length"]]
p4 = packets_4.sample(n=100)[["Source", "Destination", "Protocol", "Length"]]
preprocessing(p1, le)
preprocessing(p2, le)
preprocessing(p3, le)
preprocessing(p4, le)

In [19]:
print(np.mean(forest.predict_proba(p1), axis=0))
print(np.mean(forest.predict_proba(p2), axis=0))
print(np.mean(forest.predict_proba(p3), axis=0))
print(np.mean(forest.predict_proba(p4), axis=0))

[ 0.9425806   0.00271667  0.          0.05470273]
[ 0.00413333  0.99386667  0.002       0.        ]
[ 0.00874986  0.09746667  0.88226676  0.01151671]
[ 0.07156195  0.001       0.00260014  0.92483792]


# Сохранение обученной модели на диск

In [20]:
joblib.dump(forest, DATA_DIR + 'model.pkl') 
forest_1 = joblib.load(DATA_DIR + 'model.pkl') 

In [21]:
joblib.dump(le, DATA_DIR + 'label_encoder.pkl') 
label_encoder_1 = joblib.load(DATA_DIR + 'label_encoder.pkl') 