In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split as splitter
from xgboost import XGBClassifier
import cProfile
import pstats
import os
import sys
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

__version__ = "0.1"
__author__ = 'Benedetto Marco Serinelli'


def train_and_test(dataset, data):
    for column in data.columns:
        if data[column].dtype == type(object):
            print(column)
            print(data[column])
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
            if column == 'result':
                print ("ok")
                print(column)
                print(data[column])
    y = data.result
    x = data.drop('result', axis=1)
    profile = cProfile.Profile()
    x_train, x_test, y_train, y_test = splitter(x, y, test_size=0.3)
    profile.enable()
    # train and test
    model = XGBClassifier(objective='multi:softprob', booster='gbtree', verbosity=0)
    model.fit(x_train, y_train, eval_metric='mlogloss')
    y_pred = model.predict(x_test)
    profile.disable()
    profile.dump_stats('output.prof')
    stream = open('result/' + dataset + '_profiling.txt', 'w')
    stats = pstats.Stats('output.prof', stream=stream)
    stats.sort_stats('cumtime')
    stats.print_stats()
    os.remove('output.prof')
    conf_matrix = confusion_matrix(y_test, y_pred)
    f = open('result/' + dataset + '_output.txt', 'w')
    sys.stdout = f
    print(conf_matrix)
    print(classification_report(y_test, y_pred))


if __name__ == "__main__":
    data = pd.read_csv('./dataset/kdd_prediction.csv', delimiter=',',
                       dtype={'protocol_type': str, 'service': str, 'flag': str, 'result': str})
    train_and_test('xgboost_kdd_', data)
    data = pd.read_csv('./dataset/kdd_prediction_NSL.csv', delimiter=',',
                       dtype={'protocol_type': str, 'service': str, 'flag': str, 'result': str})
    train_and_test('xgboost_nsl_kdd', data)


protocol_type
0        tcp
1        tcp
2        tcp
3        tcp
4        udp
        ... 
13446    tcp
13447    tcp
13448    tcp
13449    tcp
13450    tcp
Name: protocol_type, Length: 13451, dtype: object
service
0            smtp
1            http
2            http
3             ftp
4        domain_u
           ...   
13446        http
13447        nntp
13448        smtp
13449        nnsp
13450        link
Name: service, Length: 13451, dtype: object
flag
0          SF
1          SF
2          SF
3          SF
4          SF
         ... 
13446      SF
13447    RSTO
13448      SF
13449     REJ
13450      S0
Name: flag, Length: 13451, dtype: object
result
0        normal
1        normal
2        normal
3           r2l
4        normal
          ...  
13446    normal
13447       dos
13448    normal
13449       dos
13450       dos
Name: result, Length: 13451, dtype: object
ok
result
0        1
1        1
2        1
3        3
4        1
        ..
13446    1
13447    0
13448    1
13449   



