In [6]:
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import  GradientBoostingClassifier
from sklearn.metrics import (
        f1_score, classification_report, 
        confusion_matrix, roc_curve, 
        roc_auc_score, accuracy_score,
        log_loss)
from datetime import datetime
import time

In [23]:
def data_prepare(device_number, data_dir='C://Users//HP//dataset//'):
    """Load every available traffic CSV for one device into a single frame.

    For each known capture name (``benign``, ``mirai.*``, ``gafgyt.*``) the
    file ``<data_dir>/<device_number>.<name>.csv`` is read if it exists.
    Missing files are skipped silently.

    Parameters
    ----------
    device_number : int or str
        Device identifier used as the file-name prefix.
    data_dir : str, optional
        Directory holding the CSV files. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    pandas.DataFrame
        Rows of all files found, in ``file_names`` order, with a fresh
        0..n-1 index and an extra ``Class`` column holding the capture name
        the row came from. An empty frame when no file exists.
    """
    file_names = ["benign", "mirai.ack", "mirai.scan", "mirai.syn", "mirai.udp", "mirai.udpplain",
                  "gafgyt.combo", "gafgyt.junk", "gafgyt.scan", "gafgyt.tcp", "gafgyt.udp"]
    frames = []
    for file_name in file_names:
        # Build the path once so the existence check and the read can never
        # disagree about which file is meant.
        path = os.path.join(data_dir, str(device_number) + '.' + file_name + '.csv')
        if os.path.exists(path):
            d = pd.read_csv(path)
            d["Class"] = file_name
            frames.append(d)
    if not frames:
        # pd.concat rejects an empty list; keep the "no files -> empty frame" contract.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, axis=0, sort=False, ignore_index=True)

In [None]:
# Map "device_1" .. "device_9" to the frame data_prepare returns for that
# device number; entries are built in ascending device order.
dataframe = {
    "device_" + str(device_id): data_prepare(device_id)
    for device_id in range(1, 10)
}

In [None]:
def gradient_boosting_classifier(data, device_name, t_size):
    """Fit a GradientBoostingClassifier on one frame and print its evaluation.

    The frame is split with ``train_test_split`` (fixed ``random_state=47``),
    a default-configured classifier is fitted on the training part, and the
    held-out part is used for every printed metric.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Class`` column (the target); all remaining columns
        are used as features.
    device_name : str
        Accepted for call-site compatibility; not used inside the function.
    t_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    None
        Everything is reported through ``print``: the 20 largest feature
        importances (rounded to 3 decimals), accuracy, weighted F1 score,
        confusion matrix and classification report.
    """
    X = data.drop(columns=['Class'])
    y = data['Class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=t_size, random_state=47)

    clf = GradientBoostingClassifier()
    clf.fit(X_train, y_train)
    # Only hard predictions feed the metrics below, so class probabilities
    # are not computed here.
    y_pred = clf.predict(X_test)

    importances = pd.DataFrame({'feature': X.columns, 'importance': np.round(clf.feature_importances_, 3)})
    importances = importances.sort_values('importance', ascending=False).set_index('feature')

    print(importances.head(20))

    print('Accuracy: ')
    print(accuracy_score(y_test, y_pred))
    print('f1-score: ')
    print(f1_score(y_test, y_pred, average='weighted'))
    print('Confusion Matrix: ')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report: ')
    print(classification_report(y_test, y_pred))

In [None]:
# Run the evaluation for every loaded device with a 0.30 test fraction,
# framing each device's report between two separator lines.
separator = "-------------------------------------------------------------------------------"
for device_label, device_frame in dataframe.items():
    print(separator)
    print(device_label)
    gradient_boosting_classifier(device_frame, device_label, 0.30)
    print(separator)

In [None]:
def time_depend_test_size(ts):
    """Time one split/fit/predict pass over every frame in ``dataframe``.

    For each entry of the module-level ``dataframe`` mapping the frame is
    split with ``train_test_split`` (``random_state=47``), a default
    ``GradientBoostingClassifier`` is fitted, and both ``predict`` and
    ``predict_proba`` are run on the held-out part.

    Parameters
    ----------
    ts : float or int
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    float
        Seconds elapsed for the whole loop over all devices. The measurement
        also covers the per-device ``print`` and the splitting itself.
    """
    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    for k, frame in dataframe.items():
        print(k)
        X = frame.drop(columns=['Class'])
        y = frame['Class']
        X_train, X_test, y_train, _ = train_test_split(X, y, test_size=ts, random_state=47)
        clf = GradientBoostingClassifier()
        clf.fit(X_train, y_train)
        # Outputs are discarded on purpose: both prediction passes are kept
        # only because they are part of the workload being timed.
        clf.predict(X_test)
        clf.predict_proba(X_test)
    return time.perf_counter() - start_time

In [None]:
# One-row frame with a column per test-set fraction; each cell is the elapsed
# time returned by time_depend_test_size for that fraction, measured in the
# order listed.
time_df = pd.DataFrame({
    column_label: [time_depend_test_size(fraction)]
    for column_label, fraction in (
        ("0.20", 0.20),
        ("0.30", 0.30),
        ("0.50", 0.50),
        ("0.75", 0.75),
    )
})