In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder# instantiate labelencoder object
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV# Create the parameter grid based on the results of random search 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

sns.set(rc={'figure.figsize':(11.7,8.27)})

In [2]:
#Load data

data_path = './malicious-connection-dataset';
train_bad_data_path = './malicious-connection-dataset/df_bad.csv';
train_good_data_path = './malicious-connection-dataset/df_good.csv';
classify_data_path = './malicious-connection-dataset/df_good.csv';

class SimpleLoader:
    
    def __init__ (self, is_training_session = True):
        self._is_training_session = is_training_session
    
    def load(self, data_path = '',
                train_bad_data_path = '',
                train_good_data_path = '',
                classify_data_path = ''):
        
        for dirname, _, filenames in os.walk(data_path):
            for filename in filenames:
                os.path.join(dirname, filename)
                
        df = None
        if(self._is_training_session):
            df_bad = pd.read_csv(train_bad_data_path, encoding='ISO-8859-2')
            df_bad.rename(columns={'Unnamed: 0':'unnamed'}, inplace=True)
            df_bad.drop('unnamed', axis=1, inplace=True)
            df_bad.insert(0, 'label', 0)

            df_good = pd.read_csv(train_good_data_path, encoding='ISO-8859-2')  
            df_good.rename(columns={'Unnamed: 0':'unnamed'}, inplace=True)
            df_good.drop('unnamed', axis=1, inplace=True)
            df_good.insert(0, 'label', 1)
            df = pd.concat([df_good, df_bad], ignore_index=True)
        else:
            df = pd.read_csv(classify_data_path, encoding='ISO-8859-2')
            df.rename(columns={'Unnamed: 0':'unnamed'}, inplace=True)
            df.drop('unnamed', axis=1, inplace=True)
        return df

    
#train_loader = SimpleLoader(True)
#df = train_loader.load(data_path=data_path, 
#                  train_bad_data_path=train_bad_data_path, 
#                  train_good_data_path = train_good_data_path)

classify_loader = SimpleLoader(False)
df_classify = classify_loader.load(data_path=data_path, 
                                   classify_data_path=classify_data_path)

In [3]:
class SimplePreProcessor:
    def __init__ (self):
        pass
    
    def preprocess_all(self, df):
        df = self.convert_time(df)
        df = self.convert_numericals(df)
        df = self.encode_categorical(df)
        df = self.normalize_features(df)
        return df
        
    def encode_categorical(self, df):
        cols = ['ip', 'port']
        le = LabelEncoder()
        df[cols] = df[cols].apply(lambda col: le.fit_transform(col))
        return df
    
    def normalize_features(self, df):
        cols = ['times', 'di', 'do', 'pi', 'po', 'ip', 'port']
        df[cols] = (df[cols]-df[cols].mean())/df[cols].std()
        return df

    def convert_time(self, df):
        df[['times']] = df[['times']].apply(self._time_converter, axis=0)
        return df
    
    def convert_numericals(self, df):
        df[['di']] = df[['di']].apply(self._num_converter, axis=0)
        df[['do']] = df[['do']].apply(self._num_converter, axis=0)
        df[['pi']] = df[['pi']].apply(self._num_converter, axis=0)
        df[['po']] = df[['po']].apply(self._num_converter, axis=0)
        return df
        
    def _num_converter(self, s):
        s = s.fillna(0).astype(str).str.split(",", expand = False).apply(
                lambda x : [int(y) for y in x]
            )
        s = s.apply(lambda x : np.std(x))
        return s
        
    def _time_converter(self, s):
        s = s.fillna(0).astype(str).str.split("|", expand = False).apply(
                lambda x : [[int(i) for i in y.split(',')] for y in x]
            )
        s = s.apply(lambda x : np.std(np.concatenate(x)))
        return s

#train_processor = SimplePreProcessor(True)
#preprocessor = SimplePreProcessor()

#df = preprocessor.preprocess_all(df)
#df_classify = preprocessor.preprocess_all(df_classify)

In [4]:
#Plot corellation
cols = ['label', 'times', 'di', 'do', 'pi', 'po', 'ip', 'port']
corr = df[cols].corr()

ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

NameError: name 'df' is not defined

In [5]:
class SimpleModel():
    def __init__ (self):
        self._feature_selector_model = None
        self._classifier_model = None
        
    def train(self, df):
        model, X_train, X_test, X_train, X_test, y_train, y_test = self._selective_train_test_split(df);
        self._feature_selector_model = model
        param_grid = {
        'bootstrap': [True],
        'max_depth': [2, 20, 40],
        'max_features': [1, 3, 7],
        'min_samples_leaf': [2, 4, 8],
        'min_samples_split': [5, 10, 10],
        'n_estimators': [5, 25, 50]
        }
        rf = RandomForestClassifier()
        clf = GridSearchCV(estimator = rf, param_grid = param_grid, 
                                  cv = 3, n_jobs = -1, verbose = 2)


        clf.fit(X_train, y_train)
        y_pred = (clf.predict(X_test))
        target_names = ['good', 'bad']
        self._classifier_model = clf
        
        return (classification_report(y_test, y_pred, target_names=target_names),
                accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred))

        
        
    def classify(self, df):
        if self._feature_selector_model is None or self._classifier_model is None:
            raise('Model appears to be untrained.')
            
        X = self._feature_selector_model.transform(df[['ip', 'port', 'times', 'di', 'do', 'pi', 'po']])
        y_pred = self._classifier_model.predict(X)
        return y_pred
            
        
    def _selective_train_test_split(self, df):
        #Train Test split
        #Feature selection via cheap classifier
        X = df[['ip', 'port', 'times', 'di', 'do', 'pi', 'po']]
        y = df[['label']]

        X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42, shuffle=True, stratify=y)

        #Feature selection via cheap classifier
        clf = ExtraTreesClassifier(n_estimators=50)
        clf = clf.fit(X_train, y_train)

        model = SelectFromModel(clf, prefit=True, threshold='0.5*mean')
        X_train = model.transform(X_train)
        X_test = model.transform(X_test)
        return model, X_train, X_test, X_train, X_test, y_train, y_test

#simple_model = SimpleModel()
#simple_model.train(df)

In [6]:
class SimplePipeline:
    def __init__(self):
        self._train_loader = SimpleLoader(True)
        self._classify_loader = SimpleLoader(False)
        self._preprocessor = SimplePreProcessor()
        self._simple_model = SimpleModel()

    def fit_csv(self, data_path, train_bad_data_path, train_good_data_path):
        df = self._train_loader.load(data_path=data_path, 
                  train_bad_data_path=train_bad_data_path, 
                  train_good_data_path = train_good_data_path)
        df = self._preprocessor.preprocess_all(df)
        self._simple_model = SimpleModel()
        return self._simple_model.train(df)


    def predict_csv(self, data_path, classify_data_path):
        df = self._classify_loader.load(classify_data_path=classify_data_path)
        df = self._preprocessor.preprocess_all(df)
        return self._simple_model.classify(df)

    def predict_df(self, df):
        df = self._preprocessor.preprocess_all(df)
        return self._simple_model.classify(df)

data_path = './malicious-connection-dataset';
train_bad_data_path = './malicious-connection-dataset/df_bad.csv';
train_good_data_path = './malicious-connection-dataset/df_good.csv';
classify_data_path = './malicious-connection-dataset/df_good.csv';

simple_pipeline = SimplePipeline()
report, accuracy, confusion = simple_pipeline.fit_csv(data_path, train_bad_data_path, train_good_data_path)
y_pred_csv = simple_pipeline.predict_csv(data_path, classify_data_path)


  clf = clf.fit(X_train, y_train)


Fitting 3 folds for each of 243 candidates, totalling 729 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 729 out of 729 | elapsed:  5.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


In [7]:
y_pred_df = simple_pipeline.predict_df(df_classify)

In [9]:
import pickle
filename = 'client/model.sav'
pickle.dump(simple_pipeline, open(filename, 'wb'))

In [6]:
df = pd.read_csv('malicious-connection-dataset/df_bad.csv', encoding='ISO-8859-2')

print(df.loc[0])

Unnamed: 0                                                    0
ip                                              141.8.224.169\n
port                                                     http\n
times         73951142,74099359|74100335,74261630|74262194,7...
di            0,268,0,0,0,0,268,0,0,0,0,268,0,0,0,0,0,0,268,...
do            0,0,2087,0,0,0,0,0,2091,0,0,0,0,1420,657,0,0,0...
pi            1,2,1,0,0,0,2,0,1,0,0,2,0,0,0,0,0,0,2,0,0,1,0,...
po            0,2,3,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...
type                        eb7c74c66f801abde07e0d1a72cbec79(1)
Name: 0, dtype: object
