### Load Data

In [27]:
import sys
import inspect
import pandas as pd 
sys.path.insert(0, './scripts')

In [2]:
from dataprep import load_dataset, mal_categorizer

In [3]:
# Load the KDDTrain+ split via the project-local dataprep helper; with
# verbose=True it prints the row/column summary shown in the output below.
# NOTE(review): 'Field Names' is presumably the file that supplies the column
# names - confirm against scripts/dataprep.
train_df = load_dataset('KDDTrain+', 'Field Names', verbose=True)


 ************************ Reading the dataset KDDTrain+ *************************

 It has 125973 rows and 41 columns

 ************************* It has the following columns *************************
Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'is_host_login', 'is_guest_login', 'count',
       'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
       'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
       'dst_host_serror_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack

#### Categorize data

labels_5cat: [normal, DoS, R2L, U2R, probing]

labels_2cat: [normal, malicious]

In [8]:
# Run the dataprep categorizer, then preview the raw attack name next to the
# 5-class and 2-class labels (the preview below shows these three columns).
train_df = mal_categorizer(train_df)
train_df[['attack_name', 'labels_5cat', 'labels_2cat']].head(10)

Unnamed: 0,attack_name,labels_5cat,labels_2cat
0,normal,normal,normal
1,normal,normal,normal
2,neptune,DoS,malicious
3,normal,normal,normal
4,normal,normal,normal
5,neptune,DoS,malicious
6,neptune,DoS,malicious
7,neptune,DoS,malicious
8,neptune,DoS,malicious
9,neptune,DoS,malicious


#### Create Training set



In [9]:
def get_data(dset):
    """Split a categorized dataset into a feature frame X and a target y.

    The binary label column 'labels_2cat' becomes the target; the label
    columns 'attack_name' and 'labels_5cat' are removed from the features
    as well. The input dataframe is left untouched, so the function can be
    called repeatedly on the same frame.

    Args:
    dset : a dataframe {train_df,test_df} containing the columns
           'attack_name', 'labels_5cat' and 'labels_2cat'

    Returns:
    a tuple of pandas dataframe (X) and pandas series (y)

    Raises:
    KeyError: if any of the three label columns is missing from dset
    """
    # Read the target without pop(): pop() would delete the column from the
    # caller's dataframe and make a second call fail with KeyError.
    dset_y = dset['labels_2cat'].copy()
    dset_X = dset.drop(columns=['attack_name', 'labels_5cat', 'labels_2cat'])
    return dset_X, dset_y

In [10]:
# Split the categorized training frame into features (train_X) and the
# 'labels_2cat' target (train_y).
train_X, train_y = get_data(train_df)

In [37]:
# One-hot encode the three categorical columns. dummy_na=True also adds a
# '<column>_nan' indicator per encoded column (e.g. 'flag_nan' in the output
# below).
dummied = pd.get_dummies(train_X, columns=['protocol_type', 'service', 'flag'], dummy_na=True)

In [39]:
dummied.columns

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       ...
       'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_S1',
       'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH', 'flag_nan'],
      dtype='object', length=124)

## Preprocessor skeleton - in progress

In [40]:
class preprocessor:
    """Fit/transform style preprocessor that one-hot encodes object columns.

    `fit` records which columns of the given frame have dtype 'object';
    `transform` dummy-codes those columns with `pd.get_dummies`.

    Attributes set by the code:
    cols_to_filter       : value passed to the constructor; stored but not
                           yet used by fit/transform (skeleton in progress)
    was_fit              : True once `fit` has been called
    categorical_features : list of object-dtype column names found by `fit`
    """

    def __init__(self, cols_to_filter=None):
        self.cols_to_filter = cols_to_filter
        self.was_fit = False

    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data

        Args:
        X : dataframe to learn the categorical column names from
        y : unused, accepted for a fit(X, y) style call

        Returns:
        self, so fit and transform can be chained: p.fit(X).transform(X)
        """
        # Inspect the frame that was passed in, not a notebook-level global,
        # so the class works on any dataframe and outside this notebook.
        categorical_features = X.dtypes[X.dtypes == 'object'].index
        self.categorical_features = list(categorical_features)

        # Flag the instance as fitted only after the attributes exist.
        self.was_fit = True

        return self

    def transform(self, X, y=None):
        """transform the training or test data

        Args:
        X : dataframe to encode
        y : unused

        Returns:
        a new dataframe with the columns learned in `fit` replaced by dummy
        columns (including a '<column>_nan' indicator, since dummy_na=True)

        Raises:
        RuntimeError: if called before `fit`
        """
        if not self.was_fit:
            # The bare name `Error` is not defined in Python; raise a real
            # built-in exception so the message actually reaches the caller.
            raise RuntimeError("need to fit preprocessor first")

        # dummy code the columns learned during fit
        X_new = pd.get_dummies(X, columns=self.categorical_features, dummy_na=True)

        return X_new

In [48]:
# Fit the preprocessor on the training features and encode them.
# NOTE(review): cols_to_filter is stored on the instance, but fit/transform
# never read it - the encoded columns are chosen by dtype inside fit.
p = preprocessor(cols_to_filter=['protocol_type', 'service', 'flag'])
p.fit(train_X)
train_X_transformed = p.transform(train_X)

In [49]:
train_X_transformed.head(10)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,flag_nan
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


#### Working on feature selection method (Attribute Ratio)

In [None]:
from collections import OrderedDict
from operator import itemgetter 

def getAR(df, numericCols: list, labelCols: str):
    """Feature Selection - Attribute Ratio.

    For every column in `numericCols`, divide the largest per-class mean
    (classes given by the `labelCols` column) by the column's overall mean.

    Args:
    df          : dataframe holding the numeric columns and the label column
    numericCols : list of numeric column names to score; when empty, no
                  ratios are computed
    labelCols   : name of the single column to group by

    Returns:
    OrderedDict mapping column name -> ratio, sorted by ratio descending

    NOTE(review): a column whose overall mean is 0 presumably yields inf/NaN
    here, which would also disturb the sort order - confirm and decide how
    such columns should be scored.
    """
    ratios = {}

    if numericCols:
        overall_mean = dict(df[numericCols].mean())
        per_class_mean = df[numericCols + [labelCols]].groupby(labelCols).mean()
        peak_class_mean = dict(per_class_mean.max())

        for col in overall_mean:
            ratios[col] = peak_class_mean[col] / overall_mean[col]

    #TODO calculation for binary columns

    ranked = sorted(ratios.items(), key=itemgetter(1), reverse=True)
    return OrderedDict(ranked)

In [None]:
# Rank the numeric features by attribute ratio against the 5-class labels.
# NOTE(review): train_feature_df and numeric_cols are not defined in the
# cells shown above - confirm where they are created before running this.
getAR(train_feature_df, numeric_cols, 'labels_5cat')