## Preprocessing Script

**Expected output features:**
- SRC_PORT
- DST_PORT
- PROTOCOL
- TCP_FLAGS
- FLOW_DURATION_MILLISECONDS
- BYTES
- PKTS

**Expected input features** (Normalized Cisco Netflow format):
- srcport
- dstport
- prot
- tcp_flags
- dOctets
- dPkts
- First
- Last

**Input file format:** CSV (`text/csv`) or JSON (`application/json`, with the rows under a `features` key)

#### Library import

In [8]:
from catboost import CatBoostClassifier
import pandas as pd
import json
from io import StringIO
import sys
sys.path.insert(1, './app/')
from helpers import constants

### Functions

In [9]:
def model_fn(model_dir):
    """Load the trained machine learning model from disk.

    Args:
        model_dir: Path of the serialized model; it is read in CatBoost's
            binary ``cbm`` format.

    Returns: CatBoostClassifier instance populated from ``model_dir``.
    """
    classifier = CatBoostClassifier()
    classifier.load_model(model_dir, format='cbm')
    return classifier

In [10]:
def input_fn(input_data, content_type):
    """Deserialize a request payload and derive the model input features.

    Args:
        input_data: the request payload serialized in the content_type format.
            For 'text/csv' it is header-less comma-separated text; for
            'application/json' it is a JSON object whose 'features' key holds
            the list of rows.
        content_type: the request content_type, either 'text/csv' or
            'application/json'.

    Returns: a ``(data, record)`` tuple. ``data`` is the DataFrame holding the
        seven model features; ``record`` is the DataFrame with the raw parsed
        input, untouched by the preprocessing.

    Raises:
        ValueError: if ``content_type`` is not one of the supported types.
    """
    # Fail fast: without this check an unknown content type would leave
    # `record` unbound and crash later with an unrelated UnboundLocalError.
    if content_type not in ('text/csv', 'application/json'):
        raise ValueError(
            f"wrong content type: {content_type!r} "
            "(expected 'text/csv' or 'application/json')"
        )

    # The last configured column name is excluded because the incoming payload
    # does not carry it (presumably the prediction column -- confirm in
    # helpers.constants).
    input_columns = constants.netflow_v5_column_names[:-1]

    if content_type == 'text/csv':
        record = pd.read_csv(StringIO(input_data), names=input_columns, sep=',')
    else:
        record = pd.DataFrame(json.loads(input_data)['features'],
                              columns=input_columns)

    # Rename to standard feature names on a new frame so `record` keeps the
    # original column names for the response.
    data = record.rename(columns=constants.column_rename)

    # Map ports and protocol to their categorical labels; values missing from
    # the maps become 'Others'. Columns are reassigned explicitly because
    # `data.COL.fillna(..., inplace=True)` acts on a temporary Series and does
    # not update the frame under pandas Copy-on-Write.
    data['SRC_PORT'] = data['SRC_PORT'].map(constants.port_map).fillna('Others')
    data['DST_PORT'] = data['DST_PORT'].map(constants.port_map).fillna('Others')
    data['PROTOCOL'] = data['PROTOCOL'].map(constants.protocol_map).fillna('Others')

    # TCP flags are handled as strings rather than as numbers.
    data['TCP_FLAGS'] = data['TCP_FLAGS'].astype(str)

    # Flow duration is the difference between the Last and First columns.
    data['FLOW_DURATION_MILLISECONDS'] = data['Last'] - data['First']

    # Keep only the model features, in this fixed column order.
    data = data[['SRC_PORT', 'DST_PORT', 'PROTOCOL',
                 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS',
                 'BYTES', 'PKTS']]

    return data, record

In [11]:
def predict_fn(data, model):
    """Run the model on preprocessed data and translate the raw predictions.

    Args:
        data: post-processed input data for prediction
        model: Catboost model loaded in memory by model_fn

    Returns: a list with one ``constants.attack_map`` entry per prediction row
    """
    labels = []
    for row in model.predict(data):
        # Each prediction row must contain exactly one value; the unpacking
        # raises if it does not.
        (raw_value,) = row
        labels.append(constants.attack_map[raw_value])
    return labels

In [12]:
def output_fn(data, prediction):
    """Output function. Serializes records and their predictions to JSON.

    Args:
        data: DataFrame with the records the predictions belong to. It is
            not modified; the prediction column is added to a copy.
        prediction: a prediction result from predict_fn, one entry per row
            of ``data``

    Returns: JSON string (indented by 4) whose 'prediction' key holds one list
        per row: the row's values followed by its prediction.
    """
    # assign() returns a new frame, so the caller's DataFrame does not grow a
    # 'prediction' column as a side effect of serialization.
    labelled = data.assign(prediction=prediction)
    response = {'prediction': labelled.values.tolist()}
    return json.dumps(response, indent=4)

### Unit test

Import model

In [14]:
# Load the trained model_v12 artifact produced by the stage_3 model training.
model = model_fn(
    model_dir='../../components/netflow-ai/notebooks/stage_3/model_training/trained_models/model_v12'
)

Ingest data - preprocessing

In [15]:
# The same two sample records, serialized once as CSV and once as JSON.
input_data_csv = (
    "1234,3456,2345,12,13,3,120,1000,3078,0,80,2,32,17,3,12,3,24,123,4\n"
    "1234,3456,2345,12,13,2,280,1000,3078,0,80,2,2,6,3,12,3,24,123,4"
)
content_type_csv = "text/csv"

input_data_json = json.dumps({
    'features': [
        [1234, 3456, 2345, 12, 13, 3, 120, 1000, 3078, 0,
         80, 2, 32, 17, 3, 12, 3, 24, 123, 4],
        [1234, 3456, 2345, 12, 13, 2, 280, 1000, 3078, 0,
         80, 2, 2, 6, 3, 12, 3, 24, 123, 4],
    ],
})
content_type_json = "application/json"

# input_fn returns (model features, raw records); display the raw records.
data = input_fn(input_data_json, content_type_json)
data[1]

Unnamed: 0,srcaddr,dstaddr,nexthop,input,output,dPkts,dOctets,First,Last,srcport,dstport,pad1,tcp_flags,prot,tos,src_as,dst_as,src_mask,dst_mask,pad2
0,1234,3456,2345,12,13,3,120,1000,3078,0,80,2,32,17,3,12,3,24,123,4
1,1234,3456,2345,12,13,2,280,1000,3078,0,80,2,2,6,3,12,3,24,123,4


Make predictions

In [16]:
# Predict on the model-ready features (first element of input_fn's result).
prediction = predict_fn(data=data[0], model=model)
print(prediction)

['DoS', 'DoS']


Output values - ready to be sent

In [17]:
# Serialize the raw records (second element of input_fn's result) together
# with their predictions.
output = output_fn(data=data[1], prediction=prediction)

In [19]:
# Round-trip check: parse the JSON response back into a DataFrame for display.
predicted_rows = json.loads(output)['prediction']
pd.DataFrame(predicted_rows, columns=constants.netflow_v5_column_names)

Unnamed: 0,srcaddr,dstaddr,nexthop,input,output,dPkts,dOctets,First,Last,srcport,...,pad1,tcp_flags,prot,tos,src_as,dst_as,src_mask,dst_mask,pad2,prediction
0,1234,3456,2345,12,13,3,120,1000,3078,0,...,2,32,17,3,12,3,24,123,4,DoS
1,1234,3456,2345,12,13,2,280,1000,3078,0,...,2,2,6,3,12,3,24,123,4,DoS
