In [3]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import time
import numpy as np

from scipy.special import boxcox

### Preprocessing

In [None]:
# Standardise a numeric column: subtract its mean, divide by its standard deviation.
def zscore_normalization(df, name):
    """Overwrite ``df[name]`` with its z-scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that owns the column; it is modified in place.
    name : str
        Label of the column to standardise.

    Returns
    -------
    None
        The result is written back onto ``df``.
    """
    column = df[name]
    centre, spread = column.mean(), column.std()
    df[name] = (column - centre) / spread

def encode_text(df, name):
    """Overwrite ``df[name]`` with ordinal codes produced by ``OrdinalEncoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that owns the column; it is modified in place.
    name : str
        Label of the column to encode.

    Returns
    -------
    None
        The encoded values are written back onto ``df``.
    """
    # The encoder expects a 2-D input, so present the column as shape (n_rows, 1).
    as_single_feature = df[name].values.reshape(-1, 1)
    codes = OrdinalEncoder().fit_transform(as_single_feature)
    # Collapse back to 1-D before storing it as the column.
    df[name] = codes.flatten()

def column_names(path="kddcup.names.txt"):
    """Read the feature names from a ``name: type`` listing file.

    The first line of the file is skipped. On every remaining non-blank line
    the text before the first ``":"`` is taken as a column name. ``"outcome"``
    is appended as the final column.

    Parameters
    ----------
    path : str, optional
        Location of the names file. Defaults to ``"kddcup.names.txt"`` in the
        current working directory.

    Returns
    -------
    list of str
        Feature names in file order, followed by ``"outcome"``.

    Raises
    ------
    ValueError
        If a non-blank line after the first contains no ``":"``.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(path) as names_file:
        lines = names_file.read().splitlines()

    # splitlines() yields no trailing empty entry, so the last feature is kept
    # whether or not the file ends with a newline; blank lines are ignored.
    cols = [line[:line.index(":")] for line in lines[1:] if line.strip()]
    cols.append("outcome")
    return cols



def preprocess(df, corr_threshold=0.05):
    """Clean, encode and feature-select a raw frame, modifying it in place.

    Steps, in order:

    1. Rename the columns using ``column_names()``.
    2. Collapse ``outcome`` to a binary target: 0 where it equals
       ``"normal."``, 1 otherwise.
    3. For every feature column: integer/float columns get a Box-Cox
       transform (lambda = 0.5) followed by z-scoring; all other columns are
       ordinal-encoded.
    4. Drop columns that contain NaN, then feature columns holding a single
       distinct value.
    5. Drop feature columns whose correlation with ``outcome`` lies within
       ``[-corr_threshold, corr_threshold]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data whose column count matches ``column_names()``.
    corr_threshold : float, optional
        Absolute-correlation cut-off for step 5. Defaults to 0.05.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df.columns = column_names()

    # Binary target. Writing to separate 'virus'/'normal' columns would leave
    # 'outcome' as a multi-valued string label and add NaN-filled columns.
    df['outcome'] = (df['outcome'] != "normal.").astype(int)

    feature_cols = [col for col in df.columns if col != 'outcome']
    for col in feature_cols:
        dtype = df[col].dtype
        # Explicit dtype predicates: comparing a dtype with the builtin
        # `int` depends on the platform's default integer width.
        if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            df[col] = boxcox(df[col], 0.5)
            zscore_normalization(df, col)
        else:
            encode_text(df, col)

    # Columns that picked up NaN during the transforms above are removed here.
    df.dropna(inplace=True, axis=1)

    # Single-valued features carry no information. The target is never
    # removed, even if only one class is present.
    constant_cols = [
        col for col in df.columns
        if col != 'outcome' and len(df[col].unique()) == 1
    ]
    df.drop(columns=constant_cols, inplace=True)

    # Feature selection: choose the weak columns by *name* before dropping.
    # Dropping by a running positional index is wrong because positions shift
    # after each removal. NaN correlations compare False and are kept.
    correlation = df.corrwith(df['outcome'])
    weak_cols = [
        col for col, value in correlation.items()
        if col != 'outcome' and -corr_threshold <= value <= corr_threshold
    ]
    df.drop(columns=weak_cols, inplace=True)

    return df

In [None]:
def traintest_split(df, test_size=0.2, random_state=None):
    """Split a preprocessed frame into train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``"outcome"`` column; every other column is
        treated as a feature.
    test_size : float, optional
        Passed to ``train_test_split``. Defaults to 0.2.
    random_state : int or None, optional
        Passed to ``train_test_split`` so the split can be reproduced.
        The default ``None`` keeps the split unseeded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.drop(columns=["outcome"])
    y = df["outcome"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

### Model Training and Testing

In [10]:
import pickle
import tensorflow as tf

def training_basic_classifier(model, ann, X_train, X_test, y_train, y_test):
    """Train a model, report how it did, and pickle it to ``model.pkl``.

    Parameters
    ----------
    model : object with ``fit`` and ``predict``
        Estimator to train when ``ann`` is falsy. When ``ann`` is truthy this
        argument is ignored and replaced by a newly built Keras network.
    ann : bool
        Falsy: fit ``model`` and print accuracy, F1 and timings.
        Truthy: build and fit a small dense Keras network instead.
    X_train, X_test : array-like
        Training and held-out features.
    y_train, y_test : array-like
        Training and held-out targets.

    Returns
    -------
    None
        The trained model is written to ``model.pkl`` in the working directory.
    """
    if not ann:
        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train

        start_test = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_test

        # Label the output with the estimator actually used rather than a
        # fixed "RFC", so the report is right for any classifier.
        label = type(model).__name__
        print(f"{label} Accuracy: {accuracy_score(y_test, y_pred)}")
        print(f"{label} F1 Score: {f1_score(y_test, y_pred)}")
        print(f"{label} train time: {train_time:.3f}s, test time: {test_time:.3f}s")
    else:
        input_shape = [X_train.shape[1]]

        # NOTE(review): the output layer is a single linear unit trained with
        # MAE; confirm this is the intended setup for the target being used.
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(units=64, activation='relu', input_shape=input_shape),
            tf.keras.layers.Dense(units=64, activation='relu'),
            tf.keras.layers.Dense(units=1),
        ])

        model.build()

        # summary() prints the table itself; wrapping it in print() also
        # emitted its return value.
        model.summary()

        model.compile(optimizer='adam', loss='mae', metrics=['accuracy'])

        start_train = time.time()
        model.fit(
            X_train,
            y_train,
            validation_data=(X_test, y_test),
            batch_size=256,
            epochs=12,
        )
        train_time = time.time() - start_train
        print(f"ANN train time: {train_time:.3f}s")

    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)


In [None]:
def train(file_path):
    """Run the full pipeline on a CSV file: load, preprocess, split, fit, save.

    Parameters
    ----------
    file_path : str or path-like
        CSV file without a header row; columns are named via
        ``column_names()``.

    Returns
    -------
    None
        ``training_basic_classifier`` fits a ``RandomForestClassifier`` and
        writes it to disk.
    """
    df = pd.read_csv(file_path, names=column_names())
    df = preprocess(df)
    X_train, X_test, y_train, y_test = traintest_split(df)
    # Keyword arguments throughout: `ann` is the second positional parameter,
    # so passing the data positionally and then `ann=False` bound X_train to
    # `ann` and raised "got multiple values for argument 'ann'".
    training_basic_classifier(
        model=RandomForestClassifier(),
        ann=False,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

### Predict

In [None]:
def predict(test_data):
    """Return predictions for ``test_data`` from the model stored in ``model.pkl``.

    Parameters
    ----------
    test_data : array-like
        Passed unchanged to the loaded model's ``predict`` method.

    Returns
    -------
    Whatever the loaded model's ``predict`` returns.
    """
    # NOTE: unpickling can execute arbitrary code; only load a model.pkl that
    # comes from a trusted source.
    with open('model.pkl', 'rb') as model_file:
        trained_model = pickle.load(model_file)
    return trained_model.predict(test_data)