In [21]:
# !pip install scikit-learn kfp numpy
import pandas as pd 
import pickle
import tensorflow as tf
import numpy as np 
import time
import os

from kfp import components

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

### Read and Preprocess Data

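Downloads the malware dataset, drops the identifier columns (`Name`, `md5`), applies a Box-Cox transform (λ = 0.5) followed by z-score normalization to every feature, and removes redundant features that are highly correlated with another feature (|r| > 0.6). The processed data and an 80/20 train/test split are then uploaded to MinIO for the training step.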

In [24]:
def read_file() -> None:
    """Download the malware dataset, preprocess it, and upload the results to MinIO.

    The pipe-separated CSV is read from GitHub, the identifier columns
    (``Name``, ``md5``) are dropped, every feature is Box-Cox transformed
    (lambda = 0.5) and z-score normalized, and features that are strongly
    correlated with another remaining feature are removed. The processed
    frame and an 80/20 train/test split are written below ``./tmp/malware``
    and uploaded to the ``mlpipeline`` bucket under the ``malware/`` prefix.

    All imports and helpers are kept inside the function because it is
    packaged as a container component with ``func_to_container_op`` and
    therefore cannot rely on notebook-level names.

    The MinIO connection can be overridden with the ``MINIO_ENDPOINT``,
    ``MINIO_ACCESS_KEY`` and ``MINIO_SECRET_KEY`` environment variables; the
    previously hard-coded values remain the defaults.
    """
    import os
    import pandas as pd
    import numpy as np

    from minio import Minio
    from scipy.special import boxcox
    from sklearn.model_selection import train_test_split

    target_column = 'legitimate'
    correlation_threshold = 0.6
    # Fixed seed so the uploaded train/test split is the same on every run.
    split_seed = 42

    def zscore_normalization(df, name):
        """Standardize column ``name`` of ``df`` in place (zero mean, unit variance)."""
        mean = df[name].mean()
        sd = df[name].std()

        if sd == 0:
            # A constant column has no spread; dividing by zero would fill it
            # with NaN/inf and break every downstream estimator.
            df[name] = 0.0
            return

        df[name] = (df[name] - mean) / sd

    def preprocess(df):
        """Return a transformed copy of ``df`` with redundant features removed."""
        df = df.drop(columns=['Name', 'md5'])
        feature_columns = [column for column in df.columns if column != target_column]

        for column in feature_columns:
            # Box-Cox (lambda = 0.5) to pull the feature towards a normal shape,
            # then z-score so all features share a common scale.
            df[column] = boxcox(df[column], 0.5)
            zscore_normalization(df, column)

        correlation_matrix = df.corr()
        cols_to_drop = []

        # Drop a feature when it is strongly correlated with another feature
        # that is still being kept. The target column is never compared, so
        # this removes redundancy between features rather than filtering on
        # correlation with the label.
        for candidate in feature_columns:
            for other in feature_columns:
                if other == candidate or other in cols_to_drop:
                    continue
                if abs(correlation_matrix.at[other, candidate]) > correlation_threshold:
                    cols_to_drop.append(candidate)
                    break

        return df.drop(columns=cols_to_drop)

    file_path = 'https://raw.githubusercontent.com/tsimhadri-ews/internproject/malware-detection-0/src/MalwareData.csv'
    df = pd.read_csv(file_path, sep='|')
    df = preprocess(df)

    minio_client = Minio(
        os.environ.get("MINIO_ENDPOINT", "192.168.40.147:9000"),
        access_key=os.environ.get("MINIO_ACCESS_KEY", "minio"),
        secret_key=os.environ.get("MINIO_SECRET_KEY", "minio123"),
        secure=False
    )
    minio_bucket = "mlpipeline"

    X = df.drop(columns=[target_column])
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=split_seed
    )

    folder_path = './tmp/malware'

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created successfully.")
    else:
        print(f"Folder '{folder_path}' already exists.")

    def save_and_upload(values, name):
        """Write ``values`` to ``<folder_path>/<name>.npy`` and upload it as ``malware/<name>``."""
        local_path = os.path.join(folder_path, name + '.npy')
        np.save(local_path, values)
        minio_client.fput_object(minio_bucket, "malware/" + name, local_path)

    csv_path = os.path.join(folder_path, 'malware_data.csv')
    df.to_csv(csv_path)
    minio_client.fput_object(minio_bucket, "malware/malware_data", csv_path)

    save_and_upload(X_train, "X_train")
    save_and_upload(y_train, "y_train")
    save_and_upload(X_test, "X_test")
    save_and_upload(y_test, "y_test")
    
#read_file()

In [23]:
def train_op() -> None:
    """Train the candidate classifiers on the stored split and publish the results to MinIO.

    The train/test arrays uploaded by the preprocessing step are downloaded
    from the ``mlpipeline`` bucket. Six scikit-learn classifiers and a small
    Keras network are then fitted and scored on the test split. Each fitted
    model is pickled to ``./tmp/malware/models`` and uploaded under
    ``malware/models/``; the metrics table (one row per model) is saved with
    ``np.save`` and uploaded as ``model_metrics/metrics``.

    All imports and helpers are kept inside the function because it is
    packaged as a container component with ``func_to_container_op`` and
    therefore cannot rely on notebook-level names.

    The MinIO connection can be overridden with the ``MINIO_ENDPOINT``,
    ``MINIO_ACCESS_KEY`` and ``MINIO_SECRET_KEY`` environment variables; the
    previously hard-coded values remain the defaults.
    """
    import pickle
    import pandas as pd
    import numpy as np
    import os
    import time
    import tensorflow as tf

    from minio import Minio
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # Shared seed for the stochastic scikit-learn estimators so reruns are comparable.
    seed = 0

    #Define minIO Client

    minio_client = Minio(
        os.environ.get("MINIO_ENDPOINT", "192.168.40.147:9000"),
        access_key=os.environ.get("MINIO_ACCESS_KEY", "minio"),
        secret_key=os.environ.get("MINIO_SECRET_KEY", "minio123"),
        secure=False
    )
    minio_bucket = "mlpipeline"

    def ensure_folder(path):
        """Create ``path`` if it is missing and report which case applied."""
        if not os.path.exists(path):
            os.makedirs(path)
            print(f"Folder '{path}' created successfully.")
        else:
            print(f"Folder '{path}' already exists.")

    def fetch_array(name):
        """Download ``malware/<name>`` to ``/tmp/malware/<name>.npy`` and load it."""
        local_path = "/tmp/malware/" + name + ".npy"
        minio_client.fget_object(minio_bucket, "malware/" + name, local_path)
        return np.load(local_path)

    def score(y_true, y_pred):
        """Return ``[accuracy, f1, precision, recall]`` for the given predictions."""
        return [
            accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
        ]

    def store_model(model_object, file_stem):
        """Pickle ``model_object`` locally and upload it as ``malware/models/<file_stem>.pkl``."""
        local_path = os.path.join(models_path, file_stem + '.pkl')
        with open(local_path, 'wb') as f:
            pickle.dump(model_object, f)
        minio_client.fput_object(minio_bucket, "malware/models/" + file_stem + ".pkl", local_path)

    def fit_and_record(label, estimator, file_stem):
        """Fit ``estimator``, time and score it on the test split, then store it."""
        start_train = time.time()
        estimator.fit(X_train, y_train)
        end_train = time.time()

        start_test = time.time()
        predictions = estimator.predict(X_test)
        end_test = time.time()

        metric_rows.append(
            [label] + score(y_test, predictions) + [end_train - start_train, end_test - start_test]
        )
        store_model(estimator, file_stem)

    #Load training and testing data from minIO

    X_train = fetch_array("X_train")
    y_train = fetch_array("y_train")
    X_test = fetch_array("X_test")
    y_test = fetch_array("y_test")

    # One row per model; turned into a DataFrame once all models are scored.
    metric_columns = ["Model", "Accuracy", "F1", "Precision", "Recall", "Train_Time", "Test_Time"]
    metric_rows = []

    models_path = './tmp/malware/models'
    ensure_folder(models_path)

    # Classical models, trained and evaluated identically, in a fixed order.
    fit_and_record('Logistic Regression', LogisticRegression(random_state=seed, max_iter=1000), 'lrc')
    fit_and_record('Random Forest Classifier', RandomForestClassifier(random_state=seed), 'rfc')
    fit_and_record('Decision Tree', DecisionTreeClassifier(random_state=seed), 'dtc')
    fit_and_record('Support Vector Machine', SVC(), 'svc')
    fit_and_record('Gradient Boost', GradientBoostingClassifier(random_state=seed), 'gbc')
    fit_and_record('Gaussian Naive Bayes', GaussianNB(), 'gnb')

    #Artificial Neural Network

    input_shape = [X_train.shape[1]]
    start_train = time.time()

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(units=64, activation='relu'),
        # Sigmoid output: 'binary_crossentropy' expects probabilities by
        # default (from_logits=False), not the raw logits of a linear layer.
        tf.keras.layers.Dense(units=1, activation='sigmoid')
    ])

    model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

    model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=256, epochs=25)
    end_train = time.time()

    # Score the network on the test split like every other model, instead of
    # reading a training-set accuracy from an arbitrary epoch of the history.
    start_test = time.time()
    ann_predictions = (model.predict(X_test) > 0.5).astype(int).ravel()
    end_test = time.time()

    metric_rows.append(
        ['ANN'] + score(y_test, ann_predictions) + [end_train - start_train, end_test - start_test]
    )

    # NOTE(review): pickling a Keras model depends on the installed
    # TensorFlow/Keras version supporting it; `model.save(...)` is the
    # portable alternative but would change the stored artifact format.
    store_model(model, 'ann')

    #Save metrics to minIO
    metrics = pd.DataFrame(metric_rows, columns=metric_columns)

    metrics_path = './tmp/malware/model_metrics'
    ensure_folder(metrics_path)

    metrics_file = os.path.join(metrics_path, 'metrics.npy')
    # The frame mixes strings and floats, so np.save stores it as a pickled
    # object array; readers must call np.load(..., allow_pickle=True).
    np.save(metrics_file, metrics)
    minio_client.fput_object(minio_bucket, "model_metrics/metrics", metrics_file)
    
# train_op()

In [None]:
# Sample Code to test retrieval of models from minIO to predict


# minio_client = Minio(
#         "192.168.40.147:9000",
#         access_key="minio",
#         secret_key="minio123",
#         secure=False
#     )
# minio_bucket = "mlpipeline"

# minio_client.fget_object(minio_bucket,"malware/models/lrc.pkl","/tmp/malware/models/lrc.pkl")
# with open('/tmp/malware/models/lrc.pkl', 'rb') as f:
#     lrc_model = pickle.load(f)
    
# test = np.array([-0.36050749977695973,
#  -0.37628545401330016,
#  0.7319818837331914,
#  -0.2189044761644313,
#  -0.1275414123374678,
#  -0.10815786799751105,
#  0.8442979845829507,
#  -0.014265954069633165,
#  -0.0278588345453147,
#  -3.783301043369854,
#  -0.08601248526414634,
#  -8.146929400009729,
#  -0.19993743505992262,
#  -0.3001477619107106,
#  18.95581955051609,
#  -1.183864237669148,
#  0.36320900044578625,
#  -0.18911824926078025,
#  0.12754896101328386,
#  -0.006541198444001606,
#  1.6857085474025533,
#  0.7753814443160313,
#  0.2730052092310096,
#  0.11133533433974843,
#  -0.34515657909522784,
#  0.30883469505008077,
#  -1.912781258638494,
#  -0.38693628302552796,
#  -0.24618753485745942,
#  -0.3640673696238398,
#  0.23496484147320668,
#  -1.076479730529731,
#  0.020404402782747594,
#  -0.07738580403782455,
#  -0.027152620679608528,
#  0.5424616990219193]).reshape(1, -1)

# lrc_model.predict(test)

In [22]:
import kfp
from kfp import dsl

# Turn each notebook function into a container component and write its spec to YAML.
# scipy is listed explicitly because read_file imports scipy.special directly.
read_csv_op = components.func_to_container_op(
    func=read_file,
    output_component_file='preprocess.yaml',
    base_image='python:3.7',
    packages_to_install=['pandas', 'scikit-learn', 'scipy', 'kfp', 'numpy', 'minio'],
)

# Bound to a new name on purpose: rebinding `train_op` would replace the Python
# function with a component factory, so re-running this cell would then hand
# the factory (not the function) to func_to_container_op.
train_container_op = components.func_to_container_op(
    func=train_op,
    output_component_file='train.yaml',
    base_image='python:3.7',
    packages_to_install=['pandas', 'scikit-learn', 'numpy', 'minio', 'tensorflow'],
)

# Load both steps back from the generated YAML so the pipeline is built from
# exactly the files that were written above.
read_data_op = kfp.components.load_component_from_file('preprocess.yaml')
train_model_op = kfp.components.load_component_from_file('train.yaml')

# @dsl.pipeline(
#     name='Machine Learning Pipeline',
#     description='A pipeline to preprocess, train, and predict using sklearn and tensorflow'
# )

def ml_pipeline():
    """Run the preprocessing step, then the training step once it has finished."""
    preprocess_task = read_data_op()
    # The steps exchange data through MinIO rather than KFP inputs/outputs,
    # so the ordering has to be declared explicitly.
    train_model_op().after(preprocess_task)

# Compile the pipeline
kfp.compiler.Compiler().compile(ml_pipeline, 'malware_pipeline.yaml')