# Read the CSV file from GitHub
# Save the preprocessed data to MinIO

In [None]:

def read_file():
    """Download the intrusion data set, preprocess it, and upload it to MinIO.

    Steps:
      1. Read the CSV from GitHub and preprocess it (see ``preprocess``).
      2. Split it 80/20 into training and testing sets.
      3. Write the full preprocessed frame as CSV and the four splits as
         ``.npy`` files under ``tmp/intrusion`` and upload each of them to
         the ``mlpipeline`` bucket under the ``intrusion/`` prefix.

    All imports and helpers live inside the function because it is packaged
    as a self-contained Kubeflow lightweight component.
    """
    import os

    import numpy as np
    import pandas as pd
    from minio import Minio
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OrdinalEncoder

    # Perform normalization
    def zscore_normalization(df, name):
        """Replace column ``name`` of ``df`` with its z-score, in place."""
        mean = df[name].mean()
        sd = df[name].std()
        # A constant column has sd == 0 and turns into all-NaN here; such
        # single-valued columns are removed afterwards in preprocess().
        df[name] = (df[name] - mean) / sd

    # Encode text
    def encode_text(df, name):
        """Replace column ``name`` of ``df`` with ordinal codes, in place."""
        enc = OrdinalEncoder()
        data = enc.fit_transform(df[name].values.reshape(-1, 1))
        df[name] = data.flatten()

    # Data preprocessing
    def preprocess(df):
        """Name the columns, scale/encode the features and prune weak ones.

        Numeric features are z-score normalized, all other features are
        ordinal-encoded. Features with a single distinct value are dropped,
        ``outcome`` becomes 0.0 for "normal." and 1.0 for everything else,
        and features whose correlation with ``outcome`` lies within
        [-0.05, 0.05] are dropped. Returns the resulting frame.
        """
        df.columns = [
            'duration',
            'protocol_type',
            'service',
            'flag',
            'src_bytes',
            'dst_bytes',
            'land',
            'wrong_fragment',
            'urgent',
            'hot',
            'num_failed_logins',
            'logged_in',
            'num_compromised',
            'root_shell',
            'su_attempted',
            'num_root',
            'num_file_creations',
            'num_shells',
            'num_access_files',
            'num_outbound_cmds',
            'is_host_login',
            'is_guest_login',
            'count',
            'srv_count',
            'serror_rate',
            'srv_serror_rate',
            'rerror_rate',
            'srv_rerror_rate',
            'same_srv_rate',
            'diff_srv_rate',
            'srv_diff_host_rate',
            'dst_host_count',
            'dst_host_srv_count',
            'dst_host_same_srv_rate',
            'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate',
            'dst_host_srv_diff_host_rate',
            'dst_host_serror_rate',
            'dst_host_srv_serror_rate',
            'dst_host_rerror_rate',
            'dst_host_srv_rerror_rate',
            'outcome',
        ]
        features = [col for col in df.columns if col != 'outcome']

        for col in features:
            if pd.api.types.is_numeric_dtype(df[col]):
                zscore_normalization(df, col)
            else:
                df[col] = df[col].astype(str)
                encode_text(df, col)

        # Single-valued features carry no information. The label column is
        # never dropped here, otherwise the steps below would have no target.
        constant = [col for col in features if len(df[col].unique()) == 1]
        df = df.drop(columns=constant)

        # Binary target: 0.0 = normal traffic, 1.0 = any kind of attack.
        df['outcome'] = (df['outcome'] != "normal.").astype(float)

        # Drop weakly correlated features by label. Dropping by a running
        # position while the frame shrinks would remove the wrong columns.
        # NaN correlations compare False and are therefore kept.
        correlation = df.drop(columns=['outcome']).corrwith(df['outcome'])
        weak = correlation[correlation.abs() <= 0.05].index
        return df.drop(columns=weak)

    # Create Minio client for object storage
    minio_client = Minio(
        "",  # IP will need to change on ec2 instance restart
        access_key="",
        secret_key="",
        secure=False
    )

    minio_bucket = "mlpipeline"

    # Read data file in from github
    # NOTE(review): read_csv treats the first line as a header by default; if
    # the file has no header row, the first record is lost - confirm and pass
    # header=None if so.
    file_path = 'https://raw.githubusercontent.com/tsimhadri-ews/internproject/intrusion-detection-0/src/kddcup.data_10_percent_corrected.csv'
    df = pd.read_csv(file_path)
    df = preprocess(df)

    # Data
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=["outcome"]), df["outcome"], test_size=0.2
    )

    # Create the local staging folder before anything is written into it
    folder_path = './tmp/intrusion'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created successfully.")
    else:
        print(f"Folder '{folder_path}' already exists.")

    # Save dataframe (best effort: a failed CSV upload is only reported)
    df.to_csv("tmp/intrusion/intrusion_data.csv")

    try:
        minio_client.fput_object(minio_bucket, "intrusion/intrusion_data.csv", "tmp/intrusion/intrusion_data.csv")
        print("Object uploaded successfully!")
    except Exception as e:
        print(f"Error uploading object: {e}")

    def save_to_s3(data, filename, access_point_alias, s3_key):
        """Save ``data`` to ``filename`` with np.save and upload that file
        to bucket ``access_point_alias`` under the key ``s3_key``."""
        np.save(filename, data, allow_pickle=True)
        minio_client.fput_object(access_point_alias, s3_key, filename)

    # Save training and testing data to Minio
    splits = (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
    for split_name, split_data in splits:
        save_to_s3(
            split_data,
            f"tmp/intrusion/{split_name}.npy",
            minio_bucket,
            f"intrusion/{split_name}.npy",
        )
   

# Read the training and testing data from MinIO
# Train the models
# Upload the trained models and their metrics to MinIO

In [23]:

def train_op():
    """Train and evaluate the intrusion classifiers and upload the results.

    Downloads the four ``.npy`` splits from the ``mlpipeline`` bucket, then
    trains six scikit-learn classifiers and one Keras network. For every
    model it records accuracy, F1, precision, recall, training time and
    prediction time, pickles the fitted model and uploads it under
    ``intrusion/models/``. The metrics table is finally saved with
    ``np.save`` and uploaded as ``model_metrics/metrics``.

    All imports and helpers live inside the function because it is packaged
    as a self-contained Kubeflow lightweight component.
    """
    import os
    import pickle
    import time

    import numpy as np
    import pandas as pd
    import tensorflow as tf
    from minio import Minio
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # Create Minio client for object storage
    minio_client = Minio(
        "",  # IP will need to change on ec2 instance restart
        access_key="",
        secret_key="",
        secure=False
    )

    minio_bucket = "mlpipeline"

    models_path = './tmp/intrusion/models'
    metrics_path = './tmp/intrusion/model_metrics'

    def ensure_dir(path):
        """Create ``path`` if it is missing and report what happened."""
        if not os.path.exists(path):
            os.makedirs(path)
            print(f"Folder '{path}' created successfully.")
        else:
            print(f"Folder '{path}' already exists.")

    def fetch_array(name):
        """Download ``intrusion/<name>.npy`` from the bucket and load it."""
        local_path = f"tmp/intrusion/{name}.npy"
        minio_client.fget_object(minio_bucket, f"intrusion/{name}.npy", local_path)
        # NOTE: allow_pickle=True executes pickled payloads; acceptable only
        # because the bucket is written by this pipeline's own first step.
        return np.load(local_path, allow_pickle=True)

    def record_metrics(model_name, y_pred, train_time, test_time):
        """Append one row of test-set scores for ``model_name``."""
        metrics.loc[len(metrics.index)] = [
            model_name,
            accuracy_score(y_test, y_pred),
            f1_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            train_time,
            test_time,
        ]

    def upload_model(model, filename):
        """Pickle ``model`` locally and upload it to intrusion/models/."""
        local_path = f"{models_path}/{filename}"
        with open(local_path, 'wb') as f:
            pickle.dump(model, f)
        minio_client.fput_object(minio_bucket, f"intrusion/models/{filename}", local_path)

    # Creating the models folder also creates tmp/intrusion for the downloads
    ensure_dir(models_path)

    # Load training and testing data from Minio bucket
    X_train = fetch_array("X_train")
    y_train = fetch_array("y_train")
    X_test = fetch_array("X_test")
    y_test = fetch_array("y_test")

    # Create data frame for storing metrics
    metrics = pd.DataFrame(columns=["Model", "Accuracy", "F1", "Precision", "Recall", "Train_Time", "Test_Time"])

    # scikit-learn classifiers: (metrics label, estimator, pickle file name)
    classifiers = [
        ('Logistic Regression', LogisticRegression(random_state=0, max_iter=1000), 'lrc.pkl'),
        ('Random Forest Classifier', RandomForestClassifier(), 'rfc.pkl'),
        ('Decision Tree', DecisionTreeClassifier(), 'dtc.pkl'),
        ('Support Vector Machine', SVC(), 'svc.pkl'),
        ('Gradient Boost', GradientBoostingClassifier(), 'gbc.pkl'),
        ('Gaussian Naive Bayes', GaussianNB(), 'gnb.pkl'),
    ]

    for model_name, classifier, filename in classifiers:
        start_train = time.time()
        classifier.fit(X_train, y_train)
        end_train = time.time()

        start_test = time.time()
        y_pred = classifier.predict(X_test)
        end_test = time.time()

        record_metrics(model_name, y_pred, end_train - start_train, end_test - start_test)
        upload_model(classifier, filename)

    # Artificial Neural Network
    input_shape = [X_train.shape[1]]

    start_train = time.time()

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(units=64, activation='relu'),
        # binary_crossentropy (from_logits=False by default) expects
        # probabilities, so the output unit needs a sigmoid.
        tf.keras.layers.Dense(units=1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=256, epochs=25)

    end_train = time.time()

    # Score the network on the test set exactly like the other models
    # instead of reporting a training-epoch accuracy and zero placeholders.
    start_test = time.time()
    probabilities = model.predict(X_test)
    end_test = time.time()
    y_pred_ann = (probabilities.ravel() >= 0.5).astype(y_test.dtype)

    record_metrics('ANN', y_pred_ann, end_train - start_train, end_test - start_test)

    # NOTE(review): pickling a Keras model only works on TensorFlow versions
    # that support it; model.save() is the documented way to persist one.
    # The pickle format and object key are kept so existing readers work.
    upload_model(model, 'ann.pkl')

    # Save model metrics
    ensure_dir(metrics_path)

    np.save("tmp/intrusion/model_metrics/metrics.npy", metrics)
    minio_client.fput_object(minio_bucket, "model_metrics/metrics", "tmp/intrusion/model_metrics/metrics.npy")
    

# Create each component in the pipeline 

In [16]:
from kfp import components

# Package each step as a lightweight container component and write its
# definition to a YAML file next to the notebook.
read_csv_op = components.func_to_container_op(
    func=read_file,
    output_component_file='read.yaml',
    base_image='python:3.7',
    packages_to_install=['pandas', 'scikit-learn', 'kfp', 'numpy', 'minio'],
)
# From here on the name train_op refers to the component factory, not the
# plain Python function defined above.
train_op = components.func_to_container_op(
    func=train_op,
    output_component_file='train.yaml',
    base_image='python:3.7',
    packages_to_install=['pandas', 'scikit-learn', 'numpy', 'minio', 'tensorflow'],
)


# Build the pipeline using the components 

In [18]:
import kfp
from kfp import dsl

# Load both components from the YAML definitions written when the
# components were created.
read_data_op = kfp.components.load_component_from_file('read.yaml')
train_op = kfp.components.load_component_from_file('train.yaml')


@dsl.pipeline(
    name='Machine Learning Pipeline',
    description='A pipeline to preprocess, train, and predict using sklearn and tensorflow'
)
def ml_pipeline():
    """Run the preprocessing step, then the training step."""
    # Use the component loaded above so this cell does not depend on a
    # variable (read_csv_op) that only exists if another cell was run.
    preprocess_task = read_data_op()
    # The steps exchange data through MinIO rather than through pipeline
    # inputs/outputs, so the ordering has to be declared explicitly.
    train_op().after(preprocess_task)


# Compile the pipeline
kfp.compiler.Compiler().compile(ml_pipeline, 'intrusionpipeline.yaml')

    