In [1]:
!pip install scikit-learn==1.0.1
!pip install pandas 
!pip install psycopg2-binary
!pip install sqlalchemy
!pip install kfp numpy

import pandas as pd 
import pickle
import tensorflow as tf
import numpy as np 
import time
import os

from kfp import components

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

Collecting scikit-learn==1.0.1
  Using cached scikit_learn-1.0.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.7 MB)
Collecting joblib>=0.11
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.0.1 threadpoolctl-3.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting psycopg2-binary
  Using cached psycopg2_binar

2024-07-15 13:20:26.533055: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-15 13:20:26.936773: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
2024-07-15 13:20:26.936933: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
2024-07-15 13:20:27.041548: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.


In [None]:
def read_file() -> None:
    """Fetch labelled malware rows from PostgreSQL, preprocess them and publish them to S3.

    Steps, in order:
      1. Read the database credentials from the AWS Secrets Manager secret "DBCreds".
      2. Load every row of ``malware_data`` whose ``outcome`` is not 2.
      3. Drop the ``timestamp``/``uid`` columns, Box-Cox transform and z-score every
         feature column, and drop one feature out of each highly correlated pair.
      4. Split the result 80/20 into train/test sets.
      5. Assume the S3 access role, write the processed CSV and the four ``.npy``
         arrays under ``./tmp/malware`` and upload them to the ``malware-pipeline``
         bucket.

    All imports and helpers live inside the function so that it is self-contained
    (presumably because it is packaged as a Kubeflow pipeline component - TODO confirm).

    Returns:
        None. If the SQL query fails, the error is printed and the function
        returns early without uploading anything.

    Raises:
        Exception: whatever ``psycopg2.connect`` raised, re-raised after logging,
            when the database connection cannot be established.
        KeyError: if the secret lacks one of the expected keys, or the fetched
            table lacks the ``timestamp``, ``uid`` or ``outcome`` columns.
        Errors raised by boto3 (Secrets Manager, STS, S3) propagate unchanged.
    """
    import json
    import os

    import boto3
    import numpy as np
    import pandas as pd
    import psycopg2
    from scipy.special import boxcox
    from sklearn.model_selection import train_test_split

    SECRET_NAME = "DBCreds"
    REGION_NAME = "us-east-1"
    FETCH_QUERY = "SELECT * FROM malware_data where outcome != 2;"
    TARGET_COLUMN = "outcome"
    # Columns that must never be transformed or dropped as features. 'outcome' is the
    # label used for the split below; 'legitimate' is the label name the original
    # preprocessing skipped, kept here so a table that still has it behaves as before.
    LABEL_COLUMNS = (TARGET_COLUMN, "legitimate")
    # Columns removed before any numeric processing (ignored when absent).
    ID_COLUMNS = ("Name", "md5")
    BOXCOX_LAMBDA = 0.5
    CORRELATION_THRESHOLD = 0.6
    # Fixed seed so that re-running the step produces the same train/test split.
    SPLIT_SEED = 42
    BUCKET_NAME = "malware-pipeline"
    ROLE_ARN = 'arn:aws:iam::533267059960:role/aws-s3-access'
    SESSION_NAME = 'kubeflow-pipeline-session'

    def get_secret():
        """Return ``(username, password, host, port, dbname)`` read from the secret."""
        client = boto3.session.Session().client(
            service_name='secretsmanager',
            region_name=REGION_NAME,
        )
        # Errors propagate unchanged. The previous handler only re-raised, and it
        # referenced ClientError without importing it, turning failures into NameError.
        response = client.get_secret_value(SecretId=SECRET_NAME)
        secret_dict = json.loads(response['SecretString'])
        return (
            secret_dict['username'],
            secret_dict['password'],
            secret_dict['host'],
            secret_dict['port'],
            secret_dict['dbname'],
        )

    def zscore_normalization(df, name):
        """Standardise column ``name`` of ``df`` in place to zero mean / unit std."""
        mean = df[name].mean()
        sd = df[name].std()
        centered = df[name] - mean
        # A constant column has sd == 0; dividing would fill it with NaN, so keep
        # the centred values (all zeros) instead.
        df[name] = centered if sd == 0 else centered / sd

    def preprocess(df):
        """Return a transformed copy of ``df`` with redundant features removed.

        Every non-label column is Box-Cox transformed (lambda 0.5) and z-scored.
        Then, for each pair of features whose absolute correlation exceeds the
        threshold, the one that appears first in column order is dropped.
        """
        df = df.drop(columns=list(ID_COLUMNS), errors='ignore')
        feature_cols = [col for col in df.columns if col not in LABEL_COLUMNS]

        for col in feature_cols:
            # Box-Cox first to make the distribution closer to normal, then standardise.
            df[col] = boxcox(df[col], BOXCOX_LAMBDA)
            zscore_normalization(df, col)

        abs_corr = df[feature_cols].corr().abs()
        cols_to_drop = set()
        for first in feature_cols:
            for second in feature_cols:
                if first == second or second in cols_to_drop:
                    continue
                # Drop `first` when it is highly correlated with a feature that is
                # still being kept; comparisons with NaN are False, so they are kept.
                if abs_corr.loc[first, second] > CORRELATION_THRESHOLD:
                    cols_to_drop.add(first)
                    break

        print(f"Dropped correlated features: {sorted(cols_to_drop)}")
        return df.drop(columns=list(cols_to_drop))

    user, pswd, host, port, db = get_secret()

    # Connect to PostgreSQL
    try:
        conn = psycopg2.connect(dbname=db, user=user, password=pswd, host=host, port=port)
    except Exception as e:
        print(f"Failed to connect to PostgreSQL: {e}")
        # Re-raise instead of calling exit(): exit() would end a script with
        # status 0 (or shut down a notebook kernel), hiding the failure.
        raise
    print("Connected to PostgreSQL successfully.")

    try:
        df = pd.read_sql(FETCH_QUERY, conn)
    except Exception as e:
        # NOTE(review): the early return is kept from the original; consider
        # raising here too so a failed fetch is not reported as success.
        print(f"Failed to fetch data: {e}")
        return None
    finally:
        # Always release the connection, whether or not the query succeeded.
        conn.close()

    print(df.columns)

    df = df.drop(columns=['timestamp', 'uid'])
    df = preprocess(df)
    X = df.drop(columns=[TARGET_COLUMN])
    y = df[TARGET_COLUMN]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SPLIT_SEED
    )

    # Obtain temporary credentials for the S3 access role and build the S3 client.
    sts_client = boto3.client('sts')
    response = sts_client.assume_role(RoleArn=ROLE_ARN, RoleSessionName=SESSION_NAME)
    credentials = response['Credentials']
    s3_client = boto3.client(
        's3',
        aws_access_key_id=credentials['AccessKeyId'],
        aws_secret_access_key=credentials['SecretAccessKey'],
        aws_session_token=credentials['SessionToken'],
    )

    folder_path = './tmp/malware'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path, exist_ok=True)
        print(f"Folder '{folder_path}' created successfully.")
    else:
        print(f"Folder '{folder_path}' already exists.")

    # The CSV is stored locally as malware_data.csv but uploaded as malware_dataset.csv.
    csv_path = os.path.join(folder_path, "malware_data.csv")
    df.to_csv(csv_path)
    s3_client.upload_file(csv_path, BUCKET_NAME, "malware_dataset.csv")

    # Each split is saved as <name>.npy locally and uploaded under the same key.
    splits = (
        ("X_train.npy", X_train),
        ("y_train.npy", y_train),
        ("X_test.npy", X_test),
        ("y_test.npy", y_test),
    )
    for file_name, values in splits:
        local_path = os.path.join(folder_path, file_name)
        np.save(local_path, values)
        s3_client.upload_file(local_path, BUCKET_NAME, file_name)