In [None]:
!pip install pandas 
!pip install psycopg2-binary
!pip install sqlalchemy

In [None]:
def check_condition() -> bool:
    """Report whether the cyber dataset should be rebuilt.

    Three ``count(*)`` queries are run against PostgreSQL:

    * ``count`` - rows of ``cyber_data`` joined to ``cyber_outcomes`` on
      ``uid`` whose ``cyber_data.outcome`` is not NULL;
    * ``amount_incorrect`` - the same joined rows whose two ``outcome``
      values differ;
    * ``meta_count`` - rows in ``metadata_table_cyber``.

    When ``count >= 10`` or ``amount_incorrect > 2`` the ``cyber_outcomes``
    rows that have a matching ``cyber_data.uid`` are deleted. The delete is
    best effort: a failure is printed and otherwise ignored.

    Error handling:
        A failure of the ``count`` or ``amount_incorrect`` query propagates to
        the caller (previously it was printed and then surfaced as an
        ``UnboundLocalError`` further down). A failure of the metadata query
        is printed and treated as ``meta_count = 0``.
        NOTE(review): this assumes the usual cause is that
        ``metadata_table_cyber`` has not been created yet - confirm.

    Returns:
        True when ``count >= 10``, ``meta_count == 0`` or
        ``amount_incorrect > 2``; otherwise False.
    """
    import json

    import boto3
    import pandas as pd
    from sqlalchemy import create_engine, text
    from sqlalchemy.engine import URL

    def get_secret():
        """Return ``(username, password, host, port, dbname)`` from the ``DBCreds`` secret."""
        secret_name = "DBCreds"
        region_name = "us-east-1"

        # Create a Secrets Manager client
        session = boto3.session.Session()
        secrets_client = session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

        # Errors propagate unchanged. The former ``except ClientError: raise``
        # referenced a name that was never imported, so a real failure was
        # masked by a NameError.
        get_secret_value_response = secrets_client.get_secret_value(
            SecretId=secret_name
        )

        # Parse the secret string to get the credentials
        secret_dict = json.loads(get_secret_value_response['SecretString'])
        return (
            secret_dict['username'],
            secret_dict['password'],
            secret_dict['host'],
            secret_dict['port'],
            secret_dict['dbname'],
        )

    (user, pswd, host, port, db) = get_secret()

    # URL.create escapes special characters in the credentials; formatting the
    # password straight into the URL string breaks on '@', '/' or ':'.
    db_url = URL.create(
        'postgresql+psycopg2',
        username=user,
        password=pswd,
        host=host,
        port=int(port),
        database=db,
    )
    engine = create_engine(db_url, connect_args={'connect_timeout': 60})

    def fetch_count(sql):
        """Run a single-row ``count(*)`` query and return its ``count`` value."""
        with engine.connect() as conn:
            data = pd.read_sql_query(text(sql), conn)
        return data.iloc[0]['count']

    try:
        count = fetch_count('select count(*) from cyber_data as cyd join cyber_outcomes as cyo on cyo.uid = cyd.uid where cyd.outcome is not NULL;')
        amount_incorrect = fetch_count('select count(*) from cyber_data as cyd join cyber_outcomes as cyo on cyo.uid = cyd.uid where cyo.outcome!=cyd.outcome and cyd.outcome is not NULL;')

        try:
            meta_count = fetch_count('SELECT count(*) FROM metadata_table_cyber;')
        except Exception as e:
            print(e)
            meta_count = 0

        if count >= 10 or amount_incorrect > 2:
            try:
                with engine.connect() as conn:
                    conn.execute(text("DELETE FROM cyber_outcomes cyo USING cyber_data cyd WHERE cyo.uid = cyd.uid;"))
                    conn.commit()
            except Exception as e:
                print(e)

        # ``count >= 10`` already implies ``count != 0``.
        return bool(count >= 10 or meta_count == 0 or amount_incorrect > 2)
    finally:
        # Release pooled connections before the function returns.
        engine.dispose()

In [None]:
def read_file() -> None:
    """Build the next version of the cyber training dataset and publish it.

    Steps, in order:

    1. Read the database credentials from the ``DBCreds`` secret.
    2. Load every ``cyber_data`` row with a non-NULL ``outcome`` in chunks of
       10000 rows, flatten its ``features`` column with ``pd.json_normalize``
       and attach the ``outcome`` column.
    3. Preprocess the frame (see ``preprocess``), recording every
       transformation in ``preprocess_df``.
    4. Split the result 80/20 into train and test sets.
    5. Assume the S3 access role, write the CSV and the four ``.npy`` files
       under ``./tmp/cyber`` and upload them to ``version{N}/`` in the bucket.
    6. Append a row describing the new version to ``metadata_table_cyber``.

    Raises:
        RuntimeError: if the query returned no rows.
        Exception: an error raised while fetching the data is printed and
            re-raised, so the step never continues with an empty or partial
            dataset.
    """
    import base64
    import datetime
    import json
    import os
    import pickle

    import boto3
    import numpy as np
    import pandas as pd
    from scipy.special import boxcox
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OrdinalEncoder
    from sqlalchemy import create_engine, text
    from sqlalchemy.engine import URL

    def get_secret():
        """Return ``(username, password, host, port, dbname)`` from the ``DBCreds`` secret."""
        secret_name = "DBCreds"
        region_name = "us-east-1"

        # Create a Secrets Manager client
        session = boto3.session.Session()
        secrets_client = session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

        # Errors propagate unchanged. The former ``except ClientError: raise``
        # referenced a name that was never imported, so a real failure was
        # masked by a NameError.
        get_secret_value_response = secrets_client.get_secret_value(
            SecretId=secret_name
        )

        # Parse the secret string to get the credentials
        secret_dict = json.loads(get_secret_value_response['SecretString'])
        return (
            secret_dict['username'],
            secret_dict['password'],
            secret_dict['host'],
            secret_dict['port'],
            secret_dict['dbname'],
        )

    (user, pswd, host, port, db) = get_secret()

    # Record of what was done to each column; stored as JSON in the metadata
    # row. A value of None marks a column that was dropped.
    preprocess_df = {'version': 1}

    def zscore_normalization(df, name):
        """Standardise ``df[name]`` in place and record ``(mean, sd)``."""
        mean = df[name].mean()
        sd = df[name].std()
        df[name] = (df[name] - mean) / sd
        preprocess_df[name] = (mean, sd)

    def encode_text(df, name):
        """Ordinal-encode ``df[name]`` in place and record the pickled encoder (base64 text)."""
        enc = OrdinalEncoder()
        data = enc.fit_transform(df[name].values.reshape(-1, 1))
        df[name] = data.flatten()
        preprocess_df[name] = base64.b64encode(pickle.dumps(enc)).decode('utf-8')

    def preprocess(df):
        """Clean ``df`` in place and return it.

        * ``label`` is dropped (it is never used as a feature).
        * Columns with a single unique value are dropped.
        * ``int64``/``float64`` columns other than ``outcome`` get a Box-Cox
          transform (lambda 0.5) followed by z-score normalisation; every
          other column is ordinal-encoded.
        * Columns whose absolute correlation with ``outcome`` is <= 0.05 are
          dropped.
        """
        # Drop 'label' first. Previously it was transformed like a feature and
        # dropped afterwards, which raised KeyError whenever the constant-
        # column pass had already removed it.
        df.drop(columns=["label"], inplace=True, errors="ignore")
        preprocess_df['label'] = None

        # Iterate over a snapshot because columns are removed inside the loop.
        for c in list(df.columns):
            if len(df[c].unique()) == 1:
                df.drop(columns=[c], inplace=True)
                preprocess_df[c] = None

        for col in df.columns:
            if col == 'outcome':
                continue
            t = df[col].dtype
            if t == 'int64' or t == 'float64':
                df[col] = boxcox(df[col], 0.5)
                zscore_normalization(df, col)
            else:
                encode_text(df, col)

        corr_matrix = df.corr()
        target_corr = corr_matrix['outcome']
        threshold = 0.05
        drop_features = target_corr[abs(target_corr) <= threshold].index.tolist()
        for i in drop_features:
            preprocess_df[i] = None
        df.drop(columns=drop_features, inplace=True)

        return df

    # URL.create escapes special characters in the credentials; formatting the
    # password straight into the URL string breaks on '@', '/' or ':'.
    db_url = URL.create(
        'postgresql+psycopg2',
        username=user,
        password=pswd,
        host=host,
        port=int(port),
        database=db,
    )
    engine = create_engine(db_url)

    features_list = []
    try:
        with engine.connect() as conn:
            query = text('SELECT * FROM cyber_data WHERE outcome is not NULL;')
            chunksize = 10000

            for chunk in pd.read_sql_query(query, conn, chunksize=chunksize):
                features_df = pd.json_normalize(chunk['features'])
                # Assign by position so the result does not depend on the two
                # frames sharing the same index.
                features_df['outcome'] = chunk['outcome'].to_numpy()
                features_list.append(features_df)
    except Exception as e:
        print(f"Failed to fetch data: {e}")
        raise

    if not features_list:
        raise RuntimeError("No rows with a non-NULL outcome were found in cyber_data.")

    # One concat at the end instead of re-copying the growing frame per chunk.
    df = pd.concat(features_list, ignore_index=True)

    df = preprocess(df)

    X = df.drop(columns=["outcome"])
    y = df["outcome"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    bucket_name = "multiclasspipeline"
    role_arn = 'arn:aws:iam::533267059960:role/aws-s3-access'
    session_name = 'kubeflow-pipeline-session'
    sts_client = boto3.client('sts')
    response = sts_client.assume_role(RoleArn=role_arn, RoleSessionName=session_name)
    credentials = response['Credentials']
    # Configure AWS SDK with temporary credentials
    s3_client = boto3.client('s3',
                      aws_access_key_id=credentials['AccessKeyId'],
                      aws_secret_access_key=credentials['SecretAccessKey'],
                      aws_session_token=credentials['SessionToken'])

    print(s3_client)

    folder_path = './tmp/cyber'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created successfully.")
    else:
        print(f"Folder '{folder_path}' already exists.")

    # Next version = latest stored version + 1. Any failure (including an
    # empty or missing table) falls back to version 1, as before, but the
    # reason is now printed instead of being discarded.
    try:
        with engine.connect() as conn:
            query = text('SELECT * FROM metadata_table_cyber ORDER BY version DESC LIMIT 1;')
            data = pd.read_sql_query(query, conn)
            version = int(data['version'].iloc[0]) + 1
            print(version)
    except Exception as e:
        print(f"Could not read the latest version, starting at version 1: {e}")
        version = 1

    csv_path = f"{folder_path}/cyber_data.csv"
    df.to_csv(csv_path)
    s3_client.upload_file(csv_path, bucket_name, f"version{version}/cyber_dataset.csv")

    splits = (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
    for split_name, split_data in splits:
        local_path = f"{folder_path}/{split_name}.npy"
        np.save(local_path, split_data)
        s3_client.upload_file(local_path, bucket_name, f"version{version}/{split_name}.npy")

    preprocess_df['version'] = version
    mean_df = pd.DataFrame([preprocess_df])
    meta_df = pd.DataFrame(
        data=[[
            version,
            datetime.datetime.now(),
            len(X.columns),
            json.dumps(df.dtypes.astype(str).to_dict()),
            mean_df.iloc[0].to_json(),
        ]],
        columns=['version', 'date', 'features', 'types', 'factor'],
    )
    meta_df.to_sql("metadata_table_cyber", engine, if_exists='append', index=False)

    # Release pooled connections once the metadata row is written.
    engine.dispose()

#make some changes to the file 
#run pipeline 

In [None]:
def train_op() -> None:
    """Train the candidate classifiers on the latest dataset version and record their metrics.

    Steps, in order:

    1. Read the database credentials from the ``DBCreds`` secret and assume
       the S3 access role.
    2. Read the newest row (by ``date``) of ``metadata_table_cyber`` to find
       the dataset version; version 1 is used when the table is empty.
    3. Download ``X_train``/``y_train``/``X_test``/``y_test`` (``.npy``) from
       ``version{N}/`` in the bucket.
    4. Fit a random forest (``rfc``), decision tree (``dtc``), k-nearest
       neighbours (``knn``) and SGD classifier (``sgd``); compute accuracy
       and macro-averaged F1/precision/recall; time fit and predict.
    5. Pickle each fitted model to ``./tmp/cyber/models`` and upload it to
       ``version{N}/<name>/model.pkl``.
    6. Insert one row per model into ``cyber_model_metrics`` (``in_use`` is
       False; existing ``(name, version)`` rows are left untouched).

    Raises:
        Exception: connection, query and insert errors are printed and then
            re-raised. Previously a connection failure called ``exit()``
            (which exits with status 0) and the other failures were only
            printed, so the step could appear successful.
    """
    import json
    import os
    import pickle
    import time
    from io import BytesIO

    import boto3
    import numpy as np
    import pandas as pd
    import psycopg2
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier

    def get_secret():
        """Return ``(username, password, host, port, dbname)`` from the ``DBCreds`` secret."""
        secret_name = "DBCreds"
        region_name = "us-east-1"

        # Create a Secrets Manager client
        session = boto3.session.Session()
        secrets_client = session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

        # Errors propagate unchanged. The former ``except ClientError: raise``
        # referenced a name that was never imported, so a real failure was
        # masked by a NameError.
        get_secret_value_response = secrets_client.get_secret_value(
            SecretId=secret_name
        )

        # Parse the secret string to get the credentials
        secret_dict = json.loads(get_secret_value_response['SecretString'])
        return (
            secret_dict['username'],
            secret_dict['password'],
            secret_dict['host'],
            secret_dict['port'],
            secret_dict['dbname'],
        )

    (user, pswd, host, port, db) = get_secret()

    bucket_name = "multiclasspipeline"
    role_arn = 'arn:aws:iam::533267059960:role/aws-s3-access'
    session_name = 'kubeflow-pipeline-session'
    sts_client = boto3.client('sts')
    response = sts_client.assume_role(RoleArn=role_arn, RoleSessionName=session_name)
    credentials = response['Credentials']

    # Configure AWS SDK with temporary credentials
    s3_client = boto3.client('s3',
                      aws_access_key_id=credentials['AccessKeyId'],
                      aws_secret_access_key=credentials['SecretAccessKey'],
                      aws_session_token=credentials['SessionToken'])

    db_details = {
        'dbname': db,
        'user': user,
        'password': pswd,
        'host': host,
        'port': port
    }

    # Connect to PostgreSQL
    try:
        conn = psycopg2.connect(**db_details)
        print("Connected to PostgreSQL successfully.")
    except Exception as e:
        print(f"Failed to connect to PostgreSQL: {e}")
        raise

    # Query to fetch the newest metadata row
    try:
        fetch_query = "SELECT * FROM metadata_table_cyber ORDER BY date DESC LIMIT 1;"
        df = pd.read_sql(fetch_query, conn)
    except Exception as e:
        print(f"Failed to fetch data: {e}")
        raise
    finally:
        conn.close()

    # Plain int: psycopg2 cannot adapt numpy integer types in the INSERT below.
    version = int(df['version'].iloc[0]) if not df.empty else 1

    folder_path = f"version{version}"

    print(f"version{version}/X_train.npy")

    def load_array(name):
        """Download ``version{N}/<name>.npy`` from the bucket and load it with numpy."""
        obj = s3_client.get_object(Bucket=bucket_name, Key=f"{folder_path}/{name}.npy")
        return np.load(BytesIO(obj['Body'].read()))

    X_train = pd.DataFrame(load_array("X_train"))
    y_train = load_array("y_train")
    X_test = pd.DataFrame(load_array("X_test"))
    y_test = load_array("y_test")

    models_path = './tmp/cyber/models'

    if not os.path.exists(models_path):
        os.makedirs(models_path)
        print(f"Folder '{models_path}' created successfully.")
    else:
        print(f"Folder '{models_path}' already exists.")

    # Logistic regression ('lrc') was commented out in the previous version of
    # this step and is still not trained.
    candidates = [
        ('rfc', RandomForestClassifier()),
        ('dtc', DecisionTreeClassifier()),
        ('knn', KNeighborsClassifier(n_neighbors=2)),
        ('sgd', SGDClassifier(max_iter=1000, tol=1e-3)),
    ]

    # One tuple per model:
    # (name, accuracy, f1, precision, recall, train_time, test_time)
    metrics = []

    for model_name, estimator in candidates:
        start_train = time.time()
        estimator.fit(X_train, y_train)
        end_train = time.time()

        start_test = time.time()
        y_pred = estimator.predict(X_test)
        end_test = time.time()

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average="macro")

        print("Precision:", precision)
        print("Recall:", recall)

        metrics.append((
            model_name,
            float(accuracy),
            float(f1),
            float(precision),
            float(recall),
            end_train - start_train,
            end_test - start_test,
        ))

        # Persist the estimator that was just fitted. The previous code dumped
        # ``rfc`` for every model, so the dtc/knn/sgd artifacts were all
        # copies of the random forest.
        local_path = f"{models_path}/{model_name}.pkl"
        with open(local_path, 'wb') as f:
            pickle.dump(estimator, f)
        s3_client.upload_file(local_path, bucket_name, f"{folder_path}/{model_name}/model.pkl")

    insert_query = """
        INSERT INTO cyber_model_metrics (name, version, URI, in_use, accuracy, f1, precision, recall, train_time, test_time)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (name, version) DO NOTHING;
    """
    conn = None
    try:
        conn = psycopg2.connect(**db_details)
        print("Connected to PostgreSQL successfully.")

        with conn.cursor() as cursor:
            for model_name, accuracy, f1, precision, recall, train_time, test_time in metrics:
                cursor.execute(insert_query, (
                    model_name,
                    version,
                    f"s3://{bucket_name}/{folder_path}/{model_name}/model.pkl",
                    False,
                    accuracy,
                    f1,
                    precision,
                    recall,
                    train_time,
                    test_time
                ))

        conn.commit()
        print("Data inserted successfully.")
    except Exception as e:
        print(f"Failed to connect to PostgreSQL or insert data: {e}")
        raise
    finally:
        # Closing without a commit discards any partially inserted rows.
        if conn is not None:
            conn.close()
            print("PostgreSQL connection closed.")

In [None]:
def model_eval_deploy() -> None:
    """Deploy the most accurate model of the latest version when it is good enough.

    Steps, in order:

    1. Read the database credentials from the ``DBCreds`` secret.
    2. Take the version of the newest ``cyber_model_metrics`` row (by
       ``created_at``; version 1 when the table is empty), the row with the
       highest ``accuracy`` for that version, and the row currently flagged
       ``in_use``.
    3. If the best accuracy is below 0.85, print a message and stop.
    4. If the best model is already the ``in_use`` row, print a message and
       stop. Continuing would clear its own ``in_use`` flag and delete the
       service that serves it.
    5. Otherwise create a KServe ``InferenceService`` named
       ``<name>-version<N>-cyd`` from the model's ``uri``, flag the new row
       ``in_use``, clear the flag on the old row and delete the old
       inference service.

    Raises:
        RuntimeError: if there is no metrics row for the selected version.
        Exception: connection, query and update errors are printed and then
            re-raised. Previously a connection failure called ``exit()``
            (status 0), failed queries surfaced later as ``NameError``, and a
            failed ``in_use`` update was ignored before the old service was
            deleted.
    """
    import json

    import boto3
    import pandas as pd
    import psycopg2

    from kubernetes import client
    from kserve import KServeClient
    from kserve import constants
    from kserve import utils
    from kserve import V1beta1InferenceService
    from kserve import V1beta1InferenceServiceSpec
    from kserve import V1beta1PredictorSpec
    from kserve import V1beta1SKLearnSpec

    def get_secret():
        """Return ``(username, password, host, port, dbname)`` from the ``DBCreds`` secret."""
        secret_name = "DBCreds"
        region_name = "us-east-1"

        # Create a Secrets Manager client. Named ``secrets_client`` so it is
        # not confused with the ``kubernetes.client`` module imported above.
        session = boto3.session.Session()
        secrets_client = session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

        # Errors propagate unchanged. The former ``except ClientError: raise``
        # referenced a name that was never imported, so a real failure was
        # masked by a NameError.
        get_secret_value_response = secrets_client.get_secret_value(
            SecretId=secret_name
        )

        # Parse the secret string to get the credentials
        secret_dict = json.loads(get_secret_value_response['SecretString'])
        return (
            secret_dict['username'],
            secret_dict['password'],
            secret_dict['host'],
            secret_dict['port'],
            secret_dict['dbname'],
        )

    (user, pswd, host, port, db) = get_secret()

    db_details = {
        'dbname': db,
        'user': user,
        'password': pswd,
        'host': host,
        'port': port
    }

    # Minimum accuracy a model needs before it is deployed.
    min_accuracy = .85

    # Connect to PostgreSQL
    try:
        conn = psycopg2.connect(**db_details)
        print("Connected to PostgreSQL successfully.")
    except Exception as e:
        print(f"Failed to connect to PostgreSQL: {e}")
        raise

    try:
        try:
            latest = pd.read_sql(
                "SELECT * FROM cyber_model_metrics ORDER BY created_at desc LIMIT 1;",
                conn,
            )
            version = int(latest['version'].iloc[0]) if not latest.empty else 1

            # The version is passed as a bound parameter instead of being
            # formatted into the SQL text.
            model = pd.read_sql(
                "SELECT * FROM cyber_model_metrics where version=%s order by accuracy desc limit 1;",
                conn,
                params=(version,),
            )
            old_model = pd.read_sql(
                "SELECT * FROM cyber_model_metrics where in_use is true LIMIT 1;",
                conn,
            )
        except Exception as e:
            print(f"Failed to fetch data: {e}")
            raise

        if model.empty:
            raise RuntimeError(
                f"No rows in cyber_model_metrics for version {version}; nothing to evaluate."
            )

        accuracy = model['accuracy'].iloc[0]
        if not accuracy >= min_accuracy:
            print("Bad Accuracy: Email Dovelopers!")
            return

        new_name = model['name'].iloc[0]
        new_version = int(model['version'].iloc[0])

        has_old = not old_model.empty
        if has_old:
            old_name = old_model['name'].iloc[0]
            old_version = int(old_model['version'].iloc[0])

        name = f"{new_name}-version{version}-cyd"
        print(name)

        if has_old and old_name == new_name and old_version == new_version:
            print(f"{name} is already in use; nothing to deploy.")
            return

        namespace = utils.get_default_target_namespace()
        kserve_version = 'v1beta1'
        api_version = constants.KSERVE_GROUP + '/' + kserve_version

        isvc = V1beta1InferenceService(
            api_version=api_version,
            kind=constants.KSERVE_KIND,
            metadata=client.V1ObjectMeta(
                name=name,
                namespace=namespace,
                annotations={'sidecar.istio.io/inject': 'false'}
            ),
            spec=V1beta1InferenceServiceSpec(
                predictor=V1beta1PredictorSpec(
                    service_account_name="s3-service-account",
                    sklearn=V1beta1SKLearnSpec(
                        storage_uri=model['uri'].iloc[0]
                    )
                )
            )
        )

        kserve_client = KServeClient()
        kserve_client.create(isvc)

        update_query_new = """
            UPDATE cyber_model_metrics
            SET in_use = true
            WHERE name = %s and version = %s;
        """

        update_query_old = """
            UPDATE cyber_model_metrics
            SET in_use = false
            WHERE name = %s and version = %s;
        """
        try:
            with conn.cursor() as cursor:
                cursor.execute(update_query_new, (new_name, new_version))
                if has_old:
                    cursor.execute(update_query_old, (old_name, old_version))
            conn.commit()
        except Exception as e:
            # Stop here: the old service must not be deleted while the table
            # still marks it as the one in use.
            print(f"Failed to update in_use flags: {e}")
            raise

        if has_old:
            del_name = f"{old_name}-version{old_version}-cyd"

            # Delete the inference service of the previously deployed model
            kserve_client.delete(del_name, namespace)
    finally:
        # Runs on every exit path, including the early returns above.
        conn.close()