In [11]:
def read_file(is_experiment: bool = False) -> None:
    import os
    import pandas as pd
    import numpy as np
    from minio import Minio
    from scipy.special import boxcox
    from sklearn.model_selection import train_test_split
    import boto3
    import json
    
    import psycopg2
    from psycopg2 import sql
    from sqlalchemy import create_engine, text
    import datetime
    
    def get_secret():

        secret_name = "DBCreds"
        region_name = "us-east-1"

        # Create a Secrets Manager client
        session = boto3.session.Session()
        client = session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

        try:
            get_secret_value_response = client.get_secret_value(
                SecretId=secret_name
            )
        except ClientError as e:
            raise e

        secret = get_secret_value_response['SecretString']
    
        # Parse the secret string to get the credentials
        secret_dict = json.loads(secret)
        username = secret_dict['username']
        password = secret_dict['password']
        host = secret_dict['host']
        port = secret_dict['port']
        dbname = secret_dict['dbname']

        return username, password, host, port, dbname


    (user,pswd,host,port,db) = get_secret()
    preprocess_df = {'version':1}
    
    def zscore_normalization(df, name):
        mean = df[name].mean()
        sd = df[name].std()
        df[name] = (df[name] - mean) / sd
        preprocess_df[name] = (mean, sd)
    def preprocess(df):
        df = df.drop(columns=['url'])
        preprocess_df['url'] = None
        
        for c in df.columns:
            if len(df[c].unique()) == 1:
                preprocess_df[c] = None
                df.drop(columns=[c], inplace=True)
        
        corr_matrix = df.corr()
        target_corr = corr_matrix['outcome']
        threshold=0.1
        drop_features = target_corr[abs(target_corr)<=threshold].index.tolist()
        for i in drop_features:
            preprocess_df[i] = None
        df.drop(columns=drop_features, inplace=True)
        
        for i in df.columns:
            if i != 'outcome':
                zscore_normalization(df, i)
                
        return df

    db_details = {
        'dbname': db,
        'user': user,
        'password': pswd,
        'host': host,
        'port': port
    }

    
    engine = create_engine(f'postgresql+psycopg2://{db_details["user"]}:{db_details["password"]}@{db_details["host"]}:{db_details["port"]}/{db_details["dbname"]}')

    df = pd.DataFrame()
            
    try:
        with engine.connect() as conn:
            query = text('SELECT * FROM phishing_data WHERE outcome != 2;')
            chunksize = 10000 

            chunks = pd.read_sql_query(query, conn, chunksize=chunksize)

            features_list = []

            for chunk in chunks:
                features_df = pd.json_normalize(chunk['features'])
                features_df['outcome'] = chunk['outcome']
                
                df = pd.concat([df, features_df], ignore_index=True)

    except Exception as e:
        print(f"Failed to fetch data: {e}")


    df = preprocess(df)
    
    X = df.drop(columns=['outcome'])
    y = df['outcome']
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    
    bucket_name="phishingpipeline"
    role_arn = 'arn:aws:iam::533267059960:role/aws-s3-access'
    session_name = 'kubeflow-pipeline-session'
    sts_client = boto3.client('sts')
    response = sts_client.assume_role(RoleArn=role_arn, RoleSessionName=session_name)
    credentials = response['Credentials']
    # Configure AWS SDK with temporary credentials
    s3_client = boto3.client('s3',
                      aws_access_key_id=credentials['AccessKeyId'],
                      aws_secret_access_key=credentials['SecretAccessKey'],
                      aws_session_token=credentials['SessionToken'])
    
    print(s3_client)
    
    folder_path = './tmp/phishing'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created successfully.")
    else:
        print(f"Folder '{folder_path}' already exists.")
        
    
    
    df.to_csv("./tmp/phishing/phishing_data.csv")
    np.save("./tmp/phishing/X_train.npy",X_train)
    np.save("./tmp/phishing/y_train.npy",y_train)
    np.save("./tmp/phishing/X_test.npy",X_test)  
    np.save("./tmp/phishing/y_test.npy",y_test)
    
        
    if(not is_experiment):
        
        try:
            with engine.connect() as conn:
                query = text('SELECT * FROM metadata_table_phishing ORDER BY version DESC LIMIT 1;')
                data = pd.read_sql_query(query, conn)
                version = data['version'].iloc[0] + 1
                print(version)
        except Exception as e:
            version = 1
        
        s3_client.upload_file("./tmp/phishing/phishing_data.csv", bucket_name, f"version{version}/phishing_dataset.csv")
        s3_client.upload_file("./tmp/phishing/X_train.npy", bucket_name, f"version{version}/X_train.npy")
        s3_client.upload_file("./tmp/phishing/y_train.npy", bucket_name, f"version{version}/y_train.npy")
        s3_client.upload_file("./tmp/phishing/X_test.npy", bucket_name, f"version{version}/X_test.npy")
        s3_client.upload_file("./tmp/phishing/y_test.npy", bucket_name, f"version{version}/y_test.npy")
        
        preprocess_df['version'] = version
        mean_df = pd.DataFrame([preprocess_df])
        meta_df = pd.DataFrame(data = [[version, datetime.datetime.now(), len(X.columns), json.dumps(df.dtypes.astype(str).to_dict()),mean_df.iloc[0].to_json()]], columns = ['version', 'date', 'features', 'types','factor'])
        meta_df.to_sql("metadata_table_phishing", engine, if_exists='append', index=False)
    else:
        s3_client.upload_file("./tmp/phishing/phishing_data.csv", bucket_name, f"experiment/phishing_dataset.csv")
        s3_client.upload_file("./tmp/phishing/X_train.npy", bucket_name, f"experiment/X_train.npy")
        s3_client.upload_file("./tmp/phishing/y_train.npy", bucket_name, f"experiment/y_train.npy")
        s3_client.upload_file("./tmp/phishing/X_test.npy", bucket_name, f"experiment/X_test.npy")
        s3_client.upload_file("./tmp/phishing/y_test.npy", bucket_name, f"experiment/y_test.npy")
    


In [12]:
def train_op(is_experiment: bool = False) -> None:
    import pickle
    import pandas as pd
    import numpy as np
    import json
    import os
    import time
    import tensorflow as tf
    import boto3
    from minio import Minio
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sqlalchemy import create_engine
    from sqlalchemy import create_engine, Table, Column, Float, Integer, String, MetaData, ARRAY
    from sqlalchemy import select, desc, insert, text
    from io import BytesIO
    
    import psycopg2
    from psycopg2 import sql
    from sqlalchemy import create_engine
    
    def get_secret():

        secret_name = "DBCreds"
        region_name = "us-east-1"

        # Create a Secrets Manager client
        session = boto3.session.Session()
        client = session.client(
            service_name='secretsmanager',
            region_name=region_name
        )

        try:
            get_secret_value_response = client.get_secret_value(
                SecretId=secret_name
            )
        except ClientError as e:
            raise e

        secret = get_secret_value_response['SecretString']
    
        # Parse the secret string to get the credentials
        secret_dict = json.loads(secret)
        username = secret_dict['username']
        password = secret_dict['password']
        host = secret_dict['host']
        port = secret_dict['port']
        dbname = secret_dict['dbname']

        return username, password, host, port, dbname


    (user,pswd,host,port,db) = get_secret()
    
    bucket_name="phishingpipeline"
    role_arn = 'arn:aws:iam::533267059960:role/aws-s3-access'
    session_name = 'kubeflow-pipeline-session'
    sts_client = boto3.client('sts')
    response = sts_client.assume_role(RoleArn=role_arn, RoleSessionName=session_name)
    credentials = response['Credentials']
    
    # Configure AWS SDK with temporary credentials
    s3_client = boto3.client('s3',
                      aws_access_key_id=credentials['AccessKeyId'],
                      aws_secret_access_key=credentials['SecretAccessKey'],
                      aws_session_token=credentials['SessionToken'])
    
    if(not is_experiment):
        db_details = {
            'dbname': db,
            'user': user,
            'password': pswd,
            'host': host,
            'port': port
        }



        # Connect to PostgreSQL
        try:
            conn = psycopg2.connect(**db_details)
            cursor = conn.cursor()
            print("Connected to PostgreSQL successfully.")
        except Exception as e:
            print(f"Failed to connect to PostgreSQL: {e}")
            exit()

        # Query to fetch data from the table
        try:
            fetch_query = "SELECT * FROM metadata_table_phishing ORDER BY date DESC LIMIT 1;"
            df = pd.read_sql(fetch_query, conn)
        except Exception as e:
            print(f"Failed to fetch data: {e}")

        if(not df.empty):
            version = df['version'][0]
        else:
            version = 1

        folder_path = f"version{version}"

        cursor.close()
        conn.close()

        print(f"version{version}/X_train.npy")

        response = s3_client.get_object(Bucket=bucket_name, Key=f"version{version}/X_train.npy")
        data = response['Body'].read()
        X_train = np.load(BytesIO(data))
        X_train = pd.DataFrame(X_train)

        response = s3_client.get_object(Bucket=bucket_name, Key=f"version{version}/y_train.npy")
        data = response['Body'].read()
        y_train = np.load(BytesIO(data))


        response = s3_client.get_object(Bucket=bucket_name, Key=f"version{version}/X_test.npy")
        data = response['Body'].read()
        X_test = np.load(BytesIO(data))
        X_test = pd.DataFrame(X_test)

        response = s3_client.get_object(Bucket=bucket_name, Key=f"version{version}/y_test.npy")
        data = response['Body'].read()
        y_test = np.load(BytesIO(data))
    
    else:
        version = 0
        folder_path = 'experiment'
    
        response = s3_client.get_object(Bucket=bucket_name, Key=f"experiment/X_train.npy")
        data = response['Body'].read()
        X_train = np.load(BytesIO(data))
        X_train = pd.DataFrame(X_train)

        response = s3_client.get_object(Bucket=bucket_name, Key=f"experiment/y_train.npy")
        data = response['Body'].read()
        y_train = np.load(BytesIO(data))


        response = s3_client.get_object(Bucket=bucket_name, Key=f"experiment/X_test.npy")
        data = response['Body'].read()
        X_test = np.load(BytesIO(data))
        X_test = pd.DataFrame(X_test)

        response = s3_client.get_object(Bucket=bucket_name, Key=f"experiment/y_test.npy")
        data = response['Body'].read()
        y_test = np.load(BytesIO(data))
    
    # Define dataframe to store model metrics
    metrics = pd.DataFrame(columns=["Version", "Model", "Accuracy", "F1", "Precision", "Recall", "Train_Time", "Test_Time"])
    models_path = './tmp/phishing/models'
    
    
    if not os.path.exists(models_path):
        os.makedirs(models_path)
        print(f"Folder '{models_path}' created successfully.")
    else:
        print(f"Folder '{models_path}' already exists.")
        
    
    #Logistic Regression
    start_train = time.time()
    lrc = LogisticRegression(random_state=0, max_iter=1000)
    lrc.fit(X_train, y_train)
    end_train = time.time()
    start_test = time.time()
    ypredlr = lrc.predict(X_test)
    end_test = time.time()
    accuracy = accuracy_score(y_test, ypredlr)
    f1 = f1_score(y_test, ypredlr)
    precision = precision_score(y_test, ypredlr)
    recall = recall_score(y_test, ypredlr)
    metrics.loc[len(metrics.index)] = [version,'lrc', accuracy, f1, precision, recall, end_train-start_train, end_test-start_test]
    with open('./tmp/phishing/models/lrc.pkl', 'wb') as f:
        pickle.dump(lrc, f)
    s3_client.upload_file("tmp/phishing/models/lrc.pkl", bucket_name, f"{folder_path}/lrc/model.pkl")
    
    
    #Random Forest Classifier
    start_train = time.time()
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    end_train = time.time()
    start_test = time.time()
    y_pred2=rfc.predict(X_test)
    end_test = time.time()
    accuracy = accuracy_score(y_test, y_pred2)
    f1 = f1_score(y_test, y_pred2)
    precision = precision_score(y_test, y_pred2)
    recall = recall_score(y_test, y_pred2)
    metrics.loc[len(metrics.index)] = [version, 'rfc', accuracy, f1, precision, recall, end_train-start_train, end_test-start_test]
    with open('./tmp/phishing/models/rfc.pkl', 'wb') as f:
        pickle.dump(rfc, f)
    s3_client.upload_file("tmp/phishing/models/rfc.pkl", bucket_name, f"{folder_path}/rfc/model.pkl")
    
    
    #Decision Tree
    start_train = time.time()
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    end_train = time.time()
    start_test = time.time()
    y_pred3=dtc.predict(X_test)
    end_test = time.time()
    accuracy = accuracy_score(y_test,y_pred3)
    f1 = f1_score(y_test, y_pred3)
    precision = precision_score(y_test, y_pred3)
    recall = recall_score(y_test, y_pred3)
    metrics.loc[len(metrics.index)] = [version, 'dtc', accuracy, f1, precision, recall, end_train-start_train, end_test-start_test]
    with open('./tmp/phishing/models/dtc.pkl', 'wb') as f:
        pickle.dump(dtc, f)
    s3_client.upload_file("tmp/phishing/models/dtc.pkl", bucket_name, f"{folder_path}/dtc/model.pkl")
    
    
    
    #Support Vector Machine
    start_train = time.time()
    svc = SVC()
    svc.fit(X_train, y_train)
    end_train = time.time()
    start_test = time.time()
    y_pred4=svc.predict(X_test)
    end_test = time.time()
    accuracy = accuracy_score(y_test,y_pred4)
    f1 = f1_score(y_test,y_pred4)
    precision = precision_score(y_test, y_pred4)
    recall = recall_score(y_test, y_pred4)
    metrics.loc[len(metrics.index)] = [version, 'svc', accuracy, f1, precision, recall, end_train-start_train, end_test-start_test]
    with open('./tmp/phishing/models/svc.pkl', 'wb') as f:
        pickle.dump(svc, f)
    s3_client.upload_file("tmp/phishing/models/svc.pkl", bucket_name, f"{folder_path}/svc/model.pkl")
    
    
    
    #Gradient Boost
    start_train = time.time()
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    end_train = time.time()
    start_test = time.time()
    y_pred5=gbc.predict(X_test)
    end_test = time.time()
    accuracy = accuracy_score(y_test,y_pred5)
    f1 = f1_score(y_test, y_pred5)
    precision = precision_score(y_test, y_pred5)
    recall = (recall_score(y_test, y_pred5))
    metrics.loc[len(metrics.index)] = [version, 'gbc', accuracy, f1, precision, recall, end_train-start_train, end_test-start_test]
    with open('./tmp/phishing/models/gbc.pkl', 'wb') as f:
        pickle.dump(gbc, f)
    s3_client.upload_file("tmp/phishing/models/gbc.pkl", bucket_name, f"{folder_path}/gbc/model.pkl")
    
    
    #Gaussian Naive Bayes
    start_train = time.time()
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    end_train = time.time()
    start_test = time.time()
    y_pred6=gnb.predict(X_test)
    end_test = time.time()
    accuracy = accuracy_score(y_test,y_pred6)
    f1 = f1_score(y_test, y_pred6)
    precision = precision_score(y_test,y_pred6)
    recall = recall_score(y_test, y_pred6)
    metrics.loc[len(metrics.index)] = [version, 'gnb', accuracy, f1, precision, recall, end_train-start_train, end_test-start_test]  
    with open('./tmp/phishing/models/gnb.pkl', 'wb') as f:
        pickle.dump(gnb, f)      
    s3_client.upload_file("tmp/phishing/models/gnb.pkl", bucket_name, f"{folder_path}/gnb/model.pkl")
    
    
    
    #Artificial Neural Network
    input_shape = [X_train.shape[1]]
    start_train = time.time()
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(units=64, activation='relu'),
        tf.keras.layers.Dense(units=1, activation='sigmoid')
    ])
    model.build()
    model.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])
    history = model.fit(X_train, y_train, validation_data=(X_test,y_test), batch_size=256, epochs=25)
    end_train=time.time()
    start_test = time.time()
    y_pred7 = model.predict(X_test)
    y_pred7 = (y_pred7 > 0.5).astype(np.int32)
    end_test = time.time()
    print(y_pred7)
    accuracy = accuracy_score(y_test,y_pred7)
    f1 = f1_score(y_test, y_pred7)
    precision = precision_score(y_test,y_pred7)
    recall = recall_score(y_test, y_pred7)
    # accuracy = history.history['accuracy'][11]
    metrics.loc[len(metrics.index)] = [version, 'ann', accuracy, f1, precision, recall, end_test-start_test, 0]
    with open('./tmp/phishing/models/ann.pkl', 'wb') as f:
        pickle.dump(model, f)
    s3_client.upload_file("tmp/phishing/models/ann.pkl", bucket_name, f"{folder_path}/ann/model.pkl")

    if(not is_experiment):
        db_details = {
            'dbname': db,
            'user': user,
            'password': pswd,
            'host': host,
            'port': port
        }

        insert_query = """
            INSERT INTO phishing_model_metrics (name, version, URI, in_use, accuracy, f1, precision, recall, train_time, test_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (name, version) DO NOTHING;
        """
        try:
            conn = psycopg2.connect(**db_details)
            cursor = conn.cursor()
            print("Connected to PostgreSQL successfully.")

            # Iterate through DataFrame rows and insert into the table
            for index, row in metrics.iterrows():
                cursor.execute(insert_query, (
                    row['Model'], 
                    row['Version'], 
                    f"s3://phishingpipeline/version{version}/{row['Model']}/model.pkl", 
                    False, 
                    row['Accuracy'], 
                    row['F1'], 
                    row['Precision'], 
                    row['Recall'], 
                    row['Train_Time'], 
                    row['Test_Time']
                ))

            conn.commit()
            print("Data inserted successfully.")

            cursor.close()
            conn.close()
            print("PostgreSQL connection closed.")
        except Exception as e:
            print(f"Failed to connect to PostgreSQL or insert data: {e}")
    else:
        print(metrics)
    

In [13]:
def run_functions() -> None:
    read_file(True)
    train_op(True)

In [14]:
run_functions()

<botocore.client.S3 object at 0x7fd627550160>
Folder './tmp/phishing' already exists.
Folder './tmp/phishing/models' created successfully.


2024-07-30 19:22:51.266928: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512F
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-30 19:22:51.268023: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
[[0]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]
INFO:tensorflow:Assets written to: ram://0598068a-3383-4e9e-8b70-5ffd53a9ae1d/assets
   Version Model  Accuracy        F1  Precision    Recall  Train_Time  \
0        0   lrc  0.938758  0.937332   0.944094  0.930667    1.441647   
1        0   rfc  0.964129  0.963588   0.962733  0.964444    2.105220   
2        0   dtc  0.939633  0.938061   0.947416  0.928889    0.216407   
3        0   svc  0.955381  0.954383   0.960396  0.948444    1.711653   
4        0   gbc  0.951006  0.950045   0.953447  0.946667    5.155077   
5        0   gnb  0.753281  0.684916   0.921805  0.544889    0.007924   
6        0   ann  0.954943  0.953708   0.964545  0.943111    0.890404   

   