In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import tensorflow as tf
import pickle
from scipy.special import boxcox

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from keras import callbacks

2024-06-27 13:43:15.037751: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def zscore_normalization(df, name):
    """Standardise column ``name`` of ``df`` in place (zero mean, unit variance).

    Args:
        df: pandas DataFrame that holds the column; it is modified in place.
        name: label of the numeric column to standardise.

    Returns:
        None. The standardised values are written back to ``df[name]``.

    The spread is measured with ``Series.std()`` (pandas' default sample
    standard deviation). When that value is zero (constant column) or NaN
    (fewer than two usable values) dividing by it would fill the column with
    NaN/inf, so in that case the column is only centred on its mean.
    """
    mean = df[name].mean()
    sd = df[name].std()

    centred = df[name] - mean
    # `sd > 0` is False for both 0.0 and NaN, which are exactly the two cases
    # where the division is not meaningful.
    df[name] = centred / sd if sd > 0 else centred

def preprocess(df, boxcox_lambda=0.5, corr_threshold=0.6):
    """Transform, standardise and de-correlate the feature columns of ``df``.

    Steps, in order:
      1. Drop the ``'Name'`` and ``'md5'`` columns (pandas raises ``KeyError``
         if either is absent).
      2. For every column other than ``'legitimate'``: apply
         ``scipy.special.boxcox`` with ``boxcox_lambda`` and then z-score the
         column via ``zscore_normalization``.
      3. Compute the pairwise correlation matrix and, walking the feature
         columns in order, drop a column when its absolute correlation with
         any other not-yet-dropped feature column exceeds ``corr_threshold``.
         Of a correlated pair, the column that appears first is the one dropped.

    Args:
        df: input pandas DataFrame; it is not modified (work happens on the
            copy returned by ``drop``).
        boxcox_lambda: lambda passed to ``scipy.special.boxcox``. Defaults to
            the previously hard-coded 0.5.
        corr_threshold: absolute-correlation cut-off above which a column is
            dropped. Defaults to the previously hard-coded 0.6.

    Returns:
        A new DataFrame with the transformed, retained columns; the
        ``'legitimate'`` column, when present, is left untouched.

    NOTE(review): all statistics (mean/std, correlations) are computed on
    whatever rows are passed in. The notebook's driver cell calls this before
    ``traintest_split``, so held-out rows influence the scaling and the column
    selection -- consider fitting these on the training rows only.
    NOTE(review): ``scipy.special.boxcox`` yields NaN for negative inputs;
    this assumes all feature values are non-negative -- TODO confirm.
    """
    df = df.drop(columns=['Name', 'md5'])

    feature_cols = [col for col in df.columns if col != 'legitimate']
    for col in feature_cols:
        df[col] = boxcox(df[col], boxcox_lambda)
        zscore_normalization(df, col)

    correlation_matrix = df.corr()
    cols_to_drop = set()
    for col in feature_cols:
        for other in feature_cols:
            if other == col or other in cols_to_drop:
                continue
            # NaN correlations (e.g. constant columns) compare False and are kept.
            if abs(correlation_matrix.loc[other, col]) > corr_threshold:
                cols_to_drop.add(col)
                break  # one strong partner is enough to drop `col`

    return df.drop(columns=list(cols_to_drop))


In [4]:
def traintest_split(df, test_size=0.2, random_state=0):
    """Split ``df`` into train/test features and labels.

    Args:
        df: pandas DataFrame containing the feature columns plus the
            ``'legitimate'`` label column.
        test_size: fraction (or absolute number) of rows held out for testing,
            forwarded to ``train_test_split``. Defaults to 0.2 as before.
        random_state: seed forwarded to ``train_test_split`` so that re-running
            the notebook reproduces the same split. Pass ``None`` to get a
            different random split on every call (the previous behaviour).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.drop(columns=['legitimate'])
    y = df['legitimate']
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [5]:
def predict(test_data, model_path='model.pkl'):
    """Load a pickled model and return its predictions for ``test_data``.

    Args:
        test_data: input passed unchanged to the loaded model's ``predict``.
        model_path: path of the pickle file to load. Defaults to
            ``'model.pkl'``, the file written by the training functions.

    Returns:
        Whatever ``model.predict(test_data)`` returns.

    NOTE: a later cell in this notebook defines ``predict`` again; the later
    definition is the one in effect after Run All. Keep only one of them.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file.
    # Only load model files produced by a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    return model.predict(test_data)

In [6]:
def train_random_forest(X_train, X_test, y_train, y_test,
                        model_path='model.pkl', random_state=0):
    """Fit a random forest, report timings and test metrics, and pickle it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for the metrics.
        model_path: where the fitted model is pickled. Defaults to
            ``'model.pkl'``; note that the other trainers in this notebook
            write to the same default path and will overwrite it.
        random_state: seed for ``RandomForestClassifier`` so results are
            reproducible (mirrors the seeded ``LogisticRegression`` used in
            this notebook). Pass ``None`` for an unseeded forest.

    Returns:
        The fitted ``RandomForestClassifier`` (previously nothing was
        returned, so existing callers are unaffected).

    Prints the training time, prediction time, accuracy, precision, recall
    and F1 score (scikit-learn defaults) on the test set.
    """
    model_type = "Random Forest"
    print(model_type, "classifier:")
    model = RandomForestClassifier(random_state=random_state)

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    time_taken_train = time.perf_counter() - start_train
    print(f"Time taken to train the {model_type} model: {time_taken_train} seconds")

    start_test = time.perf_counter()
    y_pred = model.predict(X_test)
    time_taken_test = time.perf_counter() - start_test
    print(f"Time taken to test the {model_type} model: {time_taken_test} seconds")

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 score:", f1_score(y_test, y_pred))

    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    return model

In [7]:
def train_logistic_regression(X_train, X_test, y_train, y_test,
                              model_path='model.pkl'):
    """Fit logistic regression, report timings and test metrics, and pickle it.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for the metrics.
        model_path: where the fitted model is pickled. Defaults to
            ``'model.pkl'``; note that the other trainers in this notebook
            write to the same default path and will overwrite it.

    Returns:
        The fitted ``LogisticRegression`` (previously nothing was returned,
        so existing callers are unaffected).

    Prints the training time, prediction time, accuracy, precision, recall
    and F1 score (scikit-learn defaults) on the test set.
    """
    model_type = "Logistic Regression"
    print(model_type, "classifier:")
    model = LogisticRegression(random_state=0, max_iter=1000)

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    time_taken_train = time.perf_counter() - start_train
    print(f"Time taken to train the {model_type} model: {time_taken_train} seconds")

    start_test = time.perf_counter()
    y_pred = model.predict(X_test)
    time_taken_test = time.perf_counter() - start_test
    print(f"Time taken to test the {model_type} model: {time_taken_test} seconds")

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 score:", f1_score(y_test, y_pred))

    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    return model

In [None]:
def train_ann(X_train, X_test, y_train, y_test, model_path='model.pkl'):
    """Train a small dense network as a binary classifier and pickle it.

    Architecture: two hidden ``Dense(64, relu)`` layers followed by a single
    sigmoid unit, trained with Adam and binary cross-entropy. (The previous
    version used a linear output with MAE loss, which is a regression set-up
    and makes the reported ``accuracy`` metric hard to interpret.)
    Assumes the labels are 0/1 -- TODO confirm against the dataset.

    Args:
        X_train, y_train: training features and labels; ``X_train.shape[1]``
            determines the input width.
        X_test, y_test: data passed to Keras as ``validation_data``.
        model_path: where the trained model is pickled. Defaults to
            ``'model.pkl'``, the path the other trainers also write to.

    Returns:
        The trained ``tf.keras`` model; the fit history is available as
        ``model.history`` (previously nothing was returned).
    """
    model_type = "ANN"
    input_shape = [X_train.shape[1]]
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units=64, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(units=64, activation='relu'),
        # Sigmoid keeps the output in [0, 1] so it can be read as a probability.
        tf.keras.layers.Dense(units=1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # summary() prints by itself and returns None; wrapping it in print()
    # used to emit a stray "None" line.
    model.summary()

    # Stop once val_loss has not improved for 5 epochs and roll back to the
    # best weights seen. tf.keras.callbacks is used so the callback comes from
    # the same Keras implementation as the model.
    earlystopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                     mode="min",
                                                     patience=5,
                                                     restore_best_weights=True)

    # NOTE(review): the test set doubles as the validation set here, so early
    # stopping is tuned on the data later used for evaluation. Consider
    # carving a validation split out of the training rows instead.
    start_time_train = time.perf_counter()
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              batch_size=256, epochs=60, callbacks=[earlystopping])
    time_taken_train = time.perf_counter() - start_time_train
    print(f"Time taken to train the {model_type} model: {time_taken_train} seconds")

    # NOTE(review): pickling a Keras model depends on the installed Keras
    # version supporting it; kept because predict() loads with pickle --
    # TODO confirm, or switch both sides to model.save / load_model.
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    return model


In [None]:
def train(file_path, sep=','):
    """Load a CSV file, preprocess it and return the train/test split.

    Args:
        file_path: path of the CSV file to read.
        sep: column delimiter forwarded to ``pd.read_csv``. Defaults to ``','``
            (pandas' default, i.e. the previous behaviour). The notebook's own
            dataset is read with ``sep='|'`` in the driver cell, so pass
            ``sep='|'`` for that file.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from ``traintest_split``.
        Previously the split was computed and then discarded, so the function
        had no usable result.
    """
    df = pd.read_csv(file_path, sep=sep)
    df = preprocess(df)
    return traintest_split(df)

In [None]:
def predict(test_data, model_path='model.pkl'):
    """Load a pickled model and return its predictions for ``test_data``.

    NOTE: this cell redefines ``predict``, which is already defined in an
    earlier cell of this notebook; this later definition is the one in effect
    after Run All. Keep only one of the two.

    Args:
        test_data: input passed unchanged to the loaded model's ``predict``.
        model_path: path of the pickle file to load. Defaults to
            ``'model.pkl'``, the file written by the training functions.

    Returns:
        Whatever ``model.predict(test_data)`` returns.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file.
    # Only load model files produced by a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    return model.predict(test_data)

In [None]:
def cross_validate(model, X, y):
    """Run 5-fold cross-validation of ``model`` on ``(X, y)`` and print a report.

    The report contains the elapsed wall-clock time, the per-fold scores and
    their mean and standard deviation. No ``scoring`` argument is passed to
    ``cross_val_score``, so the scores come from the estimator's default
    scorer; the output labels them as accuracy.

    Args:
        model: estimator handed to ``cross_val_score``.
        X: feature matrix.
        y: target labels.

    Returns:
        None; results are only printed.
    """
    began_at = time.time()
    fold_scores = cross_val_score(model, X, y, cv=5)
    elapsed = time.time() - began_at

    report_lines = (
        (f"Time taken to cross-validate the model: {elapsed} seconds",),
        ("Cross-validation scores:", fold_scores),
        ("Mean CV accuracy:", fold_scores.mean()),
        ("Standard deviation of CV accuracy:", fold_scores.std()),
    )
    for parts in report_lines:
        print(*parts)

In [8]:
# End-to-end run: load the data, preprocess, split, then train and evaluate.
# The dataset file is pipe-delimited, hence sep='|'.
df = pd.read_csv('MalwareData.csv', sep='|')
# Drops 'Name'/'md5', Box-Cox transforms and z-scores the features, and
# removes highly correlated columns. Note that `df` is rebound here, so the
# raw frame is no longer reachable under this name.
df = preprocess(df)
# Separate the 'legitimate' label from the features and hold out a test set.
X_train, X_test, y_train, y_test = traintest_split(df)
# Fits logistic regression, prints timings and test metrics, and pickles the
# model to model.pkl.
train_logistic_regression(X_train, X_test, y_train, y_test)

Logistic Regression classifier:
Time taken to train the Logistic Regression model: 0.3835327625274658 seconds
Time taken to test the Logistic Regression model: 0.004948139190673828 seconds
Accuracy: 0.9798623687069902
Precision: 0.9730991278712688
Recall: 0.959196028574888
F1 score: 0.9660975609756097
