In [1]:
#importing required libraries
import os
import glob
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
#importing keras libraries
import keras
from keras.models import Model, Sequential
from tensorflow.keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.utils import np_utils
from keras.preprocessing import image
from keras.preprocessing.image import load_img, img_to_array
from keras.layers import Dense, Activation ,Flatten, Conv2D, MaxPool2D

#importing sklearn libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

In [2]:
# imagedir = "MalImg"
imagedir = "/home/sanjeev/DL_POC/MlaImg_Data/Malimg_data"

In [3]:
cur_dir = os.getcwd()
os.chdir(imagedir)  # the parent folder with sub-folders

# Get number of samples per family
list_fams = sorted(os.listdir(os.getcwd()), key=str.lower)  # vector of strings with family names
mal_family = list_fams[:]
mal_family.sort()
print("Malware Families : ", mal_family, "\n")
no_imgs = []  # No. of samples per family
for i in range(len(list_fams)):
    os.chdir(list_fams[i])
    len1 = len(glob.glob('*.png'))  # assuming the images are stored as 'png'
    no_imgs.append(len1)
    os.chdir('..')
num_samples = np.sum(no_imgs)  # total number of all samples

# Compute the labels
y = np.zeros(num_samples)
pos = 0
label = 0
for i in no_imgs:
    print ("Label:%2d\tFamily: %15s\tNumber of images: %d" % (label, list_fams[label], i))
    for j in range(i):
        y[pos] = label
        pos += 1
    label += 1
num_classes = label

Malware Families :  ['Adialer.C', 'Agent.FYI', 'Allaple.A', 'Allaple.L', 'Alueron.gen!J', 'Autorun.K', 'C2LOP.P', 'C2LOP.gen!g', 'Dialplatform.B', 'Dontovo.A', 'Fakerean', 'Instantaccess', 'Lolyda.AA1', 'Lolyda.AA2', 'Lolyda.AA3', 'Lolyda.AT', 'Malex.gen!J', 'Obfuscator.AD', 'Rbot!gen', 'Skintrim.N', 'Swizzor.gen!E', 'Swizzor.gen!I', 'VB.AT', 'Wintrim.BX', 'Yuner.A'] 

Label: 0	Family:       Adialer.C	Number of images: 122
Label: 1	Family:       Agent.FYI	Number of images: 116
Label: 2	Family:       Allaple.A	Number of images: 2949
Label: 3	Family:       Allaple.L	Number of images: 1591
Label: 4	Family:   Alueron.gen!J	Number of images: 198
Label: 5	Family:       Autorun.K	Number of images: 106
Label: 6	Family:     C2LOP.gen!g	Number of images: 200
Label: 7	Family:         C2LOP.P	Number of images: 146
Label: 8	Family:  Dialplatform.B	Number of images: 177
Label: 9	Family:       Dontovo.A	Number of images: 162
Label:10	Family:        Fakerean	Number of images: 381
Label:11	Family:   In

In [4]:
# Compute the features
width, height,channels = (224,224,3)
X = np.zeros((num_samples, width, height, channels))
cnt = 0
list_paths = [] # List of image paths
print("Processing images ...")
for i in range(len(list_fams)):
    for img_file in glob.glob(list_fams[i]+'/*.png'):
        #print("[%d] Processing image: %s" % (cnt, img_file))
        list_paths.append(os.path.join(os.getcwd(),img_file))
        img = image.load_img(img_file, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        X[cnt] = x
        cnt += 1
print("Images processed: %d" %(cnt))

os.chdir(cur_dir)

Processing images ...
Images processed: 9339


In [5]:
# Encoding classes (y) into integers (y_encoded) and then generating one-hot-encoding (Y)
encoder = LabelEncoder()
encoder.fit(y)
y_encoded = encoder.transform(y)
Y = np_utils.to_categorical(y_encoded)

In [6]:
vgg = Sequential()
vgg.add(Conv2D(3, (3, 3), padding='same', input_shape=(224, 224, 3)))
vgg.add(Activation('relu'))

_vgg = VGG16(weights='imagenet', include_top=False)

counter=0
for layer in _vgg.layers:
    layer.trainable = False
    counter+=1

print("VGG's ", counter , " layers are not added to the layer")
vgg.add(_vgg)
print("done")

VGG's  19  layers are not added to the layer
done


In [7]:
vgg.add(Flatten())

vgg.add(Dense(4096,activation='relu'))
vgg.add(Dense(4096,activation='relu'))
vgg.add(Dense(512,activation="relu"))

vgg.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [8]:
vgg.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 224, 224, 3)       84        
                                                                 
 activation (Activation)     (None, 224, 224, 3)       0         
                                                                 
 vgg16 (Functional)          (None, None, None, 512)   14714688  
                                                                 
 flatten (Flatten)           (None, 25088)             0         
                                                                 
 dense (Dense)               (None, 4096)              102764544 
                                                                 
 dense_1 (Dense)             (None, 4096)              16781312  
                                                                 
 dense_2 (Dense)             (None, 512)               2

In [9]:
%%time
feat = vgg.predict(X)
feat.shape

CPU times: user 2h 15min 3s, sys: 27min 53s, total: 2h 42min 56s
Wall time: 14min 52s


(9339, 512)

In [10]:
#np.save("/home/sanjeev/DL_Exp_Kajal/Finetune_stack_features/MalImg/vgg16_feat_malimg_512.npy", feat)

In [11]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [12]:
feat = np.load("/home/sanjeev/DL_Exp_Kajal/Finetune_stack_features/MalImg/vgg16_feat_malimg_512.npy")

In [13]:
y

array([ 0.,  0.,  0., ..., 24., 24., 24.])

In [14]:
#classification model creation using different classifiers
def classify(model, x, y):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=31)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    
    start_time = time.time()
    
    model.fit(X_train, y_train)   
    print("Training Accuracy: ", round(model.score(X_train, y_train)*100,3))
    
    end_time = time.time()
    training_time = end_time - start_time 
    print("Training Time:", training_time, "seconds")
    ###################################################################
    start_time = time.time() 
    print("Testing Accuracy: ", round(model.score(X_test, y_test)*100,3))
    end_time = time.time()
    training_time = end_time - start_time 
    print("Testing Time:", training_time, "seconds")
    ####################################################################
    acc = round(model.score(X_test, y_test)*100,3)
    
    score = cross_val_score(model, x, y, cv=5)
    print("Model Accuracy for cross validation:", round(np.mean(score)*100, 2))
    cv = round(np.mean(score)*100, 2)
    
    pred = model.predict(X_test)
    print('Precision:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[0]*100, 0)))
    print('Recall:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[1]*100, 0)))
    print('F1_Score:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[2]*100, 0)))
    ls = precision_recall_fscore_support(y_test, pred, average='micro')
    pre, rec, f1 = int(round(ls[0]*100, 0)), int(round(ls[1]*100, 0)), int(round(ls[2]*100, 0))
    print('*---------------------------*')
    return [acc, pre, rec, f1, cv]

classifiers = [KNeighborsClassifier(),
               SVC(random_state=31),
               RandomForestClassifier(random_state=31),
               MLPClassifier(random_state=31, max_iter=500),
               ExtraTreeClassifier(random_state=31),
               GaussianNB()]

ls_acc, ls_pre, ls_rec, ls_f1, ls_cv = [],[],[],[],[]

for classifier in classifiers:
    print(classifier)
    values = classify(classifier, feat, y)       
    
    ls_acc.append(values[0])
    ls_pre.append(values[1])
    ls_rec.append(values[2])
    ls_f1.append(values[3])
    ls_cv.append(values[4])
    
print([*ls_acc, *ls_pre, *ls_rec, *ls_f1, *ls_cv])

KNeighborsClassifier()
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  98.334
Training Time: 4.141347408294678 seconds
Testing Accuracy:  98.287
Testing Time: 0.9005074501037598 seconds
Model Accuracy for cross validation: 97.64
Precision: 98
Recall: 98
F1_Score: 98
*---------------------------*
SVC(random_state=31)
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  99.096
Training Time: 25.93344759941101 seconds
Testing Accuracy:  98.394
Testing Time: 1.117750644683838 seconds
Model Accuracy for cross validation: 98.27
Precision: 98
Recall: 98
F1_Score: 98
*---------------------------*
RandomForestClassifier(random_state=31)
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  100.0
Training Time: 24.547622680664062 seconds
Testing Accuracy:  96.788
Testing Time: 0.08360457420349121 seconds
Model Accuracy for cross validation: 96.53
Precision: 97
Recall: 97
F1_Score: 97
*---------------------------*
MLPClassifier(max_iter=500, random_state=31)
(8405, 512) (8405,)

In [15]:
# #classification model creation using different classifiers
# def classify(model, x, y):
#     X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=31)
#     print(X_train.shape, y_train.shape)
#     print(X_test.shape, y_test.shape)

#     model.fit(X_train, y_train)   
#     print("Training Accuracy: ", round(model.score(X_train, y_train)*100,3))
#     print("Testing Accuracy: ", round(model.score(X_test, y_test)*100,3))
#     acc = round(model.score(X_test, y_test)*100,3)
    
#     score = cross_val_score(model, x, y, cv=5)
#     print("Model Accuracy for cross validation:", round(np.mean(score)*100, 2))
#     cv = round(np.mean(score)*100, 2)
    
#     pred = model.predict(X_test)
#     print('Precision:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[0]*100, 0)))
#     print('Recall:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[1]*100, 0)))
#     print('F1_Score:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[2]*100, 0)))
#     ls = precision_recall_fscore_support(y_test, pred, average='micro')
#     pre, rec, f1 = int(round(ls[0]*100, 0)), int(round(ls[1]*100, 0)), int(round(ls[2]*100, 0))
#     print('*---------------------------*')
#     return [acc, pre, rec, f1, cv]

# classifiers = [KNeighborsClassifier(),
#                SVC(random_state=31),
#                RandomForestClassifier(random_state=31),
#                MLPClassifier(random_state=31, max_iter=600),
#                ExtraTreeClassifier(random_state=31),
#                GaussianNB()]

# ls_acc, ls_pre, ls_rec, ls_f1, ls_cv = [],[],[],[],[]

# for classifier in classifiers:
#     print(classifier)
#     values = classify(classifier, feat, y)       
    
#     ls_acc.append(values[0])
#     ls_pre.append(values[1])
#     ls_rec.append(values[2])
#     ls_f1.append(values[3])
#     ls_cv.append(values[4])
    
# print([*ls_acc, *ls_pre, *ls_rec, *ls_f1, *ls_cv])