In [1]:
import os
import os.path
import glob
import time
import numpy as np
np.random.seed(1)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import decode_predictions
from keras.applications.vgg16 import VGG16
from keras.models import Sequential
from keras.layers import Dense, Activation ,Flatten, Conv2D
from keras.utils import np_utils
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input

#importing sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

In [2]:
# Root folder of the Malimg dataset (the parent folder holding one sub-folder per family).
# Set the MALIMG_DIR environment variable to use a different copy (e.g. "MalImg");
# when it is unset the original location is used.
imagedir = os.environ.get("MALIMG_DIR", "/home/sanjeev/DL_POC/MlaImg_Data/Malimg_data")

In [3]:
# Build the label vector `y` and the image tensor `X` from a dataset laid out as
# <imagedir>/<family>/<sample>.png (one sub-folder per malware family).
cur_dir = os.getcwd()
os.chdir(imagedir)  # the parent folder with sub-folders
try:
    dataset_root = os.getcwd()

    # Family names in case-insensitive order; a family's position in this list is
    # its integer label. Plain files sitting next to the family folders are skipped.
    list_fams = sorted(
        (entry for entry in os.listdir(dataset_root) if os.path.isdir(entry)),
        key=str.lower,
    )
    # Keep the same order as `list_fams` so that mal_family[label] names the right
    # family (a case-sensitive re-sort would swap e.g. 'C2LOP.gen!g' and 'C2LOP.P').
    mal_family = list_fams[:]
    print("Malware Families : ", mal_family, "\n")

    # Collect each family's PNG files once (assuming the images are stored as 'png').
    # Sorting makes the sample order -- and therefore any seeded train/test split --
    # independent of the file system's directory order.
    fam_files = [
        sorted(glob.glob(os.path.join(glob.escape(fam), '*.png')))
        for fam in list_fams
    ]
    no_imgs = [len(files) for files in fam_files]  # No. of samples per family
    num_samples = int(np.sum(no_imgs))  # total number of all samples (int even when empty)

    # Compute the labels: label k is repeated once per image of family k.
    num_classes = len(list_fams)
    for label, (fam, count) in enumerate(zip(list_fams, no_imgs)):
        print("Label:%2d\tFamily: %15s\tNumber of images: %d" % (label, fam, count))
    y = np.repeat(np.arange(num_classes, dtype=float), np.asarray(no_imgs, dtype=int))

    # Compute the features.
    width, height, channels = (224, 224, 3)
    # float32 halves the memory of this array compared with NumPy's float64 default.
    # NOTE(review): assumes Keras' default float32 image arrays -- confirm floatx is unchanged.
    X = np.zeros((num_samples, height, width, channels), dtype=np.float32)
    cnt = 0
    list_paths = []  # List of image paths, in the same order as the rows of X
    print("Processing images ...")
    for files in fam_files:
        for img_file in files:
            list_paths.append(os.path.join(dataset_root, img_file))
            # Keras expects target_size as (img_height, img_width).
            img = image.load_img(img_file, target_size=(height, width))
            x = image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            X[cnt] = x
            cnt += 1
    print("Images processed: %d" % (cnt))
finally:
    # Always return to the starting directory, even if loading fails part-way.
    os.chdir(cur_dir)

Malware Families :  ['Adialer.C', 'Agent.FYI', 'Allaple.A', 'Allaple.L', 'Alueron.gen!J', 'Autorun.K', 'C2LOP.P', 'C2LOP.gen!g', 'Dialplatform.B', 'Dontovo.A', 'Fakerean', 'Instantaccess', 'Lolyda.AA1', 'Lolyda.AA2', 'Lolyda.AA3', 'Lolyda.AT', 'Malex.gen!J', 'Obfuscator.AD', 'Rbot!gen', 'Skintrim.N', 'Swizzor.gen!E', 'Swizzor.gen!I', 'VB.AT', 'Wintrim.BX', 'Yuner.A'] 

Label: 0	Family:       Adialer.C	Number of images: 122
Label: 1	Family:       Agent.FYI	Number of images: 116
Label: 2	Family:       Allaple.A	Number of images: 2949
Label: 3	Family:       Allaple.L	Number of images: 1591
Label: 4	Family:   Alueron.gen!J	Number of images: 198
Label: 5	Family:       Autorun.K	Number of images: 106
Label: 6	Family:     C2LOP.gen!g	Number of images: 200
Label: 7	Family:         C2LOP.P	Number of images: 146
Label: 8	Family:  Dialplatform.B	Number of images: 177
Label: 9	Family:       Dontovo.A	Number of images: 162
Label:10	Family:        Fakerean	Number of images: 381
Label:11	Family:   In

In [4]:
# Map the class ids in `y` to consecutive integers (y_encoded), then expand
# those integers into a one-hot matrix (Y).
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
Y = np_utils.to_categorical(y_encoded)

In [5]:
# Hold out 10% of the images (and their one-hot labels) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=43,
    test_size=0.10,
)

In [6]:
# Report the shape of each piece of the train/test split.
for caption, split_part in (
    ("X_train.shape = ", X_train),
    ("X_test.shape = ", X_test),
    ("y_train.shape = ", y_train),
    ("y_test.shape = ", y_test),
):
    print(caption, split_part.shape)

X_train.shape =  (8405, 224, 224, 3)
X_test.shape =  (934, 224, 224, 3)
y_train.shape =  (8405, 25)
y_test.shape =  (934, 25)


In [7]:
# Shape of a single sample: every axis of X_train after the leading sample axis.
input_shape = tuple(X_train.shape[1:])

In [8]:
# Display the per-sample shape computed from X_train.
input_shape

(224, 224, 3)

In [9]:
# Small 3-filter convolution + ReLU in front of the VGG16 convolutional base.
# (Alternative kept from the original: input_shape=X_train.shape[1:].)
vgg = Sequential()
vgg.add(
    Conv2D(3, (3, 3), padding='same', input_shape=(224, 224, 3))
)
vgg.add(Activation('relu'))

# NOTE(review): weights=None builds VGG16 with randomly initialised weights; the
# original also carried a commented-out weights='imagenet' variant -- confirm
# which one is intended before trusting the extracted features.
_vgg = VGG16(weights=None, include_top=False)

# Freeze every layer of the base except the ones whose name starts with 'block4'.
for layer in _vgg.layers:
    layer.trainable = layer.name.startswith('block4')
counter = len(_vgg.layers)  # number of layers in the VGG16 base

print("VGG's ", counter , " layers are not added to the layer")
vgg.add(_vgg)
print("done")

VGG's  19  layers are not added to the layer
done


In [10]:
# Head on top of the VGG16 base: flatten the feature maps, then project them
# onto a 512-unit ReLU layer whose output is used as the feature vector.
for head_layer in (Flatten(), Dense(512, activation='relu')):
    vgg.add(head_layer)

In [11]:
# Freeze all block4 layers of the nested model at vgg.layers[2]
# (presumably the VGG16 base, given the order in which layers were added).
vgg16_base = vgg.layers[2]
for block4_name in ('block4_conv1', 'block4_conv2', 'block4_conv3', 'block4_pool'):
    vgg16_base.get_layer(block4_name).trainable = False

In [12]:
# Make the block1 layers of the nested model at vgg.layers[2] trainable
# (presumably the VGG16 base, given the order in which layers were added).
vgg16_base = vgg.layers[2]
for block1_name in ('block1_conv1', 'block1_conv2', 'block1_pool'):
    vgg16_base.get_layer(block1_name).trainable = True

In [13]:
# vgg.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# Compile with Adam at a small learning rate (1e-4) instead of the 'adam' string default.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
vgg.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=adam_optimizer,
)

In [14]:
%%time
# Run every image in X through the network; the model's last layer is Dense(512),
# so each image yields one 512-dimensional feature vector.
# NOTE(review): no vgg.fit(...) call is visible before this point in the notebook,
# so these features appear to come from an untrained network -- confirm.
feat = vgg.predict(X)
# Display (number of images, feature length).
feat.shape

CPU times: user 2h 10min 39s, sys: 29min 38s, total: 2h 40min 18s
Wall time: 7min 32s


(9339, 512)

In [15]:
# Persist the extracted feature matrix so later cells can reload it without re-running the network.
features_file = "/home/sanjeev/DL_Exp_Kajal/Finetune_stack_features/MalImg/vgg16_finetuned_feat_malimg_512.npy"
np.save(features_file, feat)

In [4]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [5]:
# Reload the feature matrix previously written with np.save.
features_file = "/home/sanjeev/DL_Exp_Kajal/Finetune_stack_features/MalImg/vgg16_finetuned_feat_malimg_512.npy"
feat = np.load(features_file)

In [6]:
# Display the integer class labels (one entry per image) that are paired with `feat` below.
y

array([ 0.,  0.,  0., ..., 24., 24., 24.])

In [7]:
# Fit a classifier on the extracted features and report its evaluation metrics
def classify(model, x, y, test_size=0.10, random_state=31, cv=5, average='micro'):
    """Fit `model` on a train/test split of (x, y) and report its metrics.

    The data is split once with `train_test_split`, the model is fitted on the
    training part, and accuracy, precision, recall and F1 are measured on the
    held-out part. A separate k-fold cross-validation is then run on the full
    (x, y) with `cross_val_score`.

    Parameters
    ----------
    model : estimator exposing fit / score / predict (e.g. a scikit-learn classifier).
    x : feature matrix, one row per sample.
    y : class label per sample.
    test_size : fraction of samples held out for testing (default 0.10).
    random_state : seed for the train/test split (default 31).
    cv : number of cross-validation folds (default 5).
    average : averaging mode passed to `precision_recall_fscore_support`
        (default 'micro'). It must be a mode that yields scalar scores, not None.
        Note that for single-label multiclass data micro-averaged precision,
        recall and F1 are all equal to accuracy; 'macro' or 'weighted' give
        per-class-sensitive figures.

    Returns
    -------
    list
        [test accuracy (%), precision (%), recall (%), F1 (%), mean CV accuracy (%)];
        accuracy is rounded to 3 decimals, CV accuracy to 2, the rest to integers.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    model.fit(X_train, y_train)
    print("Training Accuracy: ", round(model.score(X_train, y_train)*100,3))
    # Score the held-out set once and reuse the value for printing and returning.
    acc = round(model.score(X_test, y_test)*100,3)
    print("Testing Accuracy: ", acc)

    score = cross_val_score(model, x, y, cv=cv)
    cv_acc = round(np.mean(score)*100, 2)
    print("Model Accuracy for cross validation:", cv_acc)

    pred = model.predict(X_test)
    # One call yields (precision, recall, f-score, support); reuse it for all three metrics.
    ls = precision_recall_fscore_support(y_test, pred, average=average)
    pre, rec, f1 = int(round(ls[0]*100, 0)), int(round(ls[1]*100, 0)), int(round(ls[2]*100, 0))
    print('Precision:', pre)
    print('Recall:', rec)
    print('F1_Score:', f1)
    print('*---------------------------*')
    return [acc, pre, rec, f1, cv_acc]

# Classifiers to compare on the extracted features (seeded where the estimator allows it).
# NOTE(review): ExtraTreeClassifier is a single randomised tree -- confirm the
# ensemble ExtraTreesClassifier was not the one intended.
classifiers = [
    KNeighborsClassifier(),
    SVC(random_state=31),
    RandomForestClassifier(random_state=31),
    MLPClassifier(random_state=31, max_iter=600),
    ExtraTreeClassifier(random_state=31),
    GaussianNB(),
]

# One result list per metric, filled in the order the classifiers are evaluated.
ls_acc, ls_pre, ls_rec, ls_f1, ls_cv = [], [], [], [], []
metric_lists = (ls_acc, ls_pre, ls_rec, ls_f1, ls_cv)

for classifier in classifiers:
    print(classifier)
    values = classify(classifier, feat, y)

    # classify returns [acc, pre, rec, f1, cv]; route each value to its list.
    for metric_list, metric_value in zip(metric_lists, values):
        metric_list.append(metric_value)
    
# Print every metric in one flat list: accuracies, precisions, recalls, F1 scores, CV accuracies.
print(ls_acc + ls_pre + ls_rec + ls_f1 + ls_cv)

KNeighborsClassifier()
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  94.67
Testing Accuracy:  93.683
Model Accuracy for cross validation: 93.24
Precision: 94
Recall: 94
F1_Score: 94
*---------------------------*
SVC(random_state=31)
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  87.567
Testing Accuracy:  88.865
Model Accuracy for cross validation: 87.56
Precision: 89
Recall: 89
F1_Score: 89
*---------------------------*
RandomForestClassifier(random_state=31)
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  100.0
Testing Accuracy:  93.148
Model Accuracy for cross validation: 91.71
Precision: 93
Recall: 93
F1_Score: 93
*---------------------------*
MLPClassifier(max_iter=600, random_state=31)
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  99.822
Testing Accuracy:  97.109
Model Accuracy for cross validation: 96.3
Precision: 97
Recall: 97
F1_Score: 97
*---------------------------*
ExtraTreeClassifier(random_state=31)
(8405, 512) (8405,)
(934, 512