In [1]:
#importing required libraries
import os
import glob
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
#importing keras libraries
import keras
from keras.models import Model, Sequential
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.layers import Conv2D, Activation, Flatten, Dense

#importing sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

In [2]:
#directory path for full dataset
imagedir = "/home/sanjeev/ML_Dataset/Malimg_data/"

In [3]:
cur_dir = os.getcwd()
os.chdir(imagedir)  # the parent folder with sub-folders

# Get number of samples per family
list_fams = sorted(os.listdir(os.getcwd()), key=str.lower)  # vector of strings with family names
no_imgs = []  # No. of samples per family
for i in range(len(list_fams)):
    os.chdir(list_fams[i])
    len1 = len(glob.glob('*.png'))  # assuming the images are stored as 'png'
    no_imgs.append(len1)
    os.chdir('..')
num_samples = np.sum(no_imgs)  # total number of all samples

# Compute the labels
y = np.zeros(num_samples)
pos = 0
label = 0
for i in no_imgs:
    print ("Label:%2d\tFamily: %15s\tNumber of images: %d" % (label, list_fams[label], i))
    for j in range(i):
        y[pos] = label
        pos += 1
    label += 1
num_classes = label

Label: 0	Family:       Adialer.C	Number of images: 122
Label: 1	Family:       Agent.FYI	Number of images: 116
Label: 2	Family:       Allaple.A	Number of images: 2949
Label: 3	Family:       Allaple.L	Number of images: 1591
Label: 4	Family:   Alueron.gen!J	Number of images: 198
Label: 5	Family:       Autorun.K	Number of images: 106
Label: 6	Family:     C2LOP.gen!g	Number of images: 200
Label: 7	Family:         C2LOP.P	Number of images: 146
Label: 8	Family:  Dialplatform.B	Number of images: 177
Label: 9	Family:       Dontovo.A	Number of images: 162
Label:10	Family:        Fakerean	Number of images: 381
Label:11	Family:   Instantaccess	Number of images: 431
Label:12	Family:      Lolyda.AA1	Number of images: 213
Label:13	Family:      Lolyda.AA2	Number of images: 184
Label:14	Family:      Lolyda.AA3	Number of images: 123
Label:15	Family:       Lolyda.AT	Number of images: 159
Label:16	Family:     Malex.gen!J	Number of images: 136
Label:17	Family:   Obfuscator.AD	Number of images: 142
Label:18

In [4]:
width, height, channels = (224, 224, 3) #image input shape
X = np.zeros((num_samples, width, height, channels))
cnt = 0
paths_list = []
print("Processing images...")
for i in range(len(list_fams)):
    for img_file in glob.glob(list_fams[i]+'/*.png'):
        paths_list.append(os.path.join(os.getcwd(),img_file))
        img = load_img(img_file, target_size=(224, 224))
        x = img_to_array(img) #image to array
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        X[cnt] = x
        cnt += 1
print("Images processed: %d" %(cnt))

Processing images...
Images processed: 9339


In [5]:
X.shape, y.shape

((9339, 224, 224, 3), (9339,))

In [6]:
y

array([ 0.,  0.,  0., ..., 24., 24., 24.])

In [7]:
incV3 = Sequential()
incV3.add(Conv2D(3, (3, 3), padding='same', input_shape=(224, 224, 3)))
incV3.add(Activation('relu'))

_incV3 = InceptionV3(include_top = False)

counter=0
for layer in _incV3.layers:
    layer.trainable = False
    counter+=1

print("Inception's ", counter , " layers are not added to the layer")
incV3.add(_incV3)
print("done")

Inception's  311  layers are not added to the layer
done


In [8]:
incV3.add(Flatten())

incV3.add(Dense(2048, activation='relu'))
incV3.add(Dense(512, activation="relu"))

incV3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [9]:
incV3.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 224, 224, 3)       84        
                                                                 
 activation (Activation)     (None, 224, 224, 3)       0         
                                                                 
 inception_v3 (Functional)   (None, None, None, 2048)  21802784  
                                                                 
 flatten (Flatten)           (None, 51200)             0         
                                                                 
 dense (Dense)               (None, 2048)              104859648 
                                                                 
 dense_1 (Dense)             (None, 512)               1049088   
                                                                 
Total params: 127,711,604
Trainable params: 105,908,820


In [10]:
%%time
#extracting features
features = incV3.predict(X)

CPU times: user 36min 2s, sys: 11min 55s, total: 47min 58s
Wall time: 1min 20s


In [11]:
features.shape

(9339, 512)

In [12]:
# features[0]

In [13]:
#np.save('/home/sanjeev/DL_Exp_Kajal/Finetune_stack_features/MalImg/inceptionV3_feat_malimg_512.npy', features)

In [14]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [15]:
feat = np.load("/home/sanjeev/DL_Exp_Kajal/Finetune_stack_features/MalImg/inceptionV3_feat_malimg_512.npy")

In [16]:
y

array([ 0.,  0.,  0., ..., 24., 24., 24.])

In [17]:
#classification model creation using different classifiers
def classify(model, x, y):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=31)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    
    start_time = time.time()
    
    model.fit(X_train, y_train)   
    print("Training Accuracy: ", round(model.score(X_train, y_train)*100,3))
    
    end_time = time.time()
    training_time = end_time - start_time 
    print("Training Time:", training_time, "seconds")
    ###################################################################
    start_time = time.time() 
    print("Testing Accuracy: ", round(model.score(X_test, y_test)*100,3))
    end_time = time.time()
    training_time = end_time - start_time 
    print("Testing Time:", training_time, "seconds")
    ####################################################################
    acc = round(model.score(X_test, y_test)*100,3)
    
    score = cross_val_score(model, x, y, cv=5)
    print("Model Accuracy for cross validation:", round(np.mean(score)*100, 2))
    cv = round(np.mean(score)*100, 2)
    
    pred = model.predict(X_test)
    print('Precision:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[0]*100, 0)))
    print('Recall:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[1]*100, 0)))
    print('F1_Score:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[2]*100, 0)))
    ls = precision_recall_fscore_support(y_test, pred, average='micro')
    pre, rec, f1 = int(round(ls[0]*100, 0)), int(round(ls[1]*100, 0)), int(round(ls[2]*100, 0))
    print('*---------------------------*')
    return [acc, pre, rec, f1, cv]

classifiers = [KNeighborsClassifier(),
               SVC(random_state=31),
               RandomForestClassifier(random_state=31),
               MLPClassifier(random_state=31, max_iter=500),
               ExtraTreeClassifier(random_state=31),
               GaussianNB()]

ls_acc, ls_pre, ls_rec, ls_f1, ls_cv = [],[],[],[],[]

for classifier in classifiers:
    print(classifier)
    values = classify(classifier, feat, y)       
    
    ls_acc.append(values[0])
    ls_pre.append(values[1])
    ls_rec.append(values[2])
    ls_f1.append(values[3])
    ls_cv.append(values[4])
    
print([*ls_acc, *ls_pre, *ls_rec, *ls_f1, *ls_cv])

KNeighborsClassifier()
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  95.907
Training Time: 3.01770281791687 seconds
Testing Accuracy:  93.362
Testing Time: 0.47214794158935547 seconds
Model Accuracy for cross validation: 93.62
Precision: 93
Recall: 93
F1_Score: 93
*---------------------------*
SVC(random_state=31)
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  96.978
Training Time: 17.62131643295288 seconds
Testing Accuracy:  95.075
Testing Time: 1.3893632888793945 seconds
Model Accuracy for cross validation: 94.64
Precision: 95
Recall: 95
F1_Score: 95
*---------------------------*
RandomForestClassifier(random_state=31)
(8405, 512) (8405,)
(934, 512) (934,)
Training Accuracy:  100.0
Training Time: 11.646068096160889 seconds
Testing Accuracy:  93.041
Testing Time: 0.04771304130554199 seconds
Model Accuracy for cross validation: 92.39
Precision: 93
Recall: 93
F1_Score: 93
*---------------------------*
MLPClassifier(max_iter=500, random_state=31)
(8405, 512) (8405,

In [18]:
# #classification model creation using different classifiers
# def classify(model, x, y):
#     X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=31)
#     print(X_train.shape, y_train.shape)
#     print(X_test.shape, y_test.shape)

#     model.fit(X_train, y_train)   
#     print("Training Accuracy: ", round(model.score(X_train, y_train)*100,3))
#     print("Testing Accuracy: ", round(model.score(X_test, y_test)*100,3))
#     acc = round(model.score(X_test, y_test)*100,3)
    
#     score = cross_val_score(model, x, y, cv=5)
#     print("Model Accuracy for cross validation:", round(np.mean(score)*100, 2))
#     cv = round(np.mean(score)*100, 2)
    
#     pred = model.predict(X_test)
#     print('Precision:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[0]*100, 0)))
#     print('Recall:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[1]*100, 0)))
#     print('F1_Score:', int(round(precision_recall_fscore_support(y_test, pred, average='micro')[2]*100, 0)))
#     ls = precision_recall_fscore_support(y_test, pred, average='micro')
#     pre, rec, f1 = int(round(ls[0]*100, 0)), int(round(ls[1]*100, 0)), int(round(ls[2]*100, 0))
#     print('*---------------------------*')
#     return [acc, pre, rec, f1, cv]

# classifiers = [KNeighborsClassifier(),
#                SVC(random_state=31),
#                RandomForestClassifier(random_state=31),
#                MLPClassifier(random_state=31, max_iter=500),
#                ExtraTreeClassifier(random_state=31),
#                GaussianNB()]

# ls_acc, ls_pre, ls_rec, ls_f1, ls_cv = [],[],[],[],[]

# for classifier in classifiers:
#     print(classifier)
#     values = classify(classifier, feat, y)       
    
#     ls_acc.append(values[0])
#     ls_pre.append(values[1])
#     ls_rec.append(values[2])
#     ls_f1.append(values[3])
#     ls_cv.append(values[4])
    
# print([*ls_acc, *ls_pre, *ls_rec, *ls_f1, *ls_cv])