In [13]:
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse
from itertools import islice
import os
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
a_matrix_train = scipy.sparse.load_npz('a_matrix_train.npz')
a_matrix_test = scipy.sparse.load_npz('a_test_matrix.npz')

In [3]:
aa_train = pd.read_csv("train_aa_kernel.csv", header=0)
aa_train = aa_train.set_index(aa_train['Unnamed: 0'].values).drop(['Unnamed: 0','predictions'], axis=1)
aa_test = pd.read_csv("test_aa_kernel.csv", header=0)
aa_test = aa_test.set_index(aa_test['Unnamed: 0'].values).drop('Unnamed: 0', axis=1)

In [7]:
def malware_app_paths():
    type_of_malware = defaultdict(list)
    malware_loc = []
    dir_list = os.listdir("/datasets/dsc180a-wi20-public/Malware/amd_data_smali")
    for i in dir_list:
        string = '/datasets/dsc180a-wi20-public/Malware/amd_data_smali/' + i
        lis = os.listdir(string)
        for variety in lis:
            new_string = string + '/' + variety
            app_list = os.listdir(new_string)
            for app in app_list:
                final_str = new_string + '/' + app
                malware_loc.append(final_str)
                type_of_malware[i].append(app)
    return malware_loc, type_of_malware

In [8]:
malware_path, type_of_malware = malware_app_paths()

In [27]:
list_app = aa_train[aa_train['type'] == 0]['app_name']
type_dic = {}
for app in list_app:
    for i in type_of_malware.keys():
        if app in type_of_malware[i]:
            type_dic[app] = i
    
aa_train['type_of_malware'] = aa_train['app_name'].apply(lambda x: type_dic[x] if x in type_dic.keys() else None)
aa_train['type_of_malware'].value_counts()

FakeInst        428
BankBot         133
Youmi           118
Lotoor           76
RuMMS            65
Minimob          42
SimpleLocker     35
Kyview           30
GingerMaster     22
DroidKungFu      18
Koler            14
Roop             10
Andup            10
Mtk              10
Nandrobox         9
Ksapp             8
Boxer             6
Stealer           5
FakeDoc           5
VikingHorde       4
Gorpo             4
Vidro             3
Fjcon             3
SmsZombie         3
Obad              3
Vmvol             3
Steek             2
FakeAngry         2
Kemoge            1
MobileTX          1
SpyBubble         1
FakeAV            1
Name: type_of_malware, dtype: int64

In [11]:
list_app = aa_test[aa_test['type'] == 0]['app_name']
type_dic = {}
for app in list_app:
    for i in type_of_malware.keys():
        if app in type_of_malware[i]:
            type_dic[app] = i
    
aa_test['type_of_malware'] = aa_test['app_name'].apply(lambda x: type_dic[x] if x in type_dic.keys() else None)
aa_test['type_of_malware'].value_counts()

FakeInst        78
BankBot         23
RuMMS           16
Lotoor          15
Youmi           11
GingerMaster     9
DroidKungFu      8
Kyview           6
Mtk              5
Nandrobox        4
Roop             4
SimpleLocker     4
Andup            2
Koler            2
Stealer          2
Ksapp            2
Gorpo            2
Boxer            1
Vidro            1
SpyBubble        1
VikingHorde      1
Minimob          1
Name: type_of_malware, dtype: int64

In [12]:
kernel = aa_train.drop(['app_name','type', 'type_of_malware'], axis=1).values
y = aa_train['type'].values

In [14]:
def get_results(dic, y_true, y_pred):
    dic['accuracy'] = accuracy_score(y_true, y_pred)
    dic['precision'] = precision_score(y_true, y_pred)
    dic['recall'] = recall_score(y_true, y_pred)
    dic['f1'] = f1_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    dic['tp'] = float(tp)
    dic['fp'] = float(fp)
    dic['fn'] = float(fn)
    dic['tn'] = float(tn)
    return dic

In [15]:
dic = {
        "accuracy": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "tp": 0.0,
        "fp": 0.0,
        "fn": 0.0,
        "tn": 0.0
}

In [16]:
clf = LinearSVC(random_state=0, max_iter=10000, tol=1e-5)
clf.fit(kernel, y)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [18]:
get_results(dic, aa_test['type'].values, clf.predict(aa_test.drop(['app_name','type', 'type_of_malware'], axis=1).values))

{'accuracy': 0.9540229885057471,
 'precision': 0.9652777777777778,
 'recall': 0.9266666666666666,
 'f1': 0.9455782312925171,
 'tp': 139.0,
 'fp': 5.0,
 'fn': 11.0,
 'tn': 193.0}

In [22]:
predict_y = clf.predict(aa_test.drop(['app_name','type', 'type_of_malware'], axis=1).values)
test_y = aa_test['type'].values
tp_index = []
for i in range(len(predict_y)):
    if predict_y[i] != test_y[i] and predict_y[i] == 1:
        tp_index.append(str(i))

In [23]:
tp_index

['172', '232', '274', '288', '322']

In [25]:
aa_test.iloc[tp_index]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2172,2173,2174,2175,2176,2177,2178,app_name,type,type_of_malware
172,365.0,4614.0,724.0,4179.0,84.0,3543.0,2093.0,1418.0,437.0,4155.0,...,76.0,155.0,47.0,42.0,50.0,50.0,756.0,790b65d3fba5e125068602f48f5791b3,0,Koler
232,568.0,497.0,806.0,789.0,179.0,593.0,880.0,258.0,649.0,1231.0,...,117.0,235.0,88.0,57.0,70.0,70.0,347.0,c2d2657b66305c8c2a11f09c8fc8d933,0,Nandrobox
274,590.0,436.0,743.0,723.0,165.0,495.0,696.0,232.0,643.0,1709.0,...,117.0,232.0,89.0,59.0,67.0,67.0,316.0,5d1f6c7007301a54e382991732feffdb,0,Kyview
288,591.0,439.0,748.0,727.0,165.0,498.0,699.0,232.0,645.0,1713.0,...,116.0,232.0,89.0,58.0,66.0,66.0,318.0,227116ba8b0a5ec0692c5fe403a99e91,0,Kyview
322,2911.0,1601.0,3060.0,2140.0,179.0,2096.0,1927.0,959.0,2265.0,2812.0,...,129.0,290.0,97.0,66.0,71.0,71.0,650.0,327f6a001c81e04c1786b222a38cd19f,0,Kyview
