In [2]:
import regex as re
from collections import defaultdict
import time
from scipy.sparse import *

import numpy as np
from sklearn.svm import SVC
from tqdm import tqdm
import os 
import glob
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import *
import json
import matplotlib.pyplot as plt
import pickle
%matplotlib inline



import sys
sys.path.insert(1, '../src')
import make_dataset
import model 
import build_features
import multi_kernel




# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name this
# installation actually provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)



In [3]:
import importlib

# Re-import the local helper modules so that edits made to their source files
# are picked up without restarting the kernel.
for edited_module in (make_dataset, build_features):
    importlib.reload(edited_module)

# Kept as the final expression so the cell still displays the reloaded module.
importlib.reload(model)

<module 'model' from '../src/model.py'>

## Feature Extraction

In [4]:
## Set to False to rebuild the intermediate structures from the raw apps;
## leave True to load the copies already saved under ../test-data/processed.
train_created = True

dir_list = '../test-data'
benign_paths = make_dataset.benign_app_paths_test(dir_list)

if not train_created:

    malware_path, type_of_malware = make_dataset.malware_app_paths()

    ## Training sample: the first 2 benign paths plus the first 3 malware paths
    path = benign_paths[:2] + malware_path[:3]

    ## Creates 3 different structures that will be used to create the matrices
    apps_dic, code_block_dic, package_dic = make_dataset.clean_data(path)

    ## Gets all the unique API's in the data structure
    api_list_inter, app_list = make_dataset.unique_api_apps(apps_dic)

    ## Creates an intermediate A structure to find the count of each API
    a_matrix_inter = build_features.create_a_matrix(
        app_list, api_list_inter, apps_dic
    )

    ## Gets the index of all API's that occur in fewer than n apps
    ## (presumably n is the last argument, 2 -- confirm in make_dataset)
    extra_api_list = make_dataset.get_index_of_api(
        a_matrix_inter, api_list_inter, 2
    )

    ## Removes all of those API's from the 3 data structures
    apps_dic, code_block_dic, package_dic = make_dataset.remove_apis(
        extra_api_list, apps_dic, code_block_dic, package_dic
    )

    ## Gets the new list of unique API's
    api_list, app_list = make_dataset.unique_api_apps(apps_dic)

    ## Saves these data structures again
    name_list = ['data', 'app_api.json', 'code_block.json', 'lib.json']
    make_dataset.save_structures(apps_dic, code_block_dic, package_dic, name_list)

    ## Saves unique api_list
    ## NOTE(review): this writes under 'data/processed', while the load branch
    ## below reads from '../test-data/processed' -- confirm which is intended.
    api_path = 'data/processed/unique_api.text'
    ## Make sure the target folder exists before writing into it
    Path(api_path).parent.mkdir(parents=True, exist_ok=True)
    ## Open the path held in the variable, not a file literally named "api_path"
    with open(api_path, "wb") as fp:
        pickle.dump(api_list, fp)

else:
    ## Load the previously saved structures; each `with` block closes its file
    with open('../test-data/processed/app_to_api.json') as f:
        apps_dic = json.load(f)

    with open('../test-data/processed/code_block.json') as f:
        code_block_dic = json.load(f)

    with open('../test-data/processed/library_dic.json') as f:
        package_dic = json.load(f)

    ## NOTE(review): pickle can execute arbitrary code while loading; only read
    ## this file if it was produced by this project.
    api_path = '../test-data/processed/unique_api.text'
    with open(api_path, "rb") as fp:
        api_list = pickle.load(fp)

    app_list = list(apps_dic.keys())

1 done
2 done
3 done


## Building Features

In [19]:
## True  -> reuse the matrices already saved on disk
## False -> rebuild them from the structures loaded above and save them
matrix_created = True

if matrix_created:
    ## Loads the A, B and P matrices
    a_matrix = load_npz("../test-data/matrix/a_matrix.npz")
    b_matrix = load_npz("../test-data/matrix/b_matrix.npz")
    p_matrix = load_npz("../test-data/matrix/p_matrix.npz")

else:
    ## Creates the A matrix
    a_matrix = build_features.create_a_matrix(app_list, api_list, apps_dic)

    ## Creates the B matrix
    b_matrix = build_features.create_b_matrix(code_block_dic, api_list)

    ## Creates the P matrix
    p_matrix = build_features.create_p_matrix(package_dic, api_list)

    ## Saves all 3 matrices
    name_list = ['data', 'a_matrix', 'b_matrix', 'p_matrix']
    build_features.save_features(a_matrix, b_matrix, p_matrix, name_list)

### Test data 

In [21]:
## Set to False to rebuild the test-set structure from the raw apps;
## leave True to load the copy saved under ../test-data/processed.
test_structure_created = True

## Benign apps after the first two, which the training sample takes
test_benign_paths = benign_paths[2:]

if not test_structure_created:

    ## Fetch the malware paths here so this branch also works when the
    ## training structures were loaded from disk rather than rebuilt
    ## (in that case `malware_path` was never defined earlier).
    malware_path, type_of_malware = make_dataset.malware_app_paths()

    ## Keeps only the malware apps whose final path component is not already
    ## in the training app list
    trained_apps = set(app_list)
    malware_apps = [
        candidate for candidate in malware_path
        if candidate.split('/')[-1] not in trained_apps
    ]

    malware_test_paths = malware_apps[:2]

    ## Gets all the paths
    test_paths = test_benign_paths[:2] + malware_test_paths

    ## Creates structure for the test set
    test_apps_dic = make_dataset.get_data_test(test_paths)

    ## Saves the new test structure created
    test_name_list = ['data', 'test_app_api.json']
    make_dataset.test_save_structures(test_apps_dic, test_name_list)
    app_list_test = list(test_apps_dic.keys())

else:
    ## The `with` block closes the file; no explicit close() is needed
    with open('../test-data/processed/test_app_api.json') as f:
        test_apps_dic = json.load(f)
    app_list_test = list(test_apps_dic.keys())
    

In [22]:
## False -> build the test-set A matrix now; True -> load a saved copy
a_test_created = False

if not a_test_created:
    ## Creates the A matrix for the test set
    a_test_matrix = build_features.create_a_matrix_test(
        app_list_test, api_list, test_apps_dic
    )
else:
    ## Loads in the A matrix for the test set
    a_test_matrix = load_npz("../../a_test_matrix.npz")