__Author: Christian Camilo Urcuqui López__

__Date: 3 May 2016__

In [8]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import torch
#from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn import tree
import pandas as pd
from sklearn.externals import joblib
import pickle
import numpy as np
import seaborn as sns
from joblib import dump, load

In [2]:
df = pd.read_csv('../../datasets/android_permissions.csv', header=0, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Columns: 331 entries, android to type
dtypes: int64(331)
memory usage: 1.0 MB


In [3]:
df = df.astype(int).copy()

In [62]:
df.head(2)

Unnamed: 0,android,android.app.cts.permission.TEST_GRANTED,android.intent.category.MASTER_CLEAR.permission.C2D_MESSAGE,android.os.cts.permission.TEST_GRANTED,android.permission.ACCESS_ALL_DOWNLOADS,android.permission.ACCESS_ALL_EXTERNAL_STORAGE,android.permission.ACCESS_BLUETOOTH_SHARE,android.permission.ACCESS_CACHE_FILESYSTEM,android.permission.ACCESS_CHECKIN_PROPERTIES,android.permission.ACCESS_COARSE_LOCATION,...,com.android.voicemail.permission.WRITE_VOICEMAIL,com.foo.mypermission,com.foo.mypermission2,org.chromium.chrome.shell.permission.C2D_MESSAGE,org.chromium.chrome.shell.permission.DEBUG,org.chromium.chrome.shell.permission.SANDBOX,org.chromium.chromecast.shell.permission.SANDBOX,org.chromium.content_shell.permission.SANDBOX,test_permission,type
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Exploratory Data Analysis

Let's get the top 10 of permissions that are used for malware

In [17]:
df[df.type==1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 198
Columns: 331 entries, android to type
dtypes: int32(331)
memory usage: 258.9 KB


In [138]:
pd.Series.sort_values(df[df.type==1].sum(axis=0), ascending=False)[:10]

type                                         199
android.permission.INTERNET                  195
android.permission.READ_PHONE_STATE          190
android.permission.ACCESS_NETWORK_STATE      167
android.permission.WRITE_EXTERNAL_STORAGE    136
android.permission.ACCESS_WIFI_STATE         135
android.permission.READ_SMS                  124
android.permission.WRITE_SMS                 104
android.permission.RECEIVE_BOOT_COMPLETED    102
android.permission.ACCESS_COARSE_LOCATION     80
dtype: int64

In [139]:
pd.Series.sort_values(df[df.type==0].sum(axis=0), ascending=False)[:10]

android.permission.INTERNET                  104
android.permission.WRITE_EXTERNAL_STORAGE     76
android.permission.ACCESS_NETWORK_STATE       62
android.permission.WAKE_LOCK                  36
android.permission.RECEIVE_BOOT_COMPLETED     30
android.permission.ACCESS_WIFI_STATE          29
android.permission.READ_PHONE_STATE           24
android.permission.VIBRATE                    21
android.permission.ACCESS_FINE_LOCATION       18
android.permission.READ_EXTERNAL_STORAGE      15
dtype: int64

## Machine Learning Analysis

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:330], df['type'], test_size=0.20, random_state=42)

In [5]:
X_train.info(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 3 to 102
Data columns (total 329 columns):
android.app.cts.permission.TEST_GRANTED                          int32
android.intent.category.MASTER_CLEAR.permission.C2D_MESSAGE      int32
android.os.cts.permission.TEST_GRANTED                           int32
android.permission.ACCESS_ALL_DOWNLOADS                          int32
android.permission.ACCESS_ALL_EXTERNAL_STORAGE                   int32
android.permission.ACCESS_BLUETOOTH_SHARE                        int32
android.permission.ACCESS_CACHE_FILESYSTEM                       int32
android.permission.ACCESS_CHECKIN_PROPERTIES                     int32
android.permission.ACCESS_COARSE_LOCATION                        int32
android.permission.ACCESS_CONTENT_PROVIDERS_EXTERNALLY           int32
android.permission.ACCESS_DOWNLOAD_MANAGER                       int32
android.permission.ACCESS_DOWNLOAD_MANAGER_ADVANCED              int32
android.permission.ACCESS_DRM_CERTIFICATES

### Naive Bayes algorithm

In [64]:
# Naive Bayes algorithm
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# pred
pred = gnb.predict(X_test)

# accuracy
accuracy = accuracy_score(pred, y_test)
print("naive_bayes")
print(accuracy)
print(classification_report(pred, y_test, labels=None))


naive_bayes
0.8375
             precision    recall  f1-score   support

          0       0.91      0.76      0.83        41
          1       0.78      0.92      0.85        39

avg / total       0.85      0.84      0.84        80



### KNeighborsClassifier

In [65]:
# kneighbors algorithm
neigh = KNeighborsClassifier(n_neighbors=2)
neigh_two = KNeighborsClassifier(n_neighbors=3)
neigh_three = KNeighborsClassifier(n_neighbors=4)
neigh_four = KNeighborsClassifier(n_neighbors=6)

neigh.fit(X_train, y_train)
neigh_two.fit(X_train, y_train)
neigh_three.fit(X_train, y_train)
neigh_four.fit(X_train, y_train)

pred = neigh.predict(X_test)
pred_two = neigh_two.predict(X_test)
pred_three = neigh_three.predict(X_test)
pred_four = neigh_four.predict(X_test)
# accuracy
accuracy = accuracy_score(pred, y_test)
accuracy_two = accuracy_score(pred_two, y_test)
accuracy_three = accuracy_score(pred_three, y_test)
accuracy_four = accuracy_score(pred_four, y_test)

print("kneighbors 2")
print(accuracy)
print(classification_report(pred, y_test, labels=None))
print("")
print("kneighbors 3")
print(accuracy_two)
print(classification_report(pred_two, y_test, labels=None))
print("")
print("kneighbors 4")
print(accuracy_three)
print(classification_report(pred_three, y_test, labels=None))
print("kneighbors 6")
print(accuracy_four)
print(classification_report(pred_four, y_test, labels=None))


kneighbors 2
0.9
             precision    recall  f1-score   support

          0       0.97      0.82      0.89        40
          1       0.85      0.97      0.91        40

avg / total       0.91      0.90      0.90        80


kneighbors 3
0.8875
             precision    recall  f1-score   support

          0       0.94      0.82      0.88        39
          1       0.85      0.95      0.90        41

avg / total       0.89      0.89      0.89        80


kneighbors 4
0.85
             precision    recall  f1-score   support

          0       0.94      0.76      0.84        42
          1       0.78      0.95      0.86        38

avg / total       0.87      0.85      0.85        80

kneighbors 6
0.85
             precision    recall  f1-score   support

          0       0.94      0.76      0.84        42
          1       0.78      0.95      0.86        38

avg / total       0.87      0.85      0.85        80



### Decision Tree

In [5]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Read the csv test file

pred = clf.predict(X_test)
# accuracy
accuracy = accuracy_score(pred, y_test)
print(clf)
print(accuracy)
print(classification_report(pred, y_test, labels=None))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.95
             precision    recall  f1-score   support

          0       0.97      0.92      0.94        36
          1       0.93      0.98      0.96        44

avg / total       0.95      0.95      0.95        80



In [9]:
#we are going to save the model 
dump(clf, 'defense.joblib') 

['defense.joblib']

In [10]:
# lets load and prove it again
clf = load('defense.joblib') 
pred = clf.predict(X_test)
# accuracy
accuracy = accuracy_score(pred, y_test)
print(clf)
print(accuracy)
print(classification_report(pred, y_test, labels=None))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.95
             precision    recall  f1-score   support

          0       0.97      0.92      0.94        36
          1       0.93      0.98      0.96        44

avg / total       0.95      0.95      0.95        80



### Neural network


In [6]:
cat_to_name = {"0": "benign", "1": "malware"}

In [7]:
X_train_tensor = torch.tensor(X_train.values).int()
X_test_tensor = torch.tensor(X_test.values).int()
y_train_tensor = torch.tensor(y_train.values).int()
y_test_tensor = torch.tensor(y_test.values).int()

In [8]:
X_train_tensor = X_train_tensor.reshape(-1, 329)
X_test_tensor = X_test_tensor.reshape(-1, 329)
y_train_tensor = y_train_tensor.reshape(-1, 1)
y_test_tensor = y_test_tensor.reshape(-1, 1)

In [9]:
X_train_tensor

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)

In [10]:
from torch import nn
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Inputs to hidden layer linear transformation
        self.fc1 = nn.Linear(329, 128)
        self.fc2 = nn.Linear(128, 64)
        #self.dout = nn.Dropout(0.2)
        # output layer
        self.fc3 = nn.Linear(64, 1)
        
    def forward(self, x):
        # Hidden layer with ReLu activation
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Output layer with softmax activation        
        x = self.fc3(x)
        return x

In [11]:
model = Network()
model

Network(
  (fc1): Linear(in_features=329, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [12]:
import torch.optim as optim
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

epochs = 5
position = 1
batch_size= 10

for e in range(epochs):
    running_loss = 0
    model.train()
    for beg_i in range(0, X_train_tensor.size(0), batch_size):
        x_batch = X_train_tensor[beg_i:beg_i + batch_size, :]
        y_batch = y_train_tensor[beg_i:beg_i + batch_size, :]
        
        optimizer.zero_grad()
        output =  model(x_batch.float())        
        loss = criterion(output,y_batch.float())        
        loss.backward()
        optimizer.step()
        position += 1
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(X_train_tensor)}")

Training loss: 0.016105705145863618
Training loss: 0.005543735778872978
Training loss: 0.003785331997838832
Training loss: 0.0035040460625054234
Training loss: 0.002874317849874965


In [19]:
test_loss = 0.0
class_correct = list(0 for i in range(2))
class_total = list(0. for i in range(2))
model.eval()
# iterate over test data
for index in range(0, X_test_tensor.size(0)):       
    
    # forward pass: compute predicted outputs by passing inputs to the model
    output = model(X_test_tensor[index].float())
    if output > 0.5 :
        prediction = 1 
    else:
        prediction = 0     
    
    # calculate the batch loss
    loss = criterion(output, y_test_tensor[index].float())
    # update test loss 
    test_loss += loss.item()
    # convert output probabilities to predicted class
    #_, pred = torch.max(output, 1)
    
    # compare predictions to true label
    #correct_tensor = pred.eq(y_test_tensor[index].long())
    #correct = np.squeeze(correct_tensor.numpy())
    
    label = y_test_tensor.data[index].numpy()[0]  
    
    if label == prediction:
        class_correct[label] += 1

    class_total[label] += 1
         
# average test loss
test_loss = test_loss/len(X_test_tensor)
print('Test Loss: {:.6f}\n'.format(test_loss))

for i in range(0,2):
    if class_total[i] > 0:
        print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
            cat_to_name[str(i)], 100 * class_correct[i] / class_total[i],
            np.sum(class_correct[i]), np.sum(class_total[i])))
    else:
        print('Test Accuracy of %5s: N/A (no training examples)' % (pred[i]))

print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
    100. * np.sum(class_correct) / np.sum(class_total),
    np.sum(class_correct), np.sum(class_total)))

Test Loss: 0.063895

Test Accuracy of benign: 97% (33/34)
Test Accuracy of malware: 91% (42/46)

Test Accuracy (Overall): 93% (75/80)
