__Author: Christian Camilo Urcuqui López__

__Date: 3 May 2016__

In [27]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
#from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn import tree
import pandas as pd
from sklearn.externals import joblib
import pickle

In [23]:
df = pd.read_csv('../../datasets/android_permissions.csv', header=0, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Columns: 331 entries, android to type
dtypes: int64(331)
memory usage: 1.0 MB


In [61]:
df = df.astype(int).copy()

In [62]:
df.head(2)

Unnamed: 0,android,android.app.cts.permission.TEST_GRANTED,android.intent.category.MASTER_CLEAR.permission.C2D_MESSAGE,android.os.cts.permission.TEST_GRANTED,android.permission.ACCESS_ALL_DOWNLOADS,android.permission.ACCESS_ALL_EXTERNAL_STORAGE,android.permission.ACCESS_BLUETOOTH_SHARE,android.permission.ACCESS_CACHE_FILESYSTEM,android.permission.ACCESS_CHECKIN_PROPERTIES,android.permission.ACCESS_COARSE_LOCATION,...,com.android.voicemail.permission.WRITE_VOICEMAIL,com.foo.mypermission,com.foo.mypermission2,org.chromium.chrome.shell.permission.C2D_MESSAGE,org.chromium.chrome.shell.permission.DEBUG,org.chromium.chrome.shell.permission.SANDBOX,org.chromium.chromecast.shell.permission.SANDBOX,org.chromium.content_shell.permission.SANDBOX,test_permission,type
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [63]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:330], df['type'], test_size=0.20, random_state=42)

### Naive Bayes algorithm

In [64]:
# Naive Bayes algorithm
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# pred
pred = gnb.predict(X_test)

# accuracy
accuracy = accuracy_score(pred, y_test)
print("naive_bayes")
print(accuracy)
print(classification_report(pred, y_test, labels=None))


naive_bayes
0.8375
             precision    recall  f1-score   support

          0       0.91      0.76      0.83        41
          1       0.78      0.92      0.85        39

avg / total       0.85      0.84      0.84        80



### KNeighborsClassifier

In [65]:
# kneighbors algorithm
neigh = KNeighborsClassifier(n_neighbors=2)
neigh_two = KNeighborsClassifier(n_neighbors=3)
neigh_three = KNeighborsClassifier(n_neighbors=4)
neigh_four = KNeighborsClassifier(n_neighbors=6)

neigh.fit(X_train, y_train)
neigh_two.fit(X_train, y_train)
neigh_three.fit(X_train, y_train)
neigh_four.fit(X_train, y_train)

pred = neigh.predict(X_test)
pred_two = neigh_two.predict(X_test)
pred_three = neigh_three.predict(X_test)
pred_four = neigh_four.predict(X_test)
# accuracy
accuracy = accuracy_score(pred, y_test)
accuracy_two = accuracy_score(pred_two, y_test)
accuracy_three = accuracy_score(pred_three, y_test)
accuracy_four = accuracy_score(pred_four, y_test)

print("kneighbors 2")
print(accuracy)
print(classification_report(pred, y_test, labels=None))
print("")
print("kneighbors 3")
print(accuracy_two)
print(classification_report(pred_two, y_test, labels=None))
print("")
print("kneighbors 4")
print(accuracy_three)
print(classification_report(pred_three, y_test, labels=None))
print("kneighbors 6")
print(accuracy_four)
print(classification_report(pred_four, y_test, labels=None))


kneighbors 2
0.9
             precision    recall  f1-score   support

          0       0.97      0.82      0.89        40
          1       0.85      0.97      0.91        40

avg / total       0.91      0.90      0.90        80


kneighbors 3
0.8875
             precision    recall  f1-score   support

          0       0.94      0.82      0.88        39
          1       0.85      0.95      0.90        41

avg / total       0.89      0.89      0.89        80


kneighbors 4
0.85
             precision    recall  f1-score   support

          0       0.94      0.76      0.84        42
          1       0.78      0.95      0.86        38

avg / total       0.87      0.85      0.85        80

kneighbors 6
0.85
             precision    recall  f1-score   support

          0       0.94      0.76      0.84        42
          1       0.78      0.95      0.86        38

avg / total       0.87      0.85      0.85        80



### Decision Tree

In [71]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Read the csv test file

pred = clf.predict(X_test)
# accuracy
accuracy = accuracy_score(pred, y_test)
print(clf)
print(accuracy)
print(classification_report(pred, y_test, labels=None))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.95
             precision    recall  f1-score   support

          0       0.97      0.92      0.94        36
          1       0.93      0.98      0.96        44

avg / total       0.95      0.95      0.95        80



### Neural network


In [36]:
from torch import nn
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Inputs to hidden layer linear transformation
        self.fc1 = nn.Linear(331, 128)
        self.fc2 = nn.Linear(128, 64)
        # output layer
        self.fc3 = nn.Linear(64, 1)
    def forward(self, x):
        # Hidden layer with ReLu activation
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Output layer with softmax activation
        x = F.softmax(self.fc3(x), dim=1)
        
        return x

In [37]:
model = Network()
model

Network(
  (fc1): Linear(in_features=331, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [67]:
import torch.optim as optim
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.003)

epochs = 5
position = 0

for e in range(epochs):
    running_loss = 0
    model.train()
    for data in X_train:
            
        # TODO: Training pass
        optimizer.zero_grad()
        print(data)
        output =  model(X_train[:position])        
        loss = criterion(output, y_train[position])        
        loss.backward()
        optimizer.step()
        position += 1
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(trainloader)}")

android.app.cts.permission.TEST_GRANTED


AttributeError: 'DataFrame' object has no attribute 'dim'