In [1]:
import torch
from torch.autograd import Variable

In [2]:
import pickle
# want to pickle our corpus
def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` using the highest pickle protocol.

    Args:
        obj: Any picklable Python object.
        name: File stem (without the ``.pkl`` extension), relative to the
            ``obj/`` directory under the current working directory.

    The target directory is created on demand, so the call also works on a
    fresh checkout where ``obj/`` does not exist yet.
    """
    import os  # local import: keeps this cell self-contained

    path = 'obj/' + name + '.pkl'
    # Without this, open() raises FileNotFoundError when obj/ is missing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in ``obj/<name>.pkl``.

    Args:
        name: File stem (without the ``.pkl`` extension), relative to the
            ``obj/`` directory under the current working directory.

    Returns:
        Whatever object was pickled into that file.

    Note:
        Unpickling can execute arbitrary code, so only load files that were
        produced by a trusted source.
    """
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [3]:
# Load the pickled training inputs and labels from obj/trainXs.pkl and
# obj/trainYs.pkl. Both are presumably pandas DataFrames produced by an
# earlier preprocessing step that is not part of this notebook -- TODO confirm.
trainXs = load_obj('trainXs')
trainYs = load_obj('trainYs')

In [4]:
# Remove the 'id' column from both frames; the originals are left untouched.
# XsNoId.shape[1] is what later determines the network's input width, and
# YsNoId supplies the 'malware_category' label column.
YsNoId = trainYs.drop(['id'], axis=1)
XsNoId = trainXs.drop(['id'], axis=1)

In [73]:
import torch.nn as nn
import torch.nn.functional as F

HIDDEN_LAYER_DIM = 200
NUM_OUTPUT_CATS = 15

class Net(nn.Module):
    """Fully connected classifier: fc1 -> ReLU -> fc2 -> ReLU -> fc4 -> log-softmax.

    There is no ``fc3``: a third hidden layer was tried and disabled, and the
    output layer keeps the name ``fc4`` so the printed summary and parameter
    names stay the same.

    Args:
        input_dim: Number of input features. Defaults to ``XsNoId.shape[1]``
            (looked up when the network is constructed), which matches the
            original ``Net()`` call.
        hidden_dim: Width of both hidden layers.
        num_classes: Number of output categories.
    """

    def __init__(self, input_dim=None, hidden_dim=HIDDEN_LAYER_DIM,
                 num_classes=NUM_OUTPUT_CATS):
        super(Net, self).__init__()
        if input_dim is None:
            # Fall back to the notebook-level training frame, as before.
            input_dim = XsNoId.shape[1]
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return log-probabilities over the classes along the last dimension.

        Args:
            x: Float tensor whose last dimension has ``input_dim`` entries.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``num_classes``; ``exp()`` of it sums to 1 along that dimension.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc4(x)
        # An explicit dim is required: the implicit choice is deprecated and
        # picks dim 0 for 3-D input, normalising across the wrong axis.
        return F.log_softmax(x, dim=-1)

In [74]:
# Build the network and print its layer summary as a quick check that the
# layer sizes are what we expect.
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=106, out_features=200)
  (fc2): Linear(in_features=200, out_features=200)
  (fc4): Linear(in_features=200, out_features=15)
)


In [81]:
from torch import optim
# Create the optimizer that updates the network's parameters (currently Adam).
# Alternatives tried earlier are kept commented out for reference:
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# optimizer = optim.Adadelta(net.parameters()) # got this one down to .21 loss
optimizer = optim.Adam(net.parameters(), lr=.0001) # got this one down to .20 loss
# Negative log-likelihood loss: it expects log-probabilities as input, which is
# what Net.forward returns via log_softmax.
criterion = nn.NLLLoss()

In [76]:
import numpy as np
import torch.utils.data as data

class MalwareDataset(data.Dataset):
    """Malware dataset backed by pandas objects.

    Args:
        X: Feature table (one row per sample); must support ``.iloc``.
        Y: Labels aligned row-for-row with ``X``; must support ``.iloc``.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        """Number of samples, i.e. the number of rows in ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as ``(features, label)`` tensors.

        ``idx`` is a position in ``0 .. len(self) - 1``, which is what a
        ``DataLoader`` sampler supplies. Rows are therefore selected by
        position (``.iloc``), not by index label (``.loc``): label lookup
        raises ``KeyError`` or returns the wrong row whenever the frame's
        index is not exactly ``0 .. n-1`` (e.g. after filtering).

        Returns:
            Tuple of a float tensor of shape ``(1, n_features)`` and a long
            tensor of shape ``(1,)``; the leading singleton dimension comes
            from the one-row selection.
        """
        idx = int(idx)
        x_val = torch.from_numpy(np.array(self.X.iloc[[idx]])).float()
        y_val = torch.from_numpy(np.array(self.Y.iloc[[idx]])).long()
        return (x_val, y_val)

# Pair the id-free feature frame with the 'malware_category' label column.
my_dataset = MalwareDataset(XsNoId, YsNoId['malware_category'])

In [77]:
from torch.utils.data import DataLoader

# Training hyperparameters.
epochs = 50          # number of full passes over the training set
log_interval = 10    # print the loss every `log_interval` batches
batch_size = 200     # samples per optimisation step

# Serve the dataset in shuffled mini-batches. drop_last=True discards the final
# incomplete batch, so every batch has exactly `batch_size` samples.
train_loader = torch.utils.data.DataLoader(
    my_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)

In [82]:
# run the main training loop
for epoch in range(epochs):
    # The loop variables are deliberately not called `data`: that name is the
    # alias of the torch.utils.data module and must not be overwritten.
    for batch_idx, (batch_x, batch_y) in enumerate(train_loader):
        # Each dataset item carries a leading singleton dimension (it is built
        # from a one-row selection), so a batch arrives as
        # (batch_size, 1, n_features) with targets of shape (batch_size, 1).
        # Flatten the inputs to (batch_size, n_features) ...
        inputs = Variable(batch_x).view(-1, XsNoId.shape[1])
        # ... and the targets to (batch_size,). view(-1) is used instead of an
        # unqualified squeeze so a batch of one keeps its batch dimension.
        target = Variable(batch_y).view(-1)
        optimizer.zero_grad()
        net_out = net(inputs)
        loss = criterion(net_out, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            # loss.data[0] only works on old PyTorch releases; newer ones
            # expose the scalar through .item().
            loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                           100. * batch_idx / len(train_loader), loss_value))





In [83]:
# Load the pickled test inputs from obj/testXs.pkl. Presumably a DataFrame
# with the same feature columns as trainXs plus 'id' -- TODO confirm.
testXs = load_obj('testXs')

In [84]:
# Keep the ids aside for the submission file and feed only the remaining
# columns to the network.
testIds = testXs['id']
testInputs = testXs.drop(['id'], axis=1)

In [85]:
# Score every test row in one batched forward pass instead of looping with
# iterrows(); the per-row result is the same but this avoids one network call
# (and one pandas row copy) per sample.
test_batch = Variable(torch.from_numpy(np.array(testInputs)).float())
net_out = net(test_batch)
# Index of the max log-probability for each row, as a plain list of ints.
# view(-1).tolist() yields Python ints on both old and new PyTorch releases.
preds = net_out.data.max(1)[1].view(-1).tolist()

# Debug aid: list every prediction that is not class 8, to spot a model that
# has collapsed onto a single class.
for pred in preds:
    # Compare by value: `is not` tests object identity and only appears to
    # work for small ints because of a CPython implementation detail.
    if pred != 8:
        print("not 8", pred)



not 8 10
not 8 13
not 8 0
not 8 10
not 8 5
not 8 12
not 8 4
not 8 10
not 8 10
not 8 1
not 8 10
not 8 11
not 8 12
not 8 13
not 8 2
not 8 12
not 8 3
not 8 12
not 8 0
not 8 12
not 8 12
not 8 12
not 8 10
not 8 10
not 8 6
not 8 11
not 8 10
not 8 3
not 8 0
not 8 12
not 8 1
not 8 12
not 8 10
not 8 10
not 8 0
not 8 0
not 8 11
not 8 2
not 8 10
not 8 12
not 8 3
not 8 0
not 8 12
not 8 4
not 8 7
not 8 10
not 8 13
not 8 13
not 8 12
not 8 12
not 8 12
not 8 10
not 8 10
not 8 10
not 8 5
not 8 10
not 8 12
not 8 10
not 8 10
not 8 11
not 8 12
not 8 12
not 8 10
not 8 11
not 8 12
not 8 4
not 8 10
not 8 1
not 8 12
not 8 10
not 8 0
not 8 10
not 8 2
not 8 12
not 8 13
not 8 10
not 8 5
not 8 10
not 8 12
not 8 10
not 8 13
not 8 10
not 8 12
not 8 5
not 8 12
not 8 4
not 8 5
not 8 10
not 8 0
not 8 7
not 8 10
not 8 10
not 8 10
not 8 7
not 8 10
not 8 11
not 8 0
not 8 0
not 8 10
not 8 7
not 8 5
not 8 0
not 8 12
not 8 10
not 8 10
not 8 3
not 8 12
not 8 12
not 8 12
not 8 10
not 8 9
not 8 10
not 8 12
not 8 12
not 8 10
no

not 8 5
not 8 7
not 8 10
not 8 0
not 8 1
not 8 12
not 8 12
not 8 10
not 8 10
not 8 10
not 8 12
not 8 12
not 8 10
not 8 10
not 8 12
not 8 3
not 8 11
not 8 2
not 8 12
not 8 12
not 8 12
not 8 10
not 8 12
not 8 13
not 8 4
not 8 10
not 8 7
not 8 10
not 8 12
not 8 10
not 8 10
not 8 11
not 8 10
not 8 6
not 8 12
not 8 6
not 8 11
not 8 0
not 8 0
not 8 10
not 8 9
not 8 0
not 8 10
not 8 12
not 8 0
not 8 13
not 8 10
not 8 12
not 8 10
not 8 10
not 8 10
not 8 12
not 8 10
not 8 9
not 8 4
not 8 5
not 8 9
not 8 12
not 8 0
not 8 11
not 8 3
not 8 10
not 8 10
not 8 12
not 8 3
not 8 12
not 8 3
not 8 12
not 8 10
not 8 12
not 8 10
not 8 7
not 8 10
not 8 9
not 8 7
not 8 10
not 8 4
not 8 13
not 8 6
not 8 10
not 8 0
not 8 4
not 8 3
not 8 10
not 8 7
not 8 12
not 8 12
not 8 11
not 8 12
not 8 10
not 8 10
not 8 10
not 8 12
not 8 12
not 8 12
not 8 10
not 8 10
not 8 4
not 8 10
not 8 10
not 8 13
not 8 10
not 8 10
not 8 5
not 8 0
not 8 2
not 8 9
not 8 10
not 8 10
not 8 2
not 8 12
not 8 10
not 8 7
not 8 10
not 8 10
not 

In [86]:
import csv
# Write the submission file: one header row, then one "Id,Prediction" row per
# test sample.
if len(preds) != len(testIds):
    # Guard against a stale or partial `preds` list silently misaligning rows.
    raise ValueError('expected {} predictions, got {}'.format(len(testIds), len(preds)))
# newline='' is required by the csv module; without it the writer's own line
# endings get translated again and blank rows appear on Windows.
with open('nn_pred_5.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Id', 'Prediction'])
    writer.writerows(zip(testIds, preds))