# Jupyter Notebook Version of DL

The project is first developed in a Jupyter Notebook for easy testing/verification, but it could be moved to a formal Python script in the future (if I have time). Unlike the Xtract-Sampler, we won't implement any byte extraction here; for now we assume the byte-vector data has already been produced.
***
Training and Developing a model

### Import Statements

In [1]:
import pickle
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from ByteVectorDataset import ByteVectorDataset
from model import SimpleCNN
from time import time

In [2]:
# Bytes per sample block (the pickles loaded below are the 512B variants);
# this value is handed to SimpleCNN when the model is constructed.
BYTE_BLOCK_SIZE = 512

### Loading Files/Data Processing

Mostly for testing purposes

In [3]:
print("loading files now...")

# Read both pre-computed byte-vector pickles; the single `with` releases both
# file handles together when the block exits.
# NOTE(review): pickle.load can execute code embedded in the file -- only point
# this at files you trust.
with open('CDIACFileData/ByteVectors/byte_vector_dict_512B_one_gram.pkl', "rb") as fp1, \
        open('CDIACFileData/ByteVectors/byte_vector_dict_512B_two_gram.pkl', "rb") as fp2:
    one_gram = pickle.load(fp1)
    two_gram = pickle.load(fp2)

print("loading files done!")

loading files now...
loading files done!


In [4]:
label_path = "CDIACFileData/labels/cdiac_naivetruth_processed.csv"

# Both n-gram dictionaries are paired with the same label CSV.
dataset_one_gram, dataset_two_gram = (
    ByteVectorDataset(label_path, byte_vectors)
    for byte_vectors in (one_gram, two_gram)
)

In [5]:
# Rule of thumb from the PyTorch forums: four loader workers per visible GPU, see
# https://discuss.pytorch.org/t/guidelines-for-assigning-num-workers-to-dataloader/813/3
reccomended_num_workers = torch.cuda.device_count() * 4

In [6]:
# One-sample, shuffled loaders over each dataset (used below for eyeballing the data).
dataloader_one_gram, dataloader_two_gram = (
    DataLoader(dataset, batch_size=1, shuffle=True,
               num_workers=reccomended_num_workers)
    for dataset in (dataset_one_gram, dataset_two_gram)
)

In [7]:
# Print only the first four batches (indices 0-3); zip() stops as soon as the
# range is exhausted, so no fifth batch is ever fetched from the loader.
for i_batch, sample_batched in zip(range(4), dataloader_one_gram):
    print(i_batch, sample_batched)

0 [tensor([[239, 187, 191,  60,  63, 120, 109, 108,  32, 118, 101, 114, 115, 105,
         111, 110,  61,  34,  49,  46,  48,  34,  32, 101, 110,  99, 111, 100,
         105, 110, 103,  61,  34, 117, 116, 102,  45,  56,  34,  63,  62,  13,
          10,  60, 120,  95, 116,  97, 103, 115,  62,  13,  10,  32,  32,  60,
          85, 115, 101, 114,  62,  13,  10,  32,  32,  32,  32,  60,  78,  97,
         109, 101,  62,  83, 117, 108, 108, 105, 118,  97, 110,  44,  32,  75,
         101, 118, 105, 110,  60,  47,  78,  97, 109, 101,  62,  13,  10,  32,
          32,  32,  32,  60,  79, 114, 103,  97, 110, 105, 122,  97, 116, 105,
         111, 110,  62,  65,  79,  77,  76,  47,  78,  79,  65,  65,  60,  47,
          79, 114, 103,  97, 110, 105, 122,  97, 116, 105, 111, 110,  62,  13,
          10,  32,  32,  32,  32,  60,  65, 100, 100, 114, 101, 115, 115,  62,
          52,  51,  48,  49,  32,  82, 105,  99, 107, 101, 110,  98,  97,  99,
         107, 101, 114,  32,  67,  97, 117, 115, 

From here on we will use only the one-gram dataset; the process for the two-gram dataset is identical.

In [8]:
TRAIN_TEST_SPLIT = .8  # fraction of samples used for training; the rest is held out for validation
SPLIT_SEED = 0  # fixed so that train/validation membership is identical on every run

train_set_size = int(TRAIN_TEST_SPLIT * len(dataset_one_gram))
# Derive the validation size by subtraction so the two sizes always sum to the dataset length.
val_set_size = len(dataset_one_gram) - train_set_size

# Without an explicit generator the split changes on every execution, which makes
# validation numbers from different runs incomparable.
train_set, val_set = torch.utils.data.random_split(
    dataset_one_gram,
    [train_set_size, val_set_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(len(train_set))
print(len(val_set))

11474
2869


In [9]:
BATCH_SIZE = 32  # samples per mini-batch for both loaders

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=reccomended_num_workers)
# Sample order has no effect on validation accuracy, so keep it fixed.
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE,
                        shuffle=False, num_workers=reccomended_num_workers)

In [10]:
# Pull a single training batch to sanity-check tensor type, shape and labels.
dataiter = iter(train_loader)
# Use the built-in next(): the Python-2-style `.next()` alias is not available
# on DataLoader iterators in newer PyTorch releases.
# `images` holds the batch of byte vectors (the name is kept for compatibility).
images, labels = next(dataiter)
print(type(images))
print(images.shape)
print(labels)

<class 'torch.Tensor'>
torch.Size([32, 512])
tensor([2, 1, 1, 3, 2, 1, 2, 1, 3, 1, 1, 1, 3, 4, 0, 2, 1, 2, 2, 1, 2, 3, 5, 2,
        1, 2, 3, 1, 1, 1, 1, 2])


### Training

In [11]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleCNN(BYTE_BLOCK_SIZE)
if torch.cuda.device_count() > 1:
    # DataParallel splits each batch across the visible GPUs.
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
model = model.to(device)

# NLLLoss expects log-probabilities as input -- presumably SimpleCNN ends in
# log_softmax; TODO confirm in model.py.
loss_function = nn.NLLLoss()
# The optimizer is Adam (not plain SGD).
# NOTE(review): lr=.03 is well above Adam's 1e-3 default -- revisit if the loss diverges.
optimizer = torch.optim.Adam(model.parameters(), lr=.03)

Let's use 4 GPUs!


In [12]:
# Train for a fixed number of epochs and report the mean batch loss after each one.
#
# NOTE(review): the recorded run of this cell fails inside SimpleCNN's first Conv1d,
# which wants a 3-D input while the loader yields 2-D (batch, 512) tensors. The
# correct reshape depends on how model.py lays out channels, so it is not attempted
# here -- fix it in SimpleCNN.forward or reshape the batch before the forward pass.
time0 = time()
epochs = 100
model.train()  # make sure training-mode behaviour is active if this cell is re-run after evaluation
for epoch in range(epochs):
    running_loss = 0.0
    for byte_vector, labels in train_loader:
        byte_vector = byte_vector.to(device)
        labels = labels.to(device)

        # Forward pass (gradients from the previous step are cleared first).
        optimizer.zero_grad()
        output = model(byte_vector)
        loss = loss_function(output, labels)

        # Backpropagation followed by the parameter update.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # The original referenced an undefined name `e` here, raising NameError
    # at the end of the first epoch.
    print("Epoch {} - Training loss: {}".format(epoch, running_loss/len(train_loader)))


print("\nTraining Time (in minutes) = ", (time()-time0)/60)

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/cc/anaconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/cc/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/cc/xtract-sampler-DL/model.py", line 20, in forward
    out = self.l1(x)
  File "/home/cc/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/cc/anaconda3/lib/python3.8/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/home/cc/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/cc/anaconda3/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 298, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/cc/anaconda3/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 294, in _conv_forward
    return F.conv1d(input, weight, bias, self.stride,
RuntimeError: Expected 3-dimensional input for 3-dimensional weight [256, 512, 8], but got 2-dimensional input of size [8, 512] instead


### Testing

In [None]:
# Evaluate classification accuracy over the whole validation loader.
correct_count, all_count = 0, 0
model.eval()  # inference behaviour for layers such as dropout / batch-norm

with torch.no_grad():
    for byte_vector, labels in val_loader:
        # Tensor.to() returns a new tensor, so the result has to be bound.
        byte_vector = byte_vector.to(device)
        labels = labels.to(device)

        # One forward pass per batch (the original re-ran the model once per sample
        # and always compared against the prediction for row 0).
        output = model(byte_vector)

        # Assumes output is (batch, num_classes). exp() is monotonic, so taking the
        # argmax of the raw scores picks the same class as the argmax of exp(scores).
        pred_labels = output.argmax(dim=1)

        correct_count += (pred_labels == labels).sum().item()
        all_count += labels.size(0)

print("Number of Images Tested =", all_count)
# Guard against an empty validation loader rather than raising ZeroDivisionError.
print("\n Model Accuracy =", (correct_count/all_count) if all_count else float("nan"))
