# Jupyter Notebook Version of DL

The project is first developed in Jupyter Notebook for easy testing/verification but could be moved to a formal Python Script in the future (if I have time). Contrary to the Xtract-Sampler we won't be implementing any byte extraction but rather right now assume we have the data.
***
Training and Developing a model

### Import Statements

In [1]:
%load_ext autoreload

In [2]:
import pickle
import numpy as np
import torch, os
from torch import nn
from torch.utils.data import DataLoader
from ByteVectorDataset import ByteVectorDataset
from model import SimpleCNN
from time import time

%autoreload 2

In [3]:
BYTE_BLOCK_SIZE = 512

### Loading Files/Data Processing

Mostly for testing purposes

In [4]:
print("loading files now...")

with open('CDIACFileData/ByteVectors/byte_vector_dict_512B_one_gram.pkl', "rb") as fp1:
    one_gram = pickle.load(fp1)
with open('CDIACFileData/ByteVectors/byte_vector_dict_512B_two_gram.pkl', "rb") as fp2:
    two_gram = pickle.load(fp2)

print("loading files done!")

loading files now...
loading files done!


In [5]:
label_path = "CDIACFileData/labels/cdiac_naivetruth_processed.csv"
dataset_one_gram = ByteVectorDataset(label_path, one_gram)
dataset_two_gram = ByteVectorDataset(label_path, two_gram)

In [6]:
reccomended_num_workers = 4 * torch.cuda.device_count()
# ^ from https://discuss.pytorch.org/t/guidelines-for-assigning-num-workers-to-dataloader/813/3

In [7]:
dataloader_one_gram = DataLoader(dataset_one_gram, batch_size=1,
                        shuffle=True, num_workers=reccomended_num_workers)
dataloader_two_gram = DataLoader(dataset_two_gram, batch_size=1,
                        shuffle=True, num_workers=reccomended_num_workers)

In [8]:
for i_batch, sample_batched in enumerate(dataloader_one_gram):
    print(i_batch, sample_batched)
    # observe 3rd batch and stop.
    if i_batch == 2:
        break

0 [tensor([[ 60.,  63., 120., 109., 108.,  32., 118., 101., 114., 115., 105., 111.,
         110.,  61.,  34.,  49.,  46.,  48.,  34.,  32., 101., 110.,  99., 111.,
         100., 105., 110., 103.,  61.,  34., 117., 116., 102.,  45.,  56.,  34.,
          63.,  62.,  13.,  10.,  60., 120.,  95., 116.,  97., 103., 115.,  62.,
          13.,  10.,  32.,  32.,  60.,  85., 115., 101., 114.,  62.,  13.,  10.,
          32.,  32.,  32.,  32.,  60.,  78.,  97., 109., 101.,  62.,  83., 117.,
         108., 108., 105., 118.,  97., 110.,  44.,  32.,  75., 101., 118., 105.,
         110.,  60.,  47.,  78.,  97., 109., 101.,  62.,  13.,  10.,  32.,  32.,
          32.,  32.,  60.,  79., 114., 103.,  97., 110., 105., 122.,  97., 116.,
         105., 111., 110.,  62.,  78.,  79.,  65.,  65.,  47.,  65., 116., 108.,
          97., 110., 116., 105.,  99.,  32.,  79.,  99., 101.,  97., 110., 111.,
         103., 114.,  97., 112., 104., 105.,  99.,  32.,  38.,  97., 109., 112.,
          59.,  32.,  77.

From here on out we will just be using one grams but the two grams process is identical

In [9]:
TRAIN_TEST_SPLIT = .8 # 80% for training 20% for testing
train_set_size = int(TRAIN_TEST_SPLIT * len(dataset_one_gram))
val_set_size = len(dataset_one_gram) - train_set_size
train_set, val_set = torch.utils.data.random_split(dataset_one_gram, [train_set_size, val_set_size])
print(len(train_set))
print(len(val_set))

11474
2869


In [10]:
train_loader = DataLoader(train_set, batch_size=32,
                        shuffle=True, num_workers=reccomended_num_workers)
val_loader = DataLoader(val_set, batch_size=32,
                        shuffle=True, num_workers=reccomended_num_workers)

In [11]:
dataiter = iter(train_loader)
images, labels = dataiter.next()
print(type(images))
print(images.shape)
print(labels)

<class 'torch.Tensor'>
torch.Size([32, 512])
tensor([1, 1, 3, 1, 1, 3, 5, 3, 1, 2, 1, 0, 2, 0, 2, 2, 0, 2, 1, 1, 2, 2, 3, 1,
        1, 2, 1, 1, 3, 2, 2, 2])


### Training

In [12]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.device(1))
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(1))

True
0
<torch.cuda.device object at 0x7fc9c0b11df0>
4
Tesla V100-SXM2-32GB


In [13]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.rand(1).cuda()
print(torch.version.cuda)

10.2


In [15]:
%autoreload 2

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SimpleCNN(BYTE_BLOCK_SIZE)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model, device_ids=[0])
model.to(device)

loss_function = nn.NLLLoss() # This is a convex loss function
optimizer = torch.optim.Adam(model.parameters(), lr=.03) # From my CS361 class SGD is shown to do well on convex functions

Let's use 4 GPUs!


In [16]:
%autoreload 2
time0 = time()
epochs = 10
for epoch in range(epochs):
    running_loss = 0
    for byte_vector, labels in train_loader:
        #print(byte_vector.shape)
        byte_vector = byte_vector.to(device)
        labels = labels.to(device)
        
        #Training Pass
        optimizer.zero_grad()
       
        output = model(byte_vector).to(device)
        #print("Outside: input size", input.size(), "output_size", output.size())
        #print(output.shape)
        loss = loss_function(output, labels)
        
        #backpropagation
        loss.backward()
        
        #optimization
        optimizer.step()
        
        running_loss += loss.item()
    else:
        print("Epoch {} - Training loss: {}".format(epoch, running_loss/len(train_loader)))


print("\nTraining Time (in minutes) = ", (time()-time0)/60)

Epoch 0 - Training loss: -524059800019.64136
Epoch 1 - Training loss: -9401254744896.89
Epoch 2 - Training loss: -40934368018249.445
Epoch 3 - Training loss: -105912079956053.58
Epoch 4 - Training loss: -211838474312244.78
Epoch 5 - Training loss: -363878928988701.94
Epoch 6 - Training loss: -565353263723123.5
Epoch 7 - Training loss: -819143398112869.2
Epoch 8 - Training loss: -1128076303755840.2
Epoch 9 - Training loss: -1494795087748869.0

Training Time (in minutes) =  0.33757022619247434


### Testing

In [17]:
correct_count = 0
all_count = 0
for byte_vector, labels in val_loader:
    for i in range(len(labels)):
        byte_vector.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            output = model(byte_vector)

        probabilities = torch.exp(output)
        probab = list(probabilities.cpu().numpy()[0])
        pred_label = probab.index(max(probab))
        true_label = labels.cpu().numpy()[i]
        if true_label == pred_label:
            correct_count += 1
        all_count += 1

print("Number of Images Tested =", all_count)
print("\nModel Accuracy =", (correct_count/all_count))


Number of Images Tested = 2869

Model Accuracy = 0.04879749041477867
