# Jupyter Notebook Version of DL

The project is first developed in a Jupyter Notebook for easy testing/verification, but it could be moved to a formal Python script in the future (if I have time). Unlike Xtract-Sampler, we won't implement any byte extraction here; for now we assume the byte-vector data already exists.
***
Training and Developing a model

### Import Statements

In [42]:
import pickle
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from ByteVectorDataset import ByteVectorDataset

### Loading Files/Data Processing

Mostly for testing purposes

In [34]:
# Locations of the pickled one-gram and two-gram byte-vector files
# (presumably dictionaries, judging by the file names).
one_gram_pickle_path = 'CDIACFileData/ByteVectors/byte_vector_dict_512B_one_gram.pkl'
two_gram_pickle_path = 'CDIACFileData/ByteVectors/byte_vector_dict_512B_two_gram.pkl'

print("loading files now...")

# NOTE: unpickling can execute arbitrary code, so these files must come from a trusted source.
with open(one_gram_pickle_path, "rb") as one_gram_file:
    one_gram = pickle.load(one_gram_file)

with open(two_gram_pickle_path, "rb") as two_gram_file:
    two_gram = pickle.load(two_gram_file)

print("loading files done!")

loading files now...
loading files done!


In [35]:
# CSV with the ground-truth label for each file; shared by both datasets.
label_path = "CDIACFileData/labels/cdiac_naivetruth_processed.csv"

# Wrap each loaded byte-vector collection in a dataset, one-gram first, then two-gram.
dataset_one_gram, dataset_two_gram = (
    ByteVectorDataset(label_path, byte_vectors)
    for byte_vectors in (one_gram, two_gram)
)

In [36]:
# Heuristic from
# https://discuss.pytorch.org/t/guidelines-for-assigning-num-workers-to-dataloader/813/3:
# use four DataLoader worker processes per visible CUDA device.
# With no CUDA device this evaluates to 0, i.e. batches are loaded in the main process.
visible_gpu_count = torch.cuda.device_count()
reccomended_num_workers = 4 * visible_gpu_count

In [37]:
# Loaders that yield one shuffled sample per batch, used for eyeballing the data.
single_sample_loader_kwargs = dict(
    batch_size=1,
    shuffle=True,
    num_workers=reccomended_num_workers,
)

dataloader_one_gram = DataLoader(dataset_one_gram, **single_sample_loader_kwargs)
dataloader_two_gram = DataLoader(dataset_two_gram, **single_sample_loader_kwargs)

In [38]:
# Print the first few one-gram batches as a quick sanity check of what the loader yields.
LAST_PREVIEW_BATCH_INDEX = 3  # zero-based, so four batches are shown in total

for i_batch, sample_batched in enumerate(dataloader_one_gram):
    print(i_batch, sample_batched)
    # Stop right after the 4th batch has been displayed.
    if i_batch >= LAST_PREVIEW_BATCH_INDEX:
        break

0 [tensor([[ 80, 108, 101,  97, 115, 101,  32,  99, 105, 116, 101,  32, 116, 104,
         105, 115,  32, 100,  97, 116,  97,  32, 115, 101, 116,  32,  97, 115,
          58,  10,  10,  84, 105, 108,  98, 114, 111, 111, 107,  44,  32,  66,
          46,  44,  32,  97, 110, 100,  32,  77,  46,  32,  82, 111, 115, 101,
         110,  98, 101, 114, 103,  46,  32,  50,  48,  49,  51,  46,  32,  67,
          97, 114,  98, 111, 110,  32,  68, 105, 111, 120, 105, 100, 101,  32,
          97, 110, 100,  32,  72, 121, 100, 114, 111, 103, 114,  97, 112, 104,
         105,  99,  32, 100,  97, 116,  97,  32, 111,  98, 116,  97, 105, 110,
         101, 100,  32, 100, 117, 114, 105, 110, 103,  32, 116, 104, 101,  32,
          82,  47,  86,  32,  65, 117, 114, 111, 114,  97,  32,  65, 117, 115,
         116, 114,  97, 108, 105, 115,  32,  99, 114, 117, 105, 115, 101,  32,
          10,  48,  57,  65,  82,  50,  48,  48,  54,  48,  49,  48,  50,  32,
          97, 108, 111, 110, 103,  32, 116, 104, 

From here on we will use only the one-gram data, but the process for two-grams is identical.

In [39]:
TRAIN_TEST_SPLIT = .8  # fraction of samples used for training; the remaining 20% is held out for validation
SPLIT_SEED = 0  # fixed seed so every run produces the same train/validation partition

total_samples = len(dataset_one_gram)
train_set_size = int(TRAIN_TEST_SPLIT * total_samples)
# Derive the validation size by subtraction so the two sizes always sum to the dataset length.
val_set_size = total_samples - train_set_size

# A dedicated, seeded generator keeps the split reproducible across kernel restarts
# without touching the global RNG used elsewhere (e.g. for DataLoader shuffling).
train_set, val_set = torch.utils.data.random_split(
    dataset_one_gram,
    [train_set_size, val_set_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(len(train_set))
print(len(val_set))

11474
2869


In [40]:
# Mini-batch loaders over the train/validation subsets; both share the same settings.
# NOTE(review): shuffling the validation loader is not needed for evaluation; confirm it is intentional.
split_loader_kwargs = {
    "batch_size": 32,
    "shuffle": True,
    "num_workers": reccomended_num_workers,
}

train_loader = DataLoader(train_set, **split_loader_kwargs)
val_loader = DataLoader(val_set, **split_loader_kwargs)

In [41]:
# Pull a single batch from the training loader to sanity-check its types and shapes.
dataiter = iter(train_loader)
# Use the built-in next(): the iterator's .next() method is not available in newer
# PyTorch releases, while next(iterator) works on every version.
# NOTE: despite the name, `images` holds the batch of byte vectors, not image data.
images, labels = next(dataiter)
print(type(images))
print(images.shape)
print(labels)

<class 'torch.Tensor'>
torch.Size([32, 512])
('tabular', 'image', 'tabular', 'json/xml', 'tabular', 'image', 'freetext', 'freetext', 'freetext', 'freetext', 'freetext', 'freetext', 'freetext', 'freetext', 'tabular', 'tabular', 'tabular', 'tabular', 'freetext', 'freetext', 'tabular', 'unknown', 'freetext', 'tabular', 'unknown', 'freetext', 'freetext', 'tabular', 'tabular', 'freetext', 'freetext', 'tabular')
