In [1]:
import matplotlib.pyplot as plt
import os, pickle
from pathlib import Path
import numpy as np
import time, json, sys

# 1-gram

In [7]:
def analyze_bytes(chunksize, rootdir):
    """Collect a fixed-length byte vector for every file under ``rootdir``.

    The directory tree is walked with ``os.walk``. For each file the first
    ``chunksize`` bytes are read in binary mode and viewed as ``np.uint8``
    values. Files shorter than ``chunksize`` are right-padded with zeros so
    that every vector has the same length.

    Parameters
    ----------
    chunksize : int
        Number of leading bytes to keep per file (also the vector length).
    rootdir : str
        Root directory to scan recursively.

    Returns
    -------
    dict
        Maps each file's absolute path to a 1-D ``np.uint8`` array of length
        ``chunksize``.

    Side effects
    ------------
    Prints ``done with 2000`` after every 2000 files and a timing summary
    once the walk is finished.
    """
    mega_dict = dict()
    start = time.time()
    file_index = 0
    for subdir, _dirs, files in os.walk(rootdir):
        for file_name in files:
            filepath = os.path.abspath(os.path.join(subdir, file_name))

            # ``with`` closes the handle even if the read raises, so a bad
            # file part-way through a large walk cannot leak descriptors.
            with open(filepath, "rb") as f:
                raw = f.read(chunksize)

            contents = np.frombuffer(raw, dtype=np.uint8)

            # Short (or empty) files: zero-fill on the right up to chunksize.
            if len(contents) < chunksize:
                contents = np.pad(contents, (0, chunksize - len(contents)), constant_values=0)

            mega_dict[filepath] = contents

            file_index += 1
            if file_index % 2000 == 0:
                print('done with 2000')
    end = time.time()
    time_elapsed = end - start
    print("For chunksize: {cs} bytes Time elapsed was: {te} seconds".format(cs=chunksize, te=time_elapsed))
    return mega_dict

In [25]:
# Build the 1-gram vectors from the first 256 bytes of every file under ../CDIACPub8.
mega_dict = analyze_bytes(chunksize=256, rootdir='../CDIACPub8')

done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
For chunksize: 256 bytes Time elapsed was: 0.41138315200805664 seconds


In [11]:
# Persist the current 1-gram dictionary to disk with the newest pickle protocol.
# NOTE(review): the file name says 512B, but the only visible analyze_bytes call
# uses chunksize=256 -- confirm mega_dict was built with 512 before running this.
with open('CDIACFileData/ByteVectors/byte_vector_dict_512B.pkl', 'wb+') as pkl_out:
    pickle.dump(
        mega_dict,
        pkl_out,
        protocol=pickle.HIGHEST_PROTOCOL,
    )


In [71]:
# Replace mega_dict with the previously saved 256KB byte-vector dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
with open('CDIACFileData/ByteVectors/byte_vector_dict_256KB.pkl', 'rb') as pkl_in:
    mega_dict = pickle.load(pkl_in)

In [77]:
# Stack the per-file vectors (in dict insertion order) into one array, one row per file.
mega_array = np.array(list(mega_dict.values()))


In [78]:
# Show the dimensions of the stacked array built from mega_dict's values
# (presumably (number of files, bytes kept per file) -- confirm against the loaded pickle).
print(mega_array.shape)

(20427, 262144)


# 2-gram

In [27]:
def convert_bytes_to_two_grams(bytes_object, chunksize):
    """Encode every pair of adjacent bytes as a single integer code.

    For input bytes ``b[0], b[1], ..., b[n-1]`` the result holds
    ``b[i] * 256 + b[i+1]`` for ``i`` in ``0 .. n-2``, so each code lies in
    ``0 .. 65535`` and the output has ``n - 1`` entries.

    Parameters
    ----------
    bytes_object : bytes-like or sequence of ints in 0..255
        Raw bytes to encode.
    chunksize : int
        Only used for empty input, where a zero vector of length
        ``chunksize - 1`` is returned as a placeholder.

    Returns
    -------
    numpy.ndarray
        1-D ``np.uint32`` array. Length is ``len(bytes_object) - 1`` for
        non-empty input (hence empty for a single byte) and
        ``max(chunksize - 1, 0)`` for empty input.
    """
    if len(bytes_object) == 0:
        # Same dtype as the non-empty path so vectors from different files
        # stack into one integer array instead of being upcast to float.
        return np.zeros(max(chunksize - 1, 0), dtype=np.uint32)

    if isinstance(bytes_object, (bytes, bytearray, memoryview)):
        values = np.frombuffer(bytes_object, dtype=np.uint8)
    else:
        values = np.asarray(bytes_object)

    # Widen before multiplying: 255 * 256 does not fit in uint8.
    values = values.astype(np.uint32)

    # Vectorised pairing: element i combines byte i (high) with byte i+1 (low).
    return values[:-1] * 256 + values[1:]
    


In [28]:
def analyze_bytes_2grams(chunksize, rootdir):
    """Collect a fixed-length 2-gram vector for every file under ``rootdir``.

    The directory tree is walked with ``os.walk``. For each file the first
    ``chunksize`` bytes are read and handed to ``convert_bytes_to_two_grams``.
    The resulting vector is right-padded with zeros to ``chunksize - 1``
    entries, the number of adjacent pairs in ``chunksize`` bytes.

    Parameters
    ----------
    chunksize : int
        Number of leading bytes read per file.
    rootdir : str
        Root directory to scan recursively.

    Returns
    -------
    dict
        Maps each file's absolute path to a 1-D array of length
        ``chunksize - 1``; the dtype is whatever
        ``convert_bytes_to_two_grams`` returns.

    Side effects
    ------------
    Prints ``done with 2000`` after every 2000 files and a timing summary
    once the walk is finished.
    """
    mega_dict = dict()
    start = time.time()
    file_index = 0
    # chunksize bytes contain chunksize - 1 adjacent pairs.
    target_len = chunksize - 1
    for subdir, _dirs, files in os.walk(rootdir):
        for file_name in files:
            filepath = os.path.abspath(os.path.join(subdir, file_name))

            # ``with`` closes the handle even if the read raises.
            with open(filepath, "rb") as f:
                raw = f.read(chunksize)

            contents = convert_bytes_to_two_grams(raw, chunksize)

            # Files shorter than chunksize yield fewer pairs; zero-fill the tail.
            if len(contents) < target_len:
                contents = np.pad(contents, (0, target_len - len(contents)), constant_values=0)

            mega_dict[filepath] = contents

            file_index += 1
            if file_index % 2000 == 0:
                print('done with 2000')

    end = time.time()
    time_elapsed = end - start
    print("For chunksize: {cs} bytes Time elapsed was: {te} seconds".format(cs=chunksize, te=time_elapsed))
    return mega_dict

In [29]:
# Quick 2-gram run on only the first 10 bytes of each file under ../CDIACPub8.
mega_dict_2grams = analyze_bytes_2grams(chunksize=10, rootdir='../CDIACPub8')

done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
For chunksize: 10 bytes Time elapsed was: 0.9835131168365479 seconds


In [13]:
# Persist the current 2-gram dictionary to disk with the newest pickle protocol.
# NOTE(review): the file name says 512B, but the only visible single call to
# analyze_bytes_2grams uses chunksize=10 -- confirm which run produced mega_dict_2grams.
with open('CDIACFileData/ByteVectors/byte_vector_dict_512B_2grams.pkl', 'wb+') as pkl_out:
    pickle.dump(
        mega_dict_2grams,
        pkl_out,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

# Fast-tracking 2-grams

In [32]:
# Chunk sizes (in bytes) to generate 2-gram dictionaries for.
byte_sizes = [1048576]

# For each size: build the 2-gram dictionary, then pickle it to a file whose
# name embeds that size.
for chunk_size in byte_sizes:
    mega_dict_2grams = analyze_bytes_2grams(chunksize=chunk_size, rootdir='../CDIACPub8')
    with open(
        'CDIACFileData/ByteVectors/byte_vector_dict_{s}B_2grams.pkl'.format(s=chunk_size),
        'wb+',
    ) as pkl_out:
        pickle.dump(mega_dict_2grams, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
done with 2000
For chunksize: 1048576 bytes Time elapsed was: 761.0494556427002 seconds


In [35]:
# Sanity check of every pickle under CDIACFileData/ByteVectors/: a file is
# reported as "good" when at least one stored vector contains a value above 1000.
# The uint8 vectors produced by analyze_bytes can never exceed 255, so this
# presumably confirms that a file really holds 2-gram codes -- TODO confirm intent.
#
# The comparison is done with NumPy per vector rather than element by element
# in Python, which is what makes this feasible on the large pickles.
for subdir, dirs, files in os.walk("CDIACFileData/ByteVectors/"):
    for file in files:
        filepath = os.path.join(subdir, file)
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at pickles written by this notebook.
        with open(filepath, "rb") as fp:
            grams = pickle.load(fp)
        print("Loaded")

        # any() stops at the first vector that qualifies, like the original breaks.
        file_is_good = any(
            np.any(np.asarray(value) > 1000) for value in grams.values()
        )
        if file_is_good:
            print(file, "is good.")
				


KeyboardInterrupt: 