In [1]:
TEST_DATA = "sequences_sampled.dat"
TRAIN_DATA = "train_set_clean.dat"
TEST_LABEL_FILE = "labels_sampled.txt"

MAXLINES = int(4e5)
background_label = "0"

In [2]:
test_sequences = list()

print("Sequences")
for i, line in enumerate(open(TEST_DATA, "rt")):
    if i==0:
        alphabet_size = int(line.split()[-1])
        continue
    elif i+1 >= MAXLINES + 1 or len(line.strip()) == 0: # +1 on maxlines for the header
        continue
    line = line.split()
    test_sequences.append( list( int(x) for x in line[2:]) ) 
    if i % int(10e5) == 0:
        print("Line: ", i)

test_labels = list()
print("Labels")
for i, line in enumerate(open(TEST_LABEL_FILE, "rt")):
    if i+1 >= MAXLINES or len(line.strip()) == 0:
        continue
    if line.split()[-1] != background_label:
        test_labels.append(1)
    else:
        test_labels.append(0)
assert(len(test_sequences) == len(test_labels))
print("Got {} labeled sequences".format(len(test_sequences)))

Sequences
Labels
Got 399999 labeled sequences


In [5]:
benign_lines = dict()
malign_lines = dict()

def do_multimap(d: dict, s: str):
    """
    Counts the strings in the relevant maps.
    """
    if s in d:
        d[s] += 1
    else:
        d[s] = 1

for seq, label in zip(test_sequences, test_labels):
    seq_str = " ".join(str(x) for x in seq)
    if label == 0:
        do_multimap(benign_lines, seq_str)
    elif label == 1:
         do_multimap(malign_lines, seq_str)
            
print(len(benign_lines), len(malign_lines))

57124 1282


In [7]:
for overlap in set(benign_lines.keys()).intersection(malign_lines.keys()):
    print("Benign counts: {}, malign counts: {}, len: {}".format(benign_lines[overlap], malign_lines[overlap], \
                                                                 len(overlap.split())))

Benign counts: 8, malign counts: 3, len: 10
Benign counts: 2, malign counts: 2, len: 10
Benign counts: 1, malign counts: 1, len: 10
Benign counts: 5, malign counts: 460, len: 10
Benign counts: 1, malign counts: 2, len: 10
Benign counts: 1, malign counts: 1, len: 10
Benign counts: 6, malign counts: 22, len: 10
Benign counts: 4, malign counts: 1, len: 3
Benign counts: 1, malign counts: 2, len: 10
Benign counts: 38, malign counts: 102, len: 10
Benign counts: 5, malign counts: 22, len: 10
Benign counts: 2, malign counts: 4, len: 10
Benign counts: 1, malign counts: 1, len: 10
Benign counts: 1, malign counts: 1, len: 10
Benign counts: 2, malign counts: 11, len: 10
Benign counts: 2, malign counts: 5, len: 10
Benign counts: 3, malign counts: 26, len: 10
Benign counts: 10, malign counts: 19, len: 10
Benign counts: 4, malign counts: 2, len: 10
Benign counts: 1, malign counts: 1, len: 10
Benign counts: 172, malign counts: 9, len: 10
Benign counts: 4, malign counts: 38, len: 10
Benign counts: 1, m

In [10]:
train_counter = dict()

for i, line in enumerate(open(TRAIN_DATA, "rt")):
    if i == 0:
        continue
    elif i%int(1e6) == 0:
        print("Line ", i)
    line = " ".join(line.split()[2:])
    if line in malign_lines:
        do_multimap(train_counter, line)
print("Total amount of sequences: ", i-1)


Line  1000000
Line  2000000
Line  3000000
Line  4000000
Line  5000000
Line  6000000
Line  7000000
Line  8000000
Line  9000000
Line  10000000
Line  11000000
Line  12000000
Line  13000000
Line  14000000
Line  15000000
Line  16000000
Line  17000000
Line  18000000
Line  19000000
Line  20000000
Line  21000000
Line  22000000
Line  23000000
Line  24000000
Line  25000000
Line  26000000
Line  27000000
Line  28000000
Line  29000000
Line  30000000
Line  31000000
Line  32000000
Line  33000000
Line  34000000
Line  35000000
Total amount of sequences:  35422725


{'392951 392951 392951 392951 392951 392951 392951 392951 392951 392951': 91385,
 '111719 111719 111719 111719 111719 111719 111719 111719 111719 111719': 125183,
 '113015 111719 111719 111719 111719 111719 111719 111719 111719 111719': 461,
 '580871 580871 580871 580871 580871 580871 534215 534215 534215 534215': 591,
 '580871 580871 580871 534215 534215 534215 534215 534215 534215 534215': 1043,
 '580871 580871 534215 534215 534215 534215 534215 534215 534215 534215': 1337,
 '534215 534215 534215 534215 534215 534215 534215 534215 534215 534215': 311377,
 '534215 534215 534215 534215 534215 534215 534215 534215 534215 580871': 1201,
 '534215 534215 534215 534215 534215 534215 534215 534215 580871 580871': 357,
 '534215 534215 534215 534215 534215 534215 580871 580871 580871 580871': 227,
 '534215 534215 534215 534215 580871 580871 580871 580871 580871 580871': 271,
 '534215 534215 534215 580871 580871 580871 580871 580871 580871 580871': 364,
 '534215 534215 580871 580871 580871 5808

In [11]:
for seq, count in train_counter.items():
    print("Train data: {}, test data: {}".format(count, malign_lines[seq]))

Train data: 91385, test data: 11
Train data: 125183, test data: 377
Train data: 461, test data: 21
Train data: 591, test data: 2
Train data: 1043, test data: 1
Train data: 1337, test data: 1
Train data: 311377, test data: 40
Train data: 1201, test data: 1
Train data: 357, test data: 1
Train data: 227, test data: 1
Train data: 271, test data: 1
Train data: 364, test data: 1
Train data: 579, test data: 1
Train data: 155792, test data: 38
Train data: 585, test data: 2
Train data: 2493, test data: 1
Train data: 1433, test data: 2
Train data: 1317, test data: 1
Train data: 1293, test data: 1
Train data: 1381, test data: 2
Train data: 1529, test data: 1
Train data: 2119, test data: 1
Train data: 495352, test data: 28
Train data: 2010, test data: 29102
Train data: 4, test data: 1
Train data: 3, test data: 1
Train data: 4, test data: 1
Train data: 6, test data: 1
Train data: 2, test data: 1
Train data: 2, test data: 1
Train data: 1, test data: 1
Train data: 756, test data: 54
Train data: 18991

In [12]:
print(len(train_counter), len(malign_lines))

576 1282
