In [6]:
# Input files, resolved relative to the notebook's working directory.
TRAIN_DATA = "train_set_clean.dat"
TEST_DATA = "test_set_clean.dat"
TEST_LABEL_FILE = "labels_test_set_clean.txt"

# Label string that marks a background sequence; the loader maps it to 0 and
# every other label to 1.
background_label = "0"
# Line budget applied when reading the test files; a falsy value disables it.
MAXLINES = 7 * 10**6

In [7]:
# Load the test sequences and their binary labels.
#
# The sequence file starts with a header line whose last field is the alphabet
# size. Every following non-blank line contributes one sequence: the first two
# fields are dropped and the rest are parsed as ints.
# NOTE(review): the meaning of the two dropped fields is not visible here.
#
# The label file has no header. A label equal to `background_label` becomes 0,
# anything else becomes 1. Both loops stop after the same number of data lines
# when MAXLINES is set, so sequences and labels stay aligned.
test_sequences = []

print("Sequences")
with open(TEST_DATA, "rt") as seq_file:
    for i, line in enumerate(seq_file):
        if i == 0:
            alphabet_size = int(line.split()[-1])
            continue
        if MAXLINES and i >= MAXLINES:
            # Budget exhausted (the header counts as one line). Every later
            # line would be skipped as well, so stop reading here.
            break
        if not line.strip():
            continue
        fields = line.split()
        test_sequences.append([int(x) for x in fields[2:]])
        if i % 1000000 == 0:
            print("Line: ", i)

test_labels = []
print("Labels")
with open(TEST_LABEL_FILE, "rt") as label_file:
    for i, line in enumerate(label_file):
        if MAXLINES and i + 1 >= MAXLINES:
            # No header here, so the cut-off index is one lower than above.
            break
        if not line.strip():
            continue
        test_labels.append(0 if line.split()[-1] == background_label else 1)

assert len(test_sequences) == len(test_labels), \
    "sequence/label count mismatch: {} vs {}".format(len(test_sequences), len(test_labels))
print("Got {} labeled sequences".format(len(test_sequences)))

n_test_sequences = len(test_sequences)

Sequences
Line:  1000000
Line:  2000000
Line:  3000000
Line:  4000000
Line:  5000000
Line:  6000000
Labels
Got 6999999 labeled sequences


In [8]:
# Occurrence counters, one per label class (filled in further down this cell).
benign_lines = {}
malign_lines = {}

def do_multimap(d: dict, s: str):
    """
    Increment the occurrence count stored under key ``s`` in ``d``.

    A key seen for the first time is inserted with a count of 1. The
    dictionary is updated in place and nothing is returned.
    """
    d[s] = d.get(s, 0) + 1

# Tally each test sequence, rendered as a space-joined string, in the counter
# that matches its label: 1 -> malign_lines, 0 -> benign_lines.
for seq, label in zip(test_sequences, test_labels):
    seq_str = " ".join(map(str, seq))
    if label == 1:
        do_multimap(malign_lines, seq_str)
    elif label == 0:
        do_multimap(benign_lines, seq_str)

# Number of distinct sequences per class.
print(len(benign_lines), len(malign_lines))

243386 875


In [9]:
# Rank the sequences that occur under both labels.
#   malign_counts rows: (malign count, benign count, token count, sequence)
#   benign_counts rows: (benign count, malign count, token count)
malign_counts = []
benign_counts = []

overlapping_strings = set(benign_lines) & set(malign_lines)
for overlap in overlapping_strings:
    n_malign = malign_lines[overlap]
    n_benign = benign_lines[overlap]
    n_tokens = len(overlap.split())
    malign_counts.append((n_malign, n_benign, n_tokens, overlap))
    benign_counts.append((n_benign, n_malign, n_tokens))

# Descending tuple order: the leading count is the primary sort key.
malign_counts.sort(reverse=True)
benign_counts.sort(reverse=True)
malign_counts[:10], benign_counts[:10]

([(12984,
   26798,
   10,
   '99027 99027 99027 99027 99027 99027 99027 99027 99027 99027'),
  (12941,
   114,
   10,
   '146699 146699 146699 146699 146699 146699 146699 146699 146699 146699'),
  (11522,
   270,
   10,
   '146700 146700 146700 146700 146700 146700 146700 146700 146700 146700'),
  (5832,
   1,
   10,
   '136997 136997 136997 136997 136997 136997 136997 136997 136997 136997'),
  (5160,
   20493,
   10,
   '155960 155960 155960 155960 155960 155960 155960 155960 155960 155960'),
  (2950,
   8,
   10,
   '136996 136996 136996 136996 136996 136996 136996 136996 136996 136996'),
  (539,
   17325,
   10,
   '99026 99026 99026 99026 99026 99026 99026 99026 99026 99026'),
  (264,
   3394,
   10,
   '155938 155938 155938 155938 155938 155938 155938 155938 155938 155938'),
  (226,
   2701,
   10,
   '99049 99049 99049 99049 99049 99049 99049 99049 99049 99049'),
  (220,
   27,
   10,
   '51840 51840 51840 51840 51840 51840 51840 51840 51840 51840')],
 [(26798, 12984, 10),
  (20

In [10]:
# Sequence strings (last tuple field) of the ten leading entries of
# malign_counts, which is sorted by descending malign count. Slicing instead
# of indexing with range(10) avoids an IndexError when fewer than ten
# overlapping sequences exist.
most_costly_strings = {entry[-1] for entry in malign_counts[:10]}
most_costly_strings

{'136996 136996 136996 136996 136996 136996 136996 136996 136996 136996',
 '136997 136997 136997 136997 136997 136997 136997 136997 136997 136997',
 '146699 146699 146699 146699 146699 146699 146699 146699 146699 146699',
 '146700 146700 146700 146700 146700 146700 146700 146700 146700 146700',
 '155938 155938 155938 155938 155938 155938 155938 155938 155938 155938',
 '155960 155960 155960 155960 155960 155960 155960 155960 155960 155960',
 '51840 51840 51840 51840 51840 51840 51840 51840 51840 51840',
 '99026 99026 99026 99026 99026 99026 99026 99026 99026 99026',
 '99027 99027 99027 99027 99027 99027 99027 99027 99027 99027',
 '99049 99049 99049 99049 99049 99049 99049 99049 99049 99049'}

In [11]:
# Occurrence counts of the sequences that appear only with a malign label
# (never with a benign one), largest first.
counts = []
for mal_string, mal_count in malign_lines.items():
    if mal_string not in overlapping_strings:
        counts.append(mal_count)
counts = sorted(counts, reverse=True)
print(counts[:10])

[11814, 6491, 5075, 2896, 2463, 1621, 1546, 1429, 1229, 1214]


In [12]:
# Count how often each malign test sequence also occurs in the training file.
# Line 0 is skipped as a header. Each remaining line is reduced to its fields
# from the third onward, joined by single spaces, which is the same key format
# used for malign_lines.
train_counter = dict()

# The context manager closes the file even if the loop raises.
with open(TRAIN_DATA, "rt") as train_file:
    for i, line in enumerate(train_file):
        if i == 0:
            continue
        elif i % int(1e6) == 0:
            print("Line ", i)
        line = " ".join(line.split()[2:])
        if line in malign_lines:
            do_multimap(train_counter, line)

# NOTE(review): `i` is the index of the last line read, so `i-1` equals the
# sequence count only if the file has one extra line besides the header (for
# example a trailing blank line). Confirm against the data file.
print("Total amount of sequences: ", i-1)

n_train_sequences = i-1

Line  1000000
Line  2000000
Line  3000000
Line  4000000
Line  5000000
Line  6000000
Line  7000000
Total amount of sequences:  7878900


In [13]:
# For each malign test sequence that was also found in the training file, show
# its training count next to its malign count in the test set.
for seq, count in train_counter.items():
    test_count = malign_lines[seq]
    print("Train data: {}, test data: {}".format(count, test_count))

Train data: 19659, test data: 264
Train data: 1, test data: 1
Train data: 1, test data: 1
Train data: 1, test data: 1
Train data: 1, test data: 1
Train data: 1, test data: 1
Train data: 1, test data: 1
Train data: 13110, test data: 5160
Train data: 8721, test data: 82
Train data: 11, test data: 1
Train data: 11, test data: 1
Train data: 11, test data: 1
Train data: 11, test data: 1
Train data: 11, test data: 1
Train data: 11, test data: 1
Train data: 12, test data: 1
Train data: 13, test data: 1
Train data: 13, test data: 1
Train data: 6357, test data: 1
Train data: 218, test data: 2
Train data: 4, test data: 1
Train data: 4, test data: 1
Train data: 3, test data: 1
Train data: 3, test data: 1
Train data: 3, test data: 1
Train data: 3, test data: 1
Train data: 343, test data: 11522
Train data: 2, test data: 1
Train data: 2, test data: 1
Train data: 2, test data: 1
Train data: 2, test data: 1
Train data: 2, test data: 1
Train data: 2, test data: 1
Train data: 2, test data: 1
Train data:

In [14]:
# Distinct malign test sequences seen in the training file vs. all distinct
# malign test sequences.
n_seen_in_train = len(train_counter)
n_malign_unique = len(malign_lines)
print(n_seen_in_train, n_malign_unique)

187 875


In [15]:
# For every ranked overlapping sequence, report its absolute and relative
# frequency as malign test data, benign test data and training data.
# A sequence missing from a counter is reported with a literal 0 for both its
# count and its relative frequency.
malign_counts.sort(reverse=True)
for count, _, _, line in malign_counts:
    if line in benign_lines:
        benign_n = benign_lines[line]
        benign_p = benign_n / n_test_sequences
    else:
        benign_n, benign_p = 0, 0

    if line in train_counter:
        train_n = train_counter[line]
        train_p = train_n / n_train_sequences
    else:
        train_n, train_p = 0, 0

    print("Malign test set: {}, p: {}, benign test set: {}, p: {}, train set: {}, p: {}".format(
        count, count / n_test_sequences, benign_n, benign_p, train_n, train_p))


Malign test set: 12984, p: 0.0018548574078367725, benign test set: 26798, p: 0.0038282862611837515, train set: 47166, p: 0.005986368655522979
Malign test set: 12941, p: 0.0018487145498163643, benign test set: 114, p: 1.628571661224523e-05, train set: 6307, p: 0.0008004924545304548
Malign test set: 11522, p: 0.0016460002351428907, benign test set: 270, p: 3.857143408163344e-05, train set: 343, p: 4.3533995862366575e-05
Malign test set: 5832, p: 0.0008331429761632823, benign test set: 1, p: 1.4285716326530904e-07, train set: 0, p: 0
Malign test set: 5160, p: 0.0007371429624489946, benign test set: 20493, p: 0.0029275718467959783, train set: 13110, p: 0.0016639378593458478
Malign test set: 2950, p: 0.00042142863163266165, benign test set: 8, p: 1.1428573061224723e-06, train set: 0, p: 0
Malign test set: 539, p: 7.700001100000157e-05, benign test set: 17325, p: 0.002475000353571479, train set: 0, p: 0
Malign test set: 264, p: 3.7714291102041586e-05, benign test set: 3394, p: 0.000484857212

In [16]:
# NOTE(review): 4704 is an unexplained literal; its origin is not visible in
# this notebook. Consider naming it or deriving it from data.
(n_train_sequences, 4704)

(7878900, 4704)