# CSTNET analysis

In [1]:
import pandas as pd
import os

In [2]:
path = "cstnet-tls1.3_packet\\packet"

train = pd.read_csv(os.path.join(path, "train_dataset.tsv"), sep="\t", header=0)
test = pd.read_csv(os.path.join(path, "test_dataset.tsv"), sep="\t", header=0)
valid = pd.read_csv(os.path.join(path, "valid_dataset.tsv"), sep="\t", header=0)

In [3]:
train["label"].value_counts()


label
81     4000
88     4000
98     4000
39     4000
113    4000
       ... 
75     2222
14     2034
99     1922
1      1818
20      844
Name: count, Length: 120, dtype: int64

In [4]:
test.label.value_counts()


label
61     500
103    500
59     500
50     500
7      500
      ... 
75     278
14     254
99     241
1      227
20     106
Name: count, Length: 120, dtype: int64

In [5]:
valid.label.value_counts()

label
2      500
9      500
110    500
114    500
21     500
      ... 
75     278
14     254
99     240
1      228
20     105
Name: count, Length: 120, dtype: int64

In [6]:
# how many packets from train, test, val
a = train['label'].value_counts().reset_index()
a.columns = ['label', 'train_count']

b = test['label'].value_counts().reset_index()
b.columns = ['label', 'test_count']

c = valid['label'].value_counts().reset_index()
c.columns = ['label', 'valid_count']

counts = pd.merge(a, b, on='label')
counts = pd.merge(counts, c, on='label')

counts = counts.fillna(0)

counts['train_count'] = counts['train_count'].astype(int)
counts['test_count'] = counts['test_count'].astype(int)
counts['valid_count'] = counts['valid_count'].astype(int)

In [7]:
full_labels_df = counts[(counts['train_count'] == 4000) & (counts['test_count'] == 500) & (counts['valid_count'] == 500)]
full_labels_df

Unnamed: 0,label,train_count,test_count,valid_count
0,81,4000,500,500
1,88,4000,500,500
2,98,4000,500,500
3,39,4000,500,500
4,113,4000,500,500
...,...,...,...,...
105,51,4000,500,500
106,96,4000,500,500
107,45,4000,500,500
108,70,4000,500,500


In [8]:
full_labels = full_labels_df['label'].values.tolist()
full_labels.sort()

In [9]:
full_labels

[0,
 2,
 3,
 4,
 6,
 7,
 9,
 10,
 11,
 12,
 13,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 74,
 76,
 77,
 78,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119]

Aside from 10 domains, all domains have 5000 packets, disted 80-10-10.
We can chose 10 domains to finetune on, as per SoK article.

In [10]:
def count_seq_len(seq):
    return len(seq.split(" "))

train['packet_seq_len'] = train['text_a'].apply(count_seq_len)

In [11]:
train['packet_seq_len'].value_counts()

packet_seq_len
64    409797
53     14628
41     12933
48      9147
45      3853
60      3545
57      3064
52      2039
49      1823
56      1752
44       507
63       349
59       330
61       250
58       243
43       225
42       206
47       101
54        97
50        86
55        80
46        74
51        70
62        64
39        51
40        28
38        25
Name: count, dtype: int64

Almost all packets are 64 tokens length, meaning 128 bytes. This corresponds to the article claiming they take the first 128 byte (others are lower and probably small packets).

In [12]:
# Look at one example label
#  order by the text itself
train[train.label == 0].sort_values(by='text_a')

Unnamed: 0,label,text_a,packet_seq_len
156366,0,01bb bb02 0262 6271 71c6 c61d 1d87 87f2 f2c3 c...,48
11605,0,01bb bb02 026f 6f73 73d3 d32a 2a79 7906 069b 9...,64
303233,0,01bb bb02 026f 6f78 78c7 c72a 2a79 7906 069b 9...,64
132340,0,01bb bb02 026f 6f83 83a7 a72a 2a79 7909 0940 4...,60
323343,0,01bb bb03 0375 75a4 a4aa aa9b 9b98 980d 0d55 5...,64
...,...,...,...
248567,0,fe92 92e6 e6e4 e440 40db db9a 9a4a 4a1a 1ab7 b...,64
266964,0,fe92 92e6 e6e4 e442 4248 489a 9a4a 4a1b 1b10 1...,60
299009,0,ff34 3410 1098 984e 4eac ac3c 3c19 19f6 f6dc d...,64
448518,0,ffc2 c280 808c 8c5b 5b6b 6b94 94ab ab11 117b 7...,64


I think you can clearly see paterns in the headers, at least in the first 56 bytes

In [13]:
dir = os.path.join(path, "..\\ten_classes")
classes = full_labels[:10]
classes

[0, 2, 3, 4, 6, 7, 9, 10, 11, 12]

In [14]:
train_ten = train[train['label'].isin(classes)][['label', 'text_a']]
test_ten = test[test['label'].isin(classes)][['label', 'text_a']]
valid_ten = valid[valid['label'].isin(classes)][['label', 'text_a']]

# store as tsv files
train_ten.to_csv(os.path.join(dir, 'train_ten.tsv'), sep='\t', index=False)
test_ten.to_csv(os.path.join(dir, 'test_ten.tsv'), sep='\t', index=False)
valid_ten.to_csv(os.path.join(dir, 'valid_ten.tsv'), sep='\t', index=False)