# CDA Sampling task
### CTU-Malware-Capture-Botnet-52 or Scenario 11 in the CTU-13 dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_table('capture20110818-2.pcap.netflow.labeled',delim_whitespace=True) 
print(type(df))

<class 'pandas.core.frame.DataFrame'>


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df = df.iloc[:,:-5]
df.columns = ['Date','Time','Durat','Prot','Src IP Addr:Port','Dir','Dst IP Addr:Port','Flags','Tos','Packets','Bytes','Flows','Label']
LEN_DF = len(df)
print(LEN_DF)
print(df.head())

408835
         Date          Time  Durat Prot    Src IP Addr:Port Dir  \
0  2011-08-18  10:39:35.289    0.0  TCP  85.3.219.122:55347  ->   
1  2011-08-18  10:39:36.067    0.0  TCP  85.3.219.122:55347  ->   
2  2011-08-18  10:39:36.754    0.0  TCP  85.3.219.122:55347  ->   
3  2011-08-18  10:39:37.079    0.0  TCP   84.13.8.236:61289  ->   
4  2011-08-18  10:39:37.186    0.0  TCP    147.32.3.51:4397  ->   

     Dst IP Addr:Port Flags  Tos  Packets  Bytes  Flows       Label  
0  147.32.84.118:6881  S_RA    0        2    120      1  Background  
1  147.32.84.118:6881  S_RA    0        2    120      1  Background  
2  147.32.84.118:6881  S_RA    0        2    116      1  Background  
3  147.32.84.118:6881  S_RA    0        2    120      1  Background  
4  147.32.87.22:10010  S_RA    0        2    116      1  Background  


In [4]:
#Seperate address and port
addr = []
port = []
for i in df['Dst IP Addr:Port']:
    try:
        s = i.split(':', 1)
        addr.append(s[0])
        port.append(s[1])
    except:
        port.append('Null')

df['Des_address'] = addr
df['Port'] = port
print(df.head())

         Date          Time  Durat Prot    Src IP Addr:Port Dir  \
0  2011-08-18  10:39:35.289    0.0  TCP  85.3.219.122:55347  ->   
1  2011-08-18  10:39:36.067    0.0  TCP  85.3.219.122:55347  ->   
2  2011-08-18  10:39:36.754    0.0  TCP  85.3.219.122:55347  ->   
3  2011-08-18  10:39:37.079    0.0  TCP   84.13.8.236:61289  ->   
4  2011-08-18  10:39:37.186    0.0  TCP    147.32.3.51:4397  ->   

     Dst IP Addr:Port Flags  Tos  Packets  Bytes  Flows       Label  \
0  147.32.84.118:6881  S_RA    0        2    120      1  Background   
1  147.32.84.118:6881  S_RA    0        2    120      1  Background   
2  147.32.84.118:6881  S_RA    0        2    116      1  Background   
3  147.32.84.118:6881  S_RA    0        2    120      1  Background   
4  147.32.87.22:10010  S_RA    0        2    116      1  Background   

     Des_address   Port  
0  147.32.84.118   6881  
1  147.32.84.118   6881  
2  147.32.84.118   6881  
3  147.32.84.118   6881  
4   147.32.87.22  10010  


In [5]:
label = np.unique(df['Label'].values)
print('Label:')
print(len(label))
print(label)

print('Protocol:')
protocol = np.unique(df['Prot'].values)
print(len(protocol))
print(protocol)

Label:
3
['Background' 'Botnet' 'LEGITIMATE']
Protocol:
10
['ARP' 'ICMP' 'IGMP' 'IPV6' 'IPX/SPX' 'PIM' 'RARP' 'RTCP' 'TCP' 'UDP']


In [6]:
min_pac = min(df['Packets'])
max_pac = max(df['Packets'])
med_pac = np.median(df['Packets'].values)
print(min_pac,med_pac,max_pac)
s = df['Packets'].sort_values()

(1, 6.0, 78781)


In [7]:
# df_long = pd.read_table('capture20110818.biargus.long.labeled',delim_whitespace=True) 
# print(type(df_long))
# print(df_long.head())

#### Top 10 most frequent address before sampling

In [8]:
des_ip = np.unique(df['Des_address'])
des_ip_sort = df['Des_address'].value_counts()
print(len(des_ip))
print(des_ip_sort[:10])
ori_ip = des_ip_sort[:10].index.values

15258
147.32.96.69      279763
147.32.80.9        29441
147.32.84.229      28445
147.32.86.116      11692
147.32.84.59        3324
147.32.80.13         901
147.32.84.118        625
147.32.84.2          619
76.13.114.90         571
209.85.149.132       571
Name: Des_address, dtype: int64


### Sampling

In [9]:
import random
from random import randint

# select k elements 
select = [100000,10000,1000,500,100,60]

#remove duplicate
sub_df = df.iloc[:,2:12]
sub_df = sub_df.drop_duplicates()
print(len(sub_df))
inx_sub = sub_df.index.values
df = df.loc[inx_sub,:]
LEN_DF = len(df)

rand = np.random.random((LEN_DF,1))

df['random'] = rand

ip_list = list(ori_ip)
for k in select:
    print(k)
    # first k items
    samp_df = df[:k]
    # select random sample with a probability smaller than k/i
    index = df.index.values
    proba = k/index
    df['proba'] = proba
    samples = df.loc[df['proba']<=df['random'],:]

    all_sample = pd.concat([samp_df,samples])
    print(len(all_sample))
    
    sample = all_sample.sample(n=k)
    
    samp_des_ip_sort = sample['Des_address'].value_counts()
    top = samp_des_ip_sort[:10]
    ip = top.index.values
    ip_list.extend(ip)
    
    
factors, uniques = pd.factorize(ip_list)
print((factors,uniques))

340865
100000


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




347485
10000
341129
1000
340916
500
340874
100
340869
60
340869
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  2,  1,  3,  4,  5,  7,
        8,  9, 10,  0,  2,  1,  3,  4,  8,  5,  9,  7, 10,  0,  1,  2,  3,
        4,  7,  8, 11, 12, 13,  0,  1,  2,  3,  9, 14,  7, 15, 16, 17,  0,
        1,  2,  3, 18, 19, 20, 14, 21, 22,  0,  2,  3,  5, 23, 24, 25,  7,
       10, 26]), array(['147.32.96.69', '147.32.80.9', '147.32.84.229', '147.32.86.116',
       '147.32.84.59', '147.32.80.13', '147.32.84.118', '147.32.84.2',
       '76.13.114.90', '209.85.149.132', '147.32.84.111', '147.32.85.118',
       '74.125.232.196', '147.32.87.238', '74.125.232.207', '77.75.76.5',
       '91.203.99.45', '94.50.169.74', '147.229.147.170', '88.182.107.11',
       '187.15.99.84', '74.125.232.213', '77.78.99.23', '217.77.163.141',
       '74.125.232.201', '79.111.124.254', '147.32.86.194'], dtype=object))


In [10]:
# IP rank comparison
samp_ip = samp_des_ip_sort[:10].index.values

comp = pd.DataFrame({'Origin':factors[:10]})

for i,k in enumerate(select):
    comp[str(k)] = factors[(i+1)*10:(i+2)*10]

print(comp)

   Origin  100000  10000  1000  500  100  60
0       0       0      0     0    0    0   0
1       1       2      2     1    1    1   2
2       2       1      1     2    2    2   3
3       3       3      3     3    3    3   5
4       4       4      4     4    9   18  23
5       5       5      8     7   14   19  24
6       6       7      5     8    7   20  25
7       7       8      9    11   15   14   7
8       8       9      7    12   16   21  10
9       9      10     10    13   17   22  26


# Count min sketch

In [69]:
import count_min_sketch #see the inplementation in count_min_sketch.py file
from time import time
from random import randint


# get the value and freq as input
ip_list = des_ip_sort.index.values
freq_list = np.array(des_ip_sort).tolist()
#get top 10 ips
true_10 = np.array(des_ip_sort[:10].keys())

In [70]:
def test_cms(ip_list,freq_list,w,d):
    mytime = 0
    mine = count_min_sketch.CountMinSketch(w, d)
    for ip,freq in zip(ip_list,freq_list):
        t = time()
        mine.update(ip, freq)
        mytime += time() - t
    loss= 0
    pre_freq={}
    for ip, freq in zip(ip_list,freq_list):
        
        loss += (mine.query(ip) - freq)**2
        pre_freq[ip]=int(mine[ip])

    print 'loss:', loss**0.5 / len(ip_list)
    print 'time', mytime
    ips=[]
    topNum = 10
    nlargestList = heapq.nlargest(topNum, pre_freq.values())        #get top 10  
    for value in nlargestList:                                #print
        for key in pre_freq:  
            if pre_freq[key] == value:  
                ips.append(key)
                print key, pre_freq[key]
    return np.array(ips)
    

In [108]:
ips = test_cms(ip_list,freq_list,1000,10)
acc = sum(ips==true_10)/10.
print 'the top10 accuracy is', acc

loss: 0.128709966236
time 0.121211528778
147.32.96.69 279770
147.32.80.9 29449
147.32.84.229 28466
147.32.86.116 11708
147.32.84.59 3339
147.32.80.13 914
147.32.84.2 633
147.32.84.118 631
76.13.114.90 591
209.85.149.132 579
the top10 accuracy is 0.8


In [107]:
ips = test_cms(ip_list,freq_list,800,10)
acc = sum(ips==true_10)/10.
print 'the top10 accuracy is', acc

loss: 0.182796159308
time 0.103269577026
147.32.96.69 279791
147.32.80.9 29462
147.32.84.229 28475
147.32.86.116 11731
147.32.84.59 3350
147.32.80.13 920
147.32.84.118 657
147.32.84.2 642
209.85.149.132 599
76.13.114.90 593
the top10 accuracy is 0.8
