# CDA Sampling task
### CTU-Malware-Capture-Botnet-52 or Scenario 11 in the CTU-13 dataset

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_table('capture20110818-2.pcap.netflow.labeled',delim_whitespace=True) 
print(type(df))

<class 'pandas.core.frame.DataFrame'>


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df = df.iloc[:,:-5]
df.columns = ['Date','Time','Durat','Prot','Src IP Addr:Port','Dir','Dst IP Addr:Port','Flags','Tos','Packets','Bytes','Flows','Label']
LEN_DF = len(df)
print(LEN_DF)
print(df.head())

408835
         Date          Time  Durat Prot    Src IP Addr:Port Dir  \
0  2011-08-18  10:39:35.289    0.0  TCP  85.3.219.122:55347  ->   
1  2011-08-18  10:39:36.067    0.0  TCP  85.3.219.122:55347  ->   
2  2011-08-18  10:39:36.754    0.0  TCP  85.3.219.122:55347  ->   
3  2011-08-18  10:39:37.079    0.0  TCP   84.13.8.236:61289  ->   
4  2011-08-18  10:39:37.186    0.0  TCP    147.32.3.51:4397  ->   

     Dst IP Addr:Port Flags  Tos  Packets  Bytes  Flows       Label  
0  147.32.84.118:6881  S_RA    0        2    120      1  Background  
1  147.32.84.118:6881  S_RA    0        2    120      1  Background  
2  147.32.84.118:6881  S_RA    0        2    116      1  Background  
3  147.32.84.118:6881  S_RA    0        2    120      1  Background  
4  147.32.87.22:10010  S_RA    0        2    116      1  Background  


In [5]:
#Seperate address and port
addr = []
port = []
for i in df['Dst IP Addr:Port']:
    try:
        s = i.split(':', 1)
        addr.append(s[0])
        port.append(s[1])
    except:
        port.append('Null')

df['Des_address'] = addr
df['Port'] = port
print(df.head())

         Date          Time  Durat Prot    Src IP Addr:Port Dir  \
0  2011-08-18  10:39:35.289    0.0  TCP  85.3.219.122:55347  ->   
1  2011-08-18  10:39:36.067    0.0  TCP  85.3.219.122:55347  ->   
2  2011-08-18  10:39:36.754    0.0  TCP  85.3.219.122:55347  ->   
3  2011-08-18  10:39:37.079    0.0  TCP   84.13.8.236:61289  ->   
4  2011-08-18  10:39:37.186    0.0  TCP    147.32.3.51:4397  ->   

     Dst IP Addr:Port Flags  Tos  Packets  Bytes  Flows       Label  \
0  147.32.84.118:6881  S_RA    0        2    120      1  Background   
1  147.32.84.118:6881  S_RA    0        2    120      1  Background   
2  147.32.84.118:6881  S_RA    0        2    116      1  Background   
3  147.32.84.118:6881  S_RA    0        2    120      1  Background   
4  147.32.87.22:10010  S_RA    0        2    116      1  Background   

     Des_address   Port  
0  147.32.84.118   6881  
1  147.32.84.118   6881  
2  147.32.84.118   6881  
3  147.32.84.118   6881  
4   147.32.87.22  10010  


In [6]:
label = np.unique(df['Label'].values)
print('Label:')
print(len(label))
print(label)

print('Protocol:')
protocol = np.unique(df['Prot'].values)
print(len(protocol))
print(protocol)

Label:
3
['Background' 'Botnet' 'LEGITIMATE']
Protocol:
10
['ARP' 'ICMP' 'IGMP' 'IPV6' 'IPX/SPX' 'PIM' 'RARP' 'RTCP' 'TCP' 'UDP']


In [7]:
min_pac = min(df['Packets'])
max_pac = max(df['Packets'])
med_pac = np.median(df['Packets'].values)
print(min_pac,med_pac,max_pac)
s = df['Packets'].sort_values()

(1, 6.0, 78781)


In [8]:
# df_long = pd.read_table('capture20110818.biargus.long.labeled',delim_whitespace=True) 
# print(type(df_long))
# print(df_long.head())

#### Top 10 most frequent address before sampling

In [9]:
des_ip = np.unique(df['Des_address'])
des_ip_sort = df['Des_address'].value_counts()
print(len(des_ip))
print(des_ip_sort[:10])
ori_ip = des_ip_sort[:10].index.values

15258
147.32.96.69      279763
147.32.80.9        29441
147.32.84.229      28445
147.32.86.116      11692
147.32.84.59        3324
147.32.80.13         901
147.32.84.118        625
147.32.84.2          619
76.13.114.90         571
209.85.149.132       571
Name: Des_address, dtype: int64


### Sampling

In [10]:
import random
from random import randint

# select k elements 
select = [100000,10000,1000,500,100,60]

#remove duplicate
sub_df = df.iloc[:,2:12]
sub_df = sub_df.drop_duplicates()
print(len(sub_df))
inx_sub = sub_df.index.values
df = df.loc[inx_sub,:]
LEN_DF = len(df)

rand = np.random.random((LEN_DF,1))

df['random'] = rand

ip_list = list(ori_ip)
for k in select:
    print(k)
    # first k items
    samp_df = df[:k]
    # select random sample with a probability smaller than k/i
    index = df.index.values
    proba = k/index
    df['proba'] = proba
    samples = df.loc[df['proba']<=df['random'],:]

    all_sample = pd.concat([samp_df,samples])
    print(len(all_sample))
    
    sample = all_sample.sample(n=k)
    
    samp_des_ip_sort = sample['Des_address'].value_counts()
    top = samp_des_ip_sort[:10]
    ip = top.index.values
    ip_list.extend(ip)
    
    
factors, uniques = pd.factorize(ip_list)
print((factors,uniques))

340865
100000


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




347485
10000
341129
1000
340916
500
340874
100
340869
60
340869
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  2,  1,  3,  4,  5,  8,
        7,  9, 10,  0,  1,  2,  3,  4,  5,  7,  9,  8, 11,  0,  2,  1,  3,
        4, 12, 13,  5, 14, 15,  0,  1,  2,  3,  4, 16, 17, 10, 18, 19,  0,
        1,  2,  3,  4, 20, 21, 22, 23, 24,  0,  2,  1, 25, 26, 27, 28, 29,
       30, 31]), array(['147.32.96.69', '147.32.80.9', '147.32.84.229', '147.32.86.116',
       '147.32.84.59', '147.32.80.13', '147.32.84.118', '147.32.84.2',
       '76.13.114.90', '209.85.149.132', '147.32.84.111', '188.138.84.239',
       '147.32.85.26', '74.125.232.199', '147.32.87.13', '79.47.210.175',
       '74.125.232.195', '147.32.85.118', '208.94.110.144', '188.2.49.165',
       '74.125.79.108', '87.205.87.61', '147.32.80.105', '85.13.80.69',
       '74.125.232.196', '212.11.63.254', '80.239.148.114',
       '118.166.69.19', '87.6.157.242', '74.125.232.223', '147.32.87.21',
       '195.113.232.81'], dtype=object))


In [11]:
# IP rank comparison
samp_ip = samp_des_ip_sort[:10].index.values

comp = pd.DataFrame({'Origin':factors[:10]})

for i,k in enumerate(select):
    comp[str(k)] = factors[(i+1)*10:(i+2)*10]

print(comp)

   Origin  100000  10000  1000  500  100  60
0       0       0      0     0    0    0   0
1       1       2      1     2    1    1   2
2       2       1      2     1    2    2   1
3       3       3      3     3    3    3  25
4       4       4      4     4    4    4  26
5       5       5      5    12   16   20  27
6       6       8      7    13   17   21  28
7       7       7      9     5   10   22  29
8       8       9      8    14   18   23  30
9       9      10     11    15   19   24  31


# Count min sketch

In [15]:
import count_min_sketch #see the inplementation in count_min_sketch.py file
from time import time
from random import randint
import heapq


# get the value and freq as input
ip_list = des_ip_sort.index.values
freq_list = np.array(des_ip_sort).tolist()
#get top 10 ips
true_10 = np.array(des_ip_sort[:10].keys())

In [16]:
def test_cms(ip_list,freq_list,w,d):
    mytime = 0
    mine = count_min_sketch.CountMinSketch(w, d)
    for ip,freq in zip(ip_list,freq_list):
        t = time()
        mine.update(ip, freq)
        mytime += time() - t
    loss= 0
    pre_freq={}
    for ip, freq in zip(ip_list,freq_list):
        
        loss += (mine.query(ip) - freq)**2
        pre_freq[ip]=int(mine[ip])

    print 'loss:', loss**0.5 / len(ip_list)
    print 'time', mytime
    ips=[]
    topNum = 10
    nlargestList = heapq.nlargest(topNum, pre_freq.values())        #get top 10  
    for value in nlargestList:                                #print
        for key in pre_freq:  
            if pre_freq[key] == value:  
                ips.append(key)
                print key, pre_freq[key]
    return np.array(ips)
    

In [27]:
ips = test_cms(ip_list,freq_list,1000,10)
acc = sum(ips==true_10)/10.
print 'the top10 accuracy is', acc

loss: 0.128148847712
time 0.118900299072
147.32.96.69 279778
147.32.80.9 29457
147.32.84.229 28461
147.32.86.116 11704
147.32.84.59 3340
147.32.80.13 931
147.32.84.118 648
147.32.84.2 632
76.13.114.90 594
209.85.149.132 584
the top10 accuracy is 1.0


In [85]:
ips = test_cms(ip_list,freq_list,700,5)
acc = sum(ips==true_10)/10.
print 'the top10 accuracy is', acc

loss: 0.269457943564
time 0.0748283863068
147.32.96.69 279797
147.32.80.9 29468
147.32.84.229 28463
147.32.86.116 11722
147.32.84.59 3365
147.32.80.13 934
147.32.84.2 649
147.32.84.118 645
76.13.114.90 596
209.85.149.132 585
the top10 accuracy is 0.8


In [23]:
print(des_ip_sort[:10])

147.32.96.69      279763
147.32.80.9        29441
147.32.84.229      28445
147.32.86.116      11692
147.32.84.59        3324
147.32.80.13         901
147.32.84.118        625
147.32.84.2          619
76.13.114.90         571
209.85.149.132       571
Name: Des_address, dtype: int64
