In [92]:
import pandas as pd
import time # time everything
# Location of the CSV read below; change this single constant to point the
# notebook at a different copy of the data.
DATA_PATH = "/Users/chineseSamurai/Documents/capstone_Data/1_6m"

# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
first_6th = pd.read_csv(DATA_PATH, encoding='iso-8859-1')
end = time.perf_counter()
print("op done in %0.2f seconds" % (end - start))

op done in 13.12 seconds


# Approach
## Every step below is designed with performance in mind before it is implemented
* initial look at data
* filter obs: keep only traffic leaving the UVa network
* form unique src-dest IP pairs
* frequency counts, duration, length, etc.
* extract information from 'info'

In [93]:
# First inspection of the loaded frame: report its (rows, columns) shape,
# then preview the first ten observations.
print('dimension of dataset is: ' + repr(first_6th.shape))
first_6th.iloc[:10]

dimension of dataset is: (6000000, 7)


Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,1,0.0,199.111.209.37,15.72.228.56,TCP,60,11663 > 443 [ACK] Seq=1 Ack=1 Win=65142 Len=0
1,2,2e-06,199.111.208.221,31.13.69.202,TCP,66,28673 > 443 [ACK] Seq=1 Ack=1 Win=2054 Len=0 T...
2,3,3e-06,199.111.211.98,173.194.132.170,TCP,66,11962 > 443 [ACK] Seq=1 Ack=1 Win=4053 Len=0 T...
3,4,5e-06,199.111.209.127,17.253.21.203,TCP,66,8753 > 80 [ACK] Seq=1 Ack=1 Win=5066 Len=0 TSv...
4,5,6e-06,199.111.186.170,151.101.32.84,TCP,66,63787 > 443 [ACK] Seq=1 Ack=1 Win=5096 Len=0 T...
5,6,1e-05,199.111.192.43,31.13.69.228,TLSv1.2,454,Application Data
6,7,1.4e-05,199.111.198.21,172.217.2.197,TCP,66,54449 > 443 [ACK] Seq=1 Ack=1 Win=4089 Len=0 T...
7,8,1.5e-05,199.111.167.60,54.242.189.178,TCP,66,5824 > 80 [ACK] Seq=1 Ack=1 Win=4093 Len=0 TSv...
8,9,1.5e-05,216.115.104.250,199.111.204.244,TLSv1.2,12432,Ignored Unknown Record
9,10,1.9e-05,173.194.7.7,199.111.210.75,TLSv1.2,5562,Ignored Unknown Record


In [94]:
# filter data, keep only transmission leaving UVA network
### in order to do this, define a function
def filter_data(df, domain):
    """Keep only the source addresses that belong to a given network prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'Source' holding IPv4 addresses.
    domain : str
        Leading two octets of the network, e.g. '199.111' to select
        199.111.xxx.xxx.

    Returns
    -------
    pandas.Series
        Aligned with ``df.index``: the full source address where it lies
        inside ``domain``, and NaN everywhere else.
    """
    import re  # local import so the function does not rely on another cell

    # Escape the prefix so its dots match literally (an unescaped '.' matches
    # any character), and anchor both ends so the prefix cannot match in the
    # middle of an unrelated address such as '10.199.111.5'.
    pattern = r"^(%s\.\d{1,3}\.\d{1,3})$" % re.escape(domain)
    return df['Source'].str.extract(pattern, expand=False)

In [96]:
# Use 199.111.xxx.xxx as the filter: 'Source' is overwritten with the result
# of filter_data, so sources outside that prefix become missing values.
# perf_counter() is a monotonic clock, which makes it the safer choice for
# measuring elapsed time than the adjustable wall clock time.time().
start = time.perf_counter()
first_6th.loc[:,'Source'] = filter_data(first_6th, '199.111')
end = time.perf_counter()
print("op done in %0.2f seconds" % (end - start))
first_6th.head(10)

op done in 6.85 seconds


Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,1,0.0,199.111.209.37,15.72.228.56,TCP,60,11663 > 443 [ACK] Seq=1 Ack=1 Win=65142 Len=0
1,2,2e-06,199.111.208.221,31.13.69.202,TCP,66,28673 > 443 [ACK] Seq=1 Ack=1 Win=2054 Len=0 T...
2,3,3e-06,199.111.211.98,173.194.132.170,TCP,66,11962 > 443 [ACK] Seq=1 Ack=1 Win=4053 Len=0 T...
3,4,5e-06,199.111.209.127,17.253.21.203,TCP,66,8753 > 80 [ACK] Seq=1 Ack=1 Win=5066 Len=0 TSv...
4,5,6e-06,199.111.186.170,151.101.32.84,TCP,66,63787 > 443 [ACK] Seq=1 Ack=1 Win=5096 Len=0 T...
5,6,1e-05,199.111.192.43,31.13.69.228,TLSv1.2,454,Application Data
6,7,1.4e-05,199.111.198.21,172.217.2.197,TCP,66,54449 > 443 [ACK] Seq=1 Ack=1 Win=4089 Len=0 T...
7,8,1.5e-05,199.111.167.60,54.242.189.178,TCP,66,5824 > 80 [ACK] Seq=1 Ack=1 Win=4093 Len=0 TSv...
8,9,1.5e-05,,199.111.204.244,TLSv1.2,12432,Ignored Unknown Record
9,10,1.9e-05,,199.111.210.75,TLSv1.2,5562,Ignored Unknown Record


In [102]:
# Remove the observations whose Source IP is missing.
# subset=['Source'] limits the check to that column; without it dropna would
# also discard rows that merely lack a value in some other column (e.g. Info).
first_6th.dropna(axis=0, subset=['Source'], inplace = True)
# and check out dimension after dropping:
print(first_6th.shape)
first_6th.head(10)

(3423754, 7)


Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,1,0.0,199.111.209.37,15.72.228.56,TCP,60,11663 > 443 [ACK] Seq=1 Ack=1 Win=65142 Len=0
1,2,2e-06,199.111.208.221,31.13.69.202,TCP,66,28673 > 443 [ACK] Seq=1 Ack=1 Win=2054 Len=0 T...
2,3,3e-06,199.111.211.98,173.194.132.170,TCP,66,11962 > 443 [ACK] Seq=1 Ack=1 Win=4053 Len=0 T...
3,4,5e-06,199.111.209.127,17.253.21.203,TCP,66,8753 > 80 [ACK] Seq=1 Ack=1 Win=5066 Len=0 TSv...
4,5,6e-06,199.111.186.170,151.101.32.84,TCP,66,63787 > 443 [ACK] Seq=1 Ack=1 Win=5096 Len=0 T...
5,6,1e-05,199.111.192.43,31.13.69.228,TLSv1.2,454,Application Data
6,7,1.4e-05,199.111.198.21,172.217.2.197,TCP,66,54449 > 443 [ACK] Seq=1 Ack=1 Win=4089 Len=0 T...
7,8,1.5e-05,199.111.167.60,54.242.189.178,TCP,66,5824 > 80 [ACK] Seq=1 Ack=1 Win=4093 Len=0 TSv...
10,11,2.2e-05,199.111.201.86,74.125.22.128,TLSv1.2,1434,Ignored Unknown Record
13,14,0.000105,199.111.210.239,104.95.31.61,TLSv1.2,326,Application Data


## number of obs went from 6E6 to 3.4E6, a decrease of about 43%
## so a bit over half (about 57%) of all traffic was leaving the UVA network
## next step

In [109]:
# Form a src-dest key per observation by joining the two address columns with
# an underscore; the result is stored in a new 'Pair' column.
# perf_counter() is a monotonic clock, which makes it the safer choice for
# measuring elapsed time than the adjustable wall clock time.time().
start = time.perf_counter()
first_6th['Pair'] = first_6th['Source'] +"_"+ first_6th['Destination']
end = time.perf_counter()
print("op done in %0.2f seconds" % (end - start))
first_6th.head(10)

op done in 1.13 seconds


Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info,Pair
0,1,0.0,199.111.209.37,15.72.228.56,TCP,60,11663 > 443 [ACK] Seq=1 Ack=1 Win=65142 Len=0,199.111.209.37_15.72.228.56
1,2,2e-06,199.111.208.221,31.13.69.202,TCP,66,28673 > 443 [ACK] Seq=1 Ack=1 Win=2054 Len=0 T...,199.111.208.221_31.13.69.202
2,3,3e-06,199.111.211.98,173.194.132.170,TCP,66,11962 > 443 [ACK] Seq=1 Ack=1 Win=4053 Len=0 T...,199.111.211.98_173.194.132.170
3,4,5e-06,199.111.209.127,17.253.21.203,TCP,66,8753 > 80 [ACK] Seq=1 Ack=1 Win=5066 Len=0 TSv...,199.111.209.127_17.253.21.203
4,5,6e-06,199.111.186.170,151.101.32.84,TCP,66,63787 > 443 [ACK] Seq=1 Ack=1 Win=5096 Len=0 T...,199.111.186.170_151.101.32.84
5,6,1e-05,199.111.192.43,31.13.69.228,TLSv1.2,454,Application Data,199.111.192.43_31.13.69.228
6,7,1.4e-05,199.111.198.21,172.217.2.197,TCP,66,54449 > 443 [ACK] Seq=1 Ack=1 Win=4089 Len=0 T...,199.111.198.21_172.217.2.197
7,8,1.5e-05,199.111.167.60,54.242.189.178,TCP,66,5824 > 80 [ACK] Seq=1 Ack=1 Win=4093 Len=0 TSv...,199.111.167.60_54.242.189.178
10,11,2.2e-05,199.111.201.86,74.125.22.128,TLSv1.2,1434,Ignored Unknown Record,199.111.201.86_74.125.22.128
13,14,0.000105,199.111.210.239,104.95.31.61,TLSv1.2,326,Application Data,199.111.210.239_104.95.31.61


# aggregate information

In [110]:
# frequency counts, duration, length, etc.



# extract information from 'info'