In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import math
import gensim
from gensim.models import Word2Vec 
import time

In [2]:
# Destination ports that keep a category of their own (encoded as the negated port number).
_COMMON_PORTS = frozenset([20, 21, 22, 23, 25, 53, 80, 110, 143, 179, 389, 443, 636, 990, 1433, 8080])

# Countries that keep a category of their own; every other value is bucketed as 'other'.
_COMMON_COUNTRIES = frozenset(['US', 'China', 'Russia', 'Netherlands', 'Bulgaria', 'Ukraine',
                               'United Kingdom', 'Japan', 'Canada', 'Iceland', 'France', 'Seychelles'])

# Characters counted in the `history` column; each one becomes an output column.
_HISTORY_FLAGS = 'shadfrctwiq^SHADFRCTWIQ'

# Columns that are coerced to numeric and replaced by log(1.1 + value).
_LOG_COLUMNS = ('duration', 'src_bytes', 'dest_bytes', 'src_pkts', 'dest_pkts')


def _port_category(port):
    """Map a single destination port to its category code.

    Common ports become ``-port``; other ports become -1 (0-1023),
    -2 (1024-49151) or -3 (above 49151). Values outside all ranges
    (e.g. negative numbers) are returned unchanged.
    """
    if port in _COMMON_PORTS:
        return -port
    number = int(port)
    if -1 < number < 1024:
        return -1
    # Upper bound is inclusive: the original `< 49151` / `> 49151` pair left
    # port 49151 uncategorised, which produced a stray one-hot column for it.
    if 1024 <= number <= 49151:
        return -2
    if number > 49151:
        return -3
    return port


def transfer(df, mode='train'):
    """Turn a raw connection-log frame into a model-ready feature frame.

    Steps, in order:
      * bucket ``dest_port`` (see ``_port_category``) and one-hot encode it;
      * collapse ``location`` to a fixed country list plus 'other' and one-hot encode it;
      * add one count column per character of ``_HISTORY_FLAGS`` found in ``history``;
      * one-hot encode ``conn_state``;
      * replace every cell equal to '-' with -1.0;
      * encode the hour derived from ``ts`` as sin/cos columns ``xhr`` / ``yhr``;
      * apply log(1.1 + x) to the columns in ``_LOG_COLUMNS``;
      * drop ``ts``, ``history`` and ``src_ip``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns dest_port, location, history, conn_state, ts,
        duration, src_bytes, dest_bytes, src_pkts, dest_pkts and src_ip.
        The frame passed in is not modified.
    mode : str, optional
        Currently unused; accepted so existing callers that pass
        'train' / 'test' keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features.

    Notes
    -----
    NOTE(review): every ``get_dummies`` call uses ``drop_first=True`` on whatever
    categories occur in *this* frame, so two files processed separately can end
    up with different column sets. Align columns downstream if that matters.
    """
    print('run ports to categories')
    # step 2: bucket destination ports. `assign` builds a new frame, so the
    # caller's DataFrame keeps its original dest_port values.
    port_codes = [_port_category(port) for port in df['dest_port']]
    df = pd.get_dummies(df.assign(dest_port=port_codes), columns=['dest_port'], drop_first=True)

    # step 3: location transform (anything outside the known list -> 'other')
    df['location'] = [name if name in _COMMON_COUNTRIES else 'other'
                      for name in map(str, df['location'])]
    df = pd.get_dummies(df, columns=['location'], drop_first=True)

    # step 4: unigram counts over the history string.
    # Missing histories are treated as empty so they count as zero instead of
    # raising AttributeError on a float NaN.
    histories = ['' if pd.isna(entry) else str(entry) for entry in df['history']]
    for flag in _HISTORY_FLAGS:
        df[flag] = [entry.count(flag) for entry in histories]

    # step 5: get dummy conn state
    df = pd.get_dummies(df, columns=['conn_state'], drop_first=True)

    # step 6: replace the '-' missing-value marker by -1.0
    # (so the log transform below yields log(0.1) for missing numeric fields)
    df = df.replace('-', -1.0)

    print('run ts into hour')
    # step 7: convert ts into a cyclic hour-of-day encoding.
    # NOTE(review): fromtimestamp() uses the machine's local timezone and the
    # original code subtracts timedelta(hours=-5), i.e. it ADDS five hours.
    # That behaviour is preserved here; confirm the intended timezone.
    shift = timedelta(hours=5)
    hours = [(datetime.fromtimestamp(stamp) + shift).hour for stamp in df['ts']]
    # Only 24 distinct hours exist, so compute sin/cos once per hour.
    sin_by_hour = [np.sin(2 * math.pi * hour / 24) for hour in range(24)]
    cos_by_hour = [np.cos(2 * math.pi * hour / 24) for hour in range(24)]
    df['xhr'] = [sin_by_hour[hour] for hour in hours]
    df['yhr'] = [cos_by_hour[hour] for hour in hours]

    # step 8: log transformation (1.1 offset keeps the -1.0 marker inside log's domain)
    for column in _LOG_COLUMNS:
        df[column] = np.log(1.1 + pd.to_numeric(df[column]))

    return df.drop(['ts', 'history', 'src_ip'], axis=1)

In [3]:
# Disabled one-off run that preprocessed the training CSV with transfer(df, 'train').
# The code is wrapped in a string literal rather than commented out, so executing
# this cell only evaluates (and echoes) the string; nothing inside it runs.
'''
start_time = time.time()
df = pd.read_csv('/scratch/by8jj/sample files/Bal_train/all_train.csv')
df = transfer(df, 'train')
df.to_csv('/scratch/by8jj/stratified samples/test of test/train/all_train.csv', index = False)
print("--- %s seconds ---" % (time.time() - start_time))
'''

'\nstart_time = time.time()\ndf = pd.read_csv(\'/scratch/by8jj/sample files/Bal_train/all_train.csv\')\ndf = transfer(df, \'train\')\ndf.to_csv(\'/scratch/by8jj/stratified samples/test of test/train/all_train.csv\', index = False)\nprint("--- %s seconds ---" % (time.time() - start_time))\n'

In [4]:
# Path prefix of the test-set CSV shards; the shard number and '.csv' are
# appended to it when each file is read.
inputPath = '/scratch/by8jj/sample files/Bal_test/test_'

In [5]:
# Run the feature pipeline over test shards 12 through 22, writing each result
# to the output directory and printing the wall-clock time taken per shard.
for i in range(12, 23):
    start_time = time.time()

    shard_file = str(i) + '.csv'
    source_csv = inputPath + shard_file
    target_csv = '/scratch/by8jj/stratified samples/test of test/test/test_' + shard_file

    df = pd.read_csv(source_csv)
    df = transfer(df, 'test')
    df.to_csv(target_csv, index = False)

    elapsed = time.time() - start_time
    print("--- %s seconds ---" % elapsed)

run ports to categories
run ts into hour
--- 121.11493420600891 seconds ---
run ports to categories
run ts into hour
--- 125.71537661552429 seconds ---
run ports to categories
run ts into hour
--- 202.89757418632507 seconds ---
run ports to categories
run ts into hour
--- 194.76542043685913 seconds ---
run ports to categories
run ts into hour
--- 183.0406801700592 seconds ---
run ports to categories
run ts into hour
--- 210.969464302063 seconds ---
run ports to categories
run ts into hour
--- 204.9678819179535 seconds ---
run ports to categories
run ts into hour
--- 118.51590728759766 seconds ---
run ports to categories
run ts into hour
--- 115.49377059936523 seconds ---
run ports to categories
run ts into hour
--- 149.64178323745728 seconds ---
run ports to categories
run ts into hour
--- 200.06779861450195 seconds ---
