## project : network traffic data type classification / skku 기계학습 수업
info : 주어진 network traffic dataset을 바탕으로 데이터를 전처리,   
모델 학습하여 4가지의 타입 중 올바른 타입으로 분류하는 프로젝트입니다.  
name : 조병웅  
model : 해당 프로젝트에는 logistic 회귀 scratch 코드 모델을 사용.  


## 캐글 데이터 로딩 및 환경구축
캐글 주소 -> https://www.kaggle.com/competitions/skku-2023-1-machine-learning-first-project

In [1]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jobyeoungung","key":"dc2e4a297fcb7b59ff7102ff99a56b63"}'}

In [32]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions list

ref                                                                                deadline             category             reward  teamCount  userHasEntered  
---------------------------------------------------------------------------------  -------------------  ---------------  ----------  ---------  --------------  
https://www.kaggle.com/competitions/vesuvius-challenge-ink-detection               2023-06-14 23:59:00  Featured         $1,000,000        662           False  
https://www.kaggle.com/competitions/asl-signs                                      2023-05-01 23:59:00  Research           $100,000       1160           False  
https://www.kaggle.com/competitions/tlvmc-parkinsons-freezing-gait-prediction      2023-06-08 23:59:00  Research           $100,000        757           False  
https://www.kaggle.com/competitions/amp-parkinsons-disease-progression-prediction  2023-05-18 23:59:00  Featured            $60,000       1484           False  
https://www.kaggle.com/competition

In [33]:
!kaggle competitions download -c skku-2023-1-machine-learning-first-project

skku-2023-1-machine-learning-first-project.zip: Skipping, found more recently modified local copy (use --force to force download)


In [34]:
!unzip skku-2023-1-machine-learning-first-project.zip

Archive:  skku-2023-1-machine-learning-first-project.zip
replace example_video.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## 데이터 전처리
- train 폴더 내 데이터를 가지고 전처리
- 데이터는 interaction(0), bulk(1), video(2), web(3) 총 4가지.
- time 칼럼의 형태 변경
- 부족한 개수의 데이터는 삭제
- 같은 11 크기로 windowing

In [35]:
import os
import pandas as pd
import numpy as np
import time

#데이터 개수 새는 함수
def count_ip_dst(data):
    temp = data.groupby(['time','ip_dst']).agg(count = ('ip_dst', 'count'))
    ip_dst_dict = {}
    i = 0
    for (k1, k2), group in data.groupby(['time','ip_dst']):
        if k2 in ip_dst_dict:
            ip_dst_dict[k2] += temp.iloc[i]
        else :
            ip_dst_dict[k2] = temp.iloc[i]
        i += 1
    return ip_dst_dict

#데이터 개수가 부족하면 삭제하는 함수
def discard_ip_dst(data, ip_dst_dict):
    for key in ip_dst_dict:
        if int(ip_dst_dict[key]) <= 40:
            idx = data[data['ip_dst'] == key].index
            data.drop(idx, inplace = True)

#데이터의 길이를 측정 및 데이터를 같은 크기로 맞추기 위해 windowing하는 함수            
def counting_data_len(data):
    temp = data.groupby(['ip_dst', 'time']).agg(count = ('data_len', 'count'))['count']
    temp
    to_csv_df = {"traffic(t-10)":[],"traffic(t-9)":[],"traffic(t-8)":[],"traffic(t-7)":[],"traffic(t-6)":[],"traffic(t-5)":[],"traffic(t-4)":[],"traffic(t-3)":[],"traffic(t-2)":[],"traffic(t-1)":[],"traffic(t)":[]}
    to_csv_df = pd.DataFrame(to_csv_df)
    for i in range(len(temp)):
        if (i+11) == (len(temp)-1):    
            break
        to_csv_df.loc[i] = temp[i:i+11].values
    return to_csv_df

#main 코드부분
df = pd.DataFrame()
files = os.listdir('./train/')
for file in files:    
    path = "./train/"+file
    data= pd.read_csv(path, encoding = 'utf-8')
    for i in data.index :
        t = time.gmtime(data.loc[i, 'time'])
        data.loc[i, 'time'] = time.strftime('%Y/%m/%d %H:%M:%S', t)

    ip_dst_dict = count_ip_dst(data)
    discard_ip_dst(data, ip_dst_dict)
        
    to_csv_df = counting_data_len(data)
    if file[0] == 'i':
        to_csv_df['inter'] = 1
        to_csv_df['bulk'] = 0
        to_csv_df['video'] = 0
        to_csv_df['web'] = 0
    elif file[0] == 'b':
        to_csv_df['inter'] = 0
        to_csv_df['bulk'] = 1
        to_csv_df['video'] = 0
        to_csv_df['web'] = 0
    elif file[0] == 'v':
        to_csv_df['inter'] = 0
        to_csv_df['bulk'] = 0
        to_csv_df['video'] = 1
        to_csv_df['web'] = 0
    else :
        to_csv_df['inter'] = 0
        to_csv_df['bulk'] = 0
        to_csv_df['video'] = 0
        to_csv_df['web'] = 1    
    df = pd.concat([df, to_csv_df])
df

Unnamed: 0,traffic(t-10),traffic(t-9),traffic(t-8),traffic(t-7),traffic(t-6),traffic(t-5),traffic(t-4),traffic(t-3),traffic(t-2),traffic(t-1),traffic(t),inter,bulk,video,web
0,6,14,6,6,6,2,1,6,13,7,6,1,0,0,0
1,14,6,6,6,2,1,6,13,7,6,6,1,0,0,0
2,6,6,6,2,1,6,13,7,6,6,2,1,0,0,0
3,6,6,2,1,6,13,7,6,6,2,1,1,0,0,0
4,6,2,1,6,13,7,6,6,2,1,6,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,4,2,1,3,1,13,16,1,7,6,4,0,0,0,1
44,2,1,3,1,13,16,1,7,6,4,5,0,0,0,1
45,1,3,1,13,16,1,7,6,4,5,3,0,0,0,1
46,3,1,13,16,1,7,6,4,5,3,2,0,0,0,1


## 정규화 및 데이터셋 분리

In [151]:
from  sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
df_input = df[['traffic(t-10)', 'traffic(t-9)','traffic(t-8)','traffic(t-7)','traffic(t-6)','traffic(t-5)','traffic(t-4)','traffic(t-3)','traffic(t-2)','traffic(t-1)','traffic(t)']]
df_target = df[['inter', 'bulk', 'video', 'web']]
train_x, test_x, train_y, test_y = train_test_split(df_input, df_target, random_state = 102)

ss=StandardScaler()
ss.fit(train_x)
train_x=ss.transform(train_x)
test_x=ss.transform(test_x) 

## scratch 모델 구현

In [149]:
class LogisticRegression:
    def __init__(self, max_iter=500, penalty="l2", initialize = "one", random_seed = 1213):
        self.author = "조병웅"
        self.id = 2019312570
        
        self.max_iter = max_iter
        self.penalty = penalty
        self.initialize = initialize
        self.random_seed = random_seed
        self.lr = 0.0005
        self.lamb = 0.01
        np.random.seed(self.random_seed)

        if self.penalty not in ["l1", "l2"]:
            raise ValueError("Penalty must be l1 or l2")

        if self.initialize not in ["one", "LeCun", "random"]:
            raise ValueError("Only [LeCun, One, random] Initialization supported")
    
    def softmax(self, x):
        m = np.max(x)
        x = np.exp(x-m)
        sum_x = np.sum(x)
        result = x / sum_x #softmax 수식을 코드로 구현.
        return result # 문제2: softmax 함수를 구현하세요.    


    def fwpass(self, x):

        z1 = np.dot(x, self.w1) + self.b1
        z2 = np.dot(x, self.w2) + self.b2
        z3 = np.dot(x, self.w3) + self.b3
        z4 = np.dot(x, self.w4) + self.b4

        z1 = z1.reshape(-1,1)
        z2 = z2.reshape(-1,1)
        z3 = z3.reshape(-1,1)
        z4 = z4.reshape(-1,1)

        z = np.hstack((z1, z2, z3, z4))
        
        z = self.softmax(z) #시그모이드 함수로 0~1사이의 값으로 바꾼다.                
        return z


    def bwpass(self, x, err): 
        if self.penalty == "l1":
            sum = 0  
            for i in range(len(self.w1)):
                sum += abs(self.w1[i]) 
            w1_grad = (2/len(x))*(np.dot(np.array(err).T[0],x)) + self.lamb*sum 
            sum = 0 
            for i in range(len(self.w2)):
                sum += abs(self.w2[i])
            w2_grad = (2/len(x))*(np.dot(np.array(err).T[1],x)) + self.lamb*sum       
            sum = 0 
            for i in range(len(self.w3)):
                sum += abs(self.w3[i]) 
            w3_grad = (2/len(x))*(np.dot(np.array(err).T[2],x)) + self.lamb*sum 
            sum = 0
            for i in range(len(self.w4)):
                sum += abs(self.w4[i]) 
            w4_grad = (2/len(x))*(np.dot(np.array(err).T[3],x)) + self.lamb*sum            
        
        elif self.penalty == "l2":
            sum = 0 
            for i in range(len(self.w1)):
                sum += pow(self.w1[i],2) 
            w1_grad = (2/len(x))*(np.dot(np.array(err).T[0],x)) 
            sum = 0 
            for i in range(len(self.w2)):
                sum += pow(self.w2[i],2)  
            w2_grad = (2/len(x))*(np.dot(np.array(err).T[1],x)) 
            sum = 0 
            for i in range(len(self.w3)):
                sum += pow(self.w3[i],2) 
            w3_grad = (2/len(x))*(np.dot(np.array(err).T[2],x))
            sum = 0 
            for i in range(len(self.w4)):
                sum += pow(self.w4[i],2)  
            w4_grad = (2/len(x))*(np.dot(np.array(err).T[3],x)) 

        b1_grad = (2/len(x))*np.sum(np.array(err).T[0]) 
        b2_grad = (2/len(x))*np.sum(np.array(err).T[1]) 
        b3_grad = (2/len(x))*np.sum(np.array(err).T[2]) 
        b4_grad = (2/len(x))*np.sum(np.array(err).T[3]) 



        return w1_grad, w2_grad, w3_grad, w4_grad, b1_grad, b2_grad, b3_grad, b4_grad

    
    def initialize_w(self, x):
        w_library = {
            "one":np.ones(x.shape[1]),
            "LeCun":np.random.uniform(low = -np.sqrt(1.0 / x.shape[1]), high = np.sqrt(1.0 / x.shape[1]), size = x.shape[1]), 
            #random 라이브러리를 사용하여 가중치 초기화 수식을 작성한다. 제곱근은 sqrt메서드로 구현한다.  
            "random":np.random.randint(0,1, size = x.shape[1]) 
            # randint메서드를 사용해 구현. 
        }

        return w_library[self.initialize]


    def fit(self, x, y): # Y는 총 갯수, 4(정답 : 1000 0100 0010 0001)

        self.w1 = self.initialize_w(x) 
        self.w2 = self.initialize_w(x) 
        self.w3 = self.initialize_w(x) 
        self.w4 = self.initialize_w(x) #4개 필요

        self.b1 = 0 #4개 필요
        self.b2 = 0 
        self.b3 = 0 
        self.b4 = 0 

        for _ in range(self.max_iter):
            z = self.fwpass(x)
            err = -(y - z) # 712크기의 리스트, 4의 레이블
            w1_grad, w2_grad, w3_grad, w4_grad, b1_grad, b2_grad, b3_grad, b4_grad = self.bwpass(x, err)

            self.w1 = self.w1 - self.lr*w1_grad #앞에서 1/n으로 나누었기 때문에, 평균을 따로 활용하지 않았다.
            self.w2 = self.w2 - self.lr*w2_grad 
            self.w3 = self.w3 - self.lr*w3_grad 
            self.w4 = self.w4 - self.lr*w4_grad 

            self.b1 = self.b1 - self.lr*self.b1 #
            self.b2 = self.b2 - self.lr*self.b2 
            self.b3 = self.b3 - self.lr*self.b3 
            self.b4 = self.b4 - self.lr*self.b4 

        return self.w1, self.w2, self.w3, self.w4, self.b1, self.b2, self.b3, self.b4


    def predict(self, x):
        z = self.fwpass(x)
        for i in range(len(z)):
            z_max = np.argmax(z[i])
            z[i] = np.zeros(4)
            z[i][z_max] = 1
        print(z)
        return z

    def score(self, x, y):
        return np.mean(self.predict(x) == y)
    
    def make_sub(self, x):
        z = self.predict(x)
        print(1, z[1500:1510])
        temp = np.ones(z.shape[0])
        for i in range(z.shape[0]):
            if z[i][0] == 1.0:
                temp[i] = 0
            elif z[i][1] == 1.0:
                temp[i] = 1
            elif z[i][2] == 1.0:
                temp[i] = 2
            elif z[i][3] == 1.0:
                temp[i] = 3
        temp = temp.astype(np.int64)
        sub_df = pd.DataFrame({'type' :temp})
        sub_df.index = range(1,z.shape[0]+1)
        sub_df.to_csv('sample_submission.csv', index_label=['id'])

## 결과

In [126]:
test_data= pd.read_csv('./test.csv', encoding = 'utf-8')
test_data=ss.transform(test_data)

In [153]:
import time

lr = LogisticRegression(max_iter=1000, penalty="l2", initialize = "LeCun")
start = time.time()
coef1, coef2, coef3, coef4, interrupt1, interrupt2,  interrupt3,  interrupt4 = lr.fit(train_x, train_y)
print(f'time: {time.time() - start}')
print(f"Accuray: {lr.score(test_x, test_y)}")
lr.make_sub(test_data)

time: 3.486359119415283
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
Accuray: inter    0.797435
bulk     0.907878
video    0.890866
web      0.848469
dtype: float64
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
1 [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


## 캐글로 전송

In [154]:
!kaggle competitions submit -c skku-2023-1-machine-learning-first-project -f sample_submission.csv -m "Message"

100% 20.4k/20.4k [00:00<00:00, 101kB/s]
Successfully submitted to [SKKU 2023-1 Machine Learning] First Project