In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('dataset_phishing.csv')
data.shape

(11430, 89)

In [3]:
data['status']

0        legitimate
1          phishing
2          phishing
3        legitimate
4        legitimate
            ...    
11425    legitimate
11426      phishing
11427    legitimate
11428    legitimate
11429      phishing
Name: status, Length: 11430, dtype: object

In [4]:
#b_status를 만들고 그거를 원핫 인코딩으로하고 legitimate를 1나머지를 0으로 해라
data['b_status'] = pd.get_dummies(data['status'])['legitimate'].astype('int')

In [5]:
data.drop('status', axis=1, inplace=True)
data[['url','b_status']].head()

Unnamed: 0,url,b_status
0,http://www.crestonwood.com/router.php,1
1,http://shadetreetechnology.com/V4/validation/a...,0
2,https://support-appleld.com.secureupdate.duila...,0
3,http://rgipt.ac.in,1
4,http://www.iracing.com/tracks/gateway-motorspo...,1


In [6]:
from sklearn.model_selection import train_test_split
x = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, stratify=y) #y의 정답오답 비율을 훈련과 테스트 데이터셋에서 비율을 동일하게
print(x_train.shape, x_test.shape)
y_train.head()
y_train.info()


(9144, 87) (2286, 87)
<class 'pandas.core.series.Series'>
Index: 9144 entries, 6132 to 7166
Series name: b_status
Non-Null Count  Dtype
--------------  -----
9144 non-null   int64
dtypes: int64(1)
memory usage: 142.9 KB


In [7]:
scaler = StandardScaler()
scaler.fit(x_train)

x_train_tensor = torch.from_numpy(scaler.transform(x_train)).float()
x_test_tensor = torch.from_numpy(scaler.transform(x_test)).float()

y_train_tensor = torch.from_numpy(y_train.values).float().unsqueeze(1)



In [8]:
y_train_tensor.shape

torch.Size([9144, 1])

In [9]:
y_test_tensor = torch.from_numpy(y_test.values).float().unsqueeze(1)

In [10]:
b_size = 256
class mymodel(nn.Module):
    def __init__(self,i,o):
        super().__init__()
        self.go = nn.Sequential(
           nn.Linear(i, 200),
            nn.LeakyReLU(0.1),
            nn.Linear(200, 100),
            nn.LeakyReLU(0.1),
            nn.Linear(100, 20),
            nn.LeakyReLU(0.1),
            nn.Linear(20, 5),
            nn.LeakyReLU(0.1),
            nn.Linear(5, o),
            nn.Sigmoid()
        )

    def forward(self,x):
        return self.go(x)

In [11]:
x_train_tensor.size()

torch.Size([9144, 87])

In [12]:
id = x_train_tensor.size(1)
od = y_train_tensor.size(1)
model = mymodel(id,od)
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

In [13]:
for i in range(1000):
    indices = torch.randperm(9144)

    x_batch_list = torch.index_select(x_train_tensor,0,indices).split(b_size,0)
    y_batch_list = torch.index_select(y_train_tensor,0,indices).split(b_size,0)

    temp = []
    for x,y in zip(x_batch_list, y_batch_list):
        y_pred = model(x)
        loss = loss_func(y_pred,y)
        temp.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if(i % 100)==0:
        print(i, sum(temp)/ len(temp))

0 tensor(0.5440, grad_fn=<DivBackward0>)
100 tensor(0.0328, grad_fn=<DivBackward0>)
200 tensor(0.0326, grad_fn=<DivBackward0>)
300 tensor(0.0326, grad_fn=<DivBackward0>)
400 tensor(0.0326, grad_fn=<DivBackward0>)
500 tensor(0.0449, grad_fn=<DivBackward0>)
600 tensor(0.0434, grad_fn=<DivBackward0>)
700 tensor(0.0434, grad_fn=<DivBackward0>)
800 tensor(0.0434, grad_fn=<DivBackward0>)
900 tensor(0.0434, grad_fn=<DivBackward0>)


In [14]:
y_pred_list = []
model.eval()
with torch.no_grad():
    y_test_pred = torch.round(model(x_test_tensor))

In [15]:
x_test_batches = x_test_tensor.split(b_size,0)
model.eval()
with torch.no_grad():
    for x in x_test_batches:
        y_test_pred = torch.round(model(x))
        y_pred_list.extend(y_test_pred.squeeze().detach().tolist())

y_pred_list = torch.tensor(y_pred_list).unsqueeze(1)

In [16]:
y_test_tensor

tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [0.]])

In [17]:
y_pred_list.shape

torch.Size([2286, 1])

In [18]:
y_pred_list

tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [0.]])

In [19]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print("Confusion Matrix\n", str(confusion_matrix(y_test_tensor, y_pred_list)))
print("Precision:\t"+str(precision_score(y_test_tensor, y_pred_list)))
print("Recall:\t"+str(recall_score(y_test_tensor, y_pred_list)))
print("F1 Score:\t"+str(f1_score(y_test_tensor, y_pred_list)))

Confusion Matrix
 [[1098   45]
 [  41 1102]]
Precision:	0.960767218831735
Recall:	0.9641294838145232
F1 Score:	0.9624454148471616
