In [26]:
# instalar: pip install liac-arff
import arff
import pandas as pd
import numpy as np 
import random
from sklearn import svm
from sklearn.metrics import precision_score, accuracy_score, classification_report, f1_score, recall_score
# import imblearn as ibl
# import seaborn as sns
# import matplotlib.pyplot as plt
from csv import DictReader
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold


0 @relation phishing

02 @attribute having_IP_Address   { -1,1 } <br>
03 @attribute URL_Length          { 1,0,-1 } <br>
04 @attribute Shortining_Service  { 1,-1 } <br>
05 @attribute having_At_Symbol    { 1,-1 } <br>
06 @attribute double_slash_redirecting { -1,1 } <br>
07 @attribute Prefix_Suffix       { -1,1 } <br>
08 @attribute having_Sub_Domain   { -1,0,1 } <br>
09 @attribute SSLfinal_State      { -1,1,0 } <br>
10 @attribute Domain_registeration_length { -1,1 } <br>
11 @attribute Favicon     { 1,-1 } <br>
12 @attribute port        { 1,-1 } <br>
13 @attribute HTTPS_token { -1,1 } <br>
14 @attribute Request_URL { 1,-1 } <br>
15 @attribute URL_of_Anchor { -1,0,1 } <br>
16 @attribute Links_in_tags { 1,-1,0 } <br>
17 @attribute SFH  { -1,1,0 } <br>
18 @attribute Submitting_to_email { -1,1 } <br>
19 @attribute Abnormal_URL        { -1,1 } <br>
20 @attribute Redirect      { 0,1 } <br>
21 @attribute on_mouseover  { 1,-1 } <br>
22 @attribute RightClick    { 1,-1 } <br>
23 @attribute popUpWidnow   { 1,-1 } <br>
24 @attribute Iframe { 1,-1 } <br>
25 @attribute age_of_domain { -1,1 } <br>
26 @attribute DNSRecord     { -1,1 } <br>
27 @attribute web_traffic   { -1,0,1 } <br>
28 @attribute Page_Rank     { -1,1 } <br>
29 @attribute Google_Index  { 1,-1 } <br>
30 @attribute Links_pointing_to_page { 1,0,-1 } <br>
31 @attribute Statistical_report     { -1,1 } <br>
32 @attribute Result    { -1,1 } <br>

Limpeza dos dados, a tabela a seguir mostra os atributos selecionados do dataset 1:

| **#** | **Attributes**              | **Description**                                                      | **Feature Type** |
|-------|-----------------------------|----------------------------------------------------------------------|------------------|
| 1     | having_IP_Address           | If the domain part has an IP address                                 | Lexical          |
| 2     | URL_Length                  | Number of characters in the URL                                      | Lexical          |
| 3     | Shortining_Service          | Is the URL shortened                                                 | Lexical          |
| 4     | having_At_Symbol            | Is “@” sign present                                                  | Lexical          |
| 5     | Prefix_Suffix               | Does the domain part include “-* symbol  (separates suffix prefix)   | Lexical          |
| 6     | having_Sub_Domain           | Is the number of sub-domains high  (greater than 2)                  | Lexical          |
| 7     | SSLfinal_State              | Has valid SSL certificate (age)                                      | Domain           |
| 8     | Domain_registeration_length | Is the domain age of the website short (shorter than a year)         | Domain           |
| 9     | Request_URL                 | Are the external objects (images, videos) loaded from another domain | Content          |
| 10    | URL_of_Anchor               | Is the URL anchor linking to a webpage                               | Content          |
| 11    | Links_in_tags               | Presence of \<meta\>, \<scripts\>,  \<link\> tags                          | Content          |
| 12    | SFH                         | Does the server form handler(SFH) contain empty strings              | Domain           |
| 13    | Abnormal_URL                | Is the hostname included in WHOIS database                           | Lexical          |
| 14    | age_of_domain               | Is the age of the website short or long                              | Domain           |
| 15    | DNSRecord                   | If the website is recognized by WHOIS database                       | Domain           |
| 16    | web_traffic                 | If the webpage is frequently visited                                 | Domain           |
| 17    | Page_Rank                   | How important the webpage is on the internet                         | Domain           |
| 18    | Google_Index                | If the website is in Google’s index or not                           | Domain           |
| 19    | Statistical_report          | If the host belongs to top phishing  IPs or domains (PhishTank)      | Domain           |

Atributos adicionais no segundo dataset:

| **#** | **Attributes**              | **Description**                                                      | **Feature Type** |
|-------|-----------------------------|----------------------------------------------------------------------|------------------|
| 1     | external_objects            | Any suspicious behaviors related to website elements                               | Content          |
| 2     | anchor_tags                 | Any suspicious behaviors related to the anchor tag                                 | Content          |

Por fim temos os atributos de cada dataset:

| **#** | **Attributes**              | **Presence of these features in the datasets** |
|-------|-----------------------------|------------------------------------------------|
| 1     | having_IP_Address           | Both                                           |
| 2     | URL_Length                  | Both                                           |
| 3     | Shortining_Service          | Both                                           |
| 4     | having_At_Symbol            | Both                                           |
| 5     | Prefix_Suffix               | Both                                           |
| 6     | having_Sub_Domain           | Both                                           |
| 7     | SSLfinal_State              | Both                                           |
| 8     | Domain_registeration_length | Both                                           |
| 9     | Links_in_tags               | Both                                           |
| 10    | SFH                         | Both                                           |
| 11    | Abnormal_URL                | Both                                           |
| 12    | age_of_domain               | Both                                           |
| 13    | DNSRecord                   | Both                                           |
| 14    | web_traffic                 | Both                                           |
| 15    | Statistical_report          | Both                                           |
| 16    | Request_URL                 | First dataset                                  |
| 17    | URL_of_Anchor               | First dataset                                  |
| 18    | Page_Rank                   | First dataset                                  |
| 19    | Google_Index                | First dataset                                  |
| 20    | external_objects            | Second dataset                                 |
| 21    | anchor_tags                 | Second dataset                                 |

In [23]:
def select_random(results):
    leg = list()
    selected = list()

    for i in range(len(results)):
        if results[i] == '1':
            leg.append(i)
 
    while len(set(selected)) != 1259:
        if random.choice(leg) not in selected:
            selected.append(random.choice(leg))
            
#     print(len(selected))
#     mys = set(selected)
#     print(len(mys))
    return set(selected)



file_path = "trainingdataset.arff" # Dados: 11055
# leg: 6157
# phi: 4898

# file_path = "old.arff" # Dados: 2456

# Abrir o arquivo ARFF
with open("old.arff", 'r') as file_test:
    arff_data_teste = arff.load(file_test)

# Acessar os dados do arquivo
data_test = arff_data_teste['data']  # Dados em formato de lista ou matriz
attributes_test = arff_data_teste['attributes']  # Atributos do arquivo ARFF

df_test = pd.DataFrame(data_test, columns=[attr[0] for attr in attributes_test])

ST_df_test = df_test.drop([
    'double_slash_redirecting', 
    'Favicon', 
    'port', 
    'HTTPS_token', 
    'Submitting_to_email', 
    'Redirect', 
    'on_mouseover', 
    'RightClick',
    'popUpWidnow', 
    'Iframe', 
    'Links_pointing_to_page'
], axis=1)


# Abrir o arquivo ARFF
with open("trainingdataset.arff", 'r') as file:
    arff_data = arff.load(file)

# Acessar os dados do arquivo
data = arff_data['data']  # Dados em formato de lista ou matriz
attributes = arff_data['attributes']  # Atributos do arquivo ARFF

df = pd.DataFrame(data, columns=[attr[0] for attr in attributes])

ST_df = df.drop([
    'double_slash_redirecting', 
    'Favicon', 
    'port', 
    'HTTPS_token', 
    'Submitting_to_email', 
    'Redirect', 
    'on_mouseover', 
    'RightClick',
    'popUpWidnow', 
    'Iframe', 
    'Links_pointing_to_page'
], axis=1)

teste = select_random(list(ST_df['Result']))
ST_df.drop(teste, axis=0, inplace=True)
# ST_df['Result'].value_counts()

Result
-1    1362
1     1094
Name: count, dtype: int64

In [6]:
# file_path = "trainingdataset.arff" # Dados: 11055
# file_path = "old.arff" # Dados: 2456

df = pd.read_csv('fixed_values_ds.csv')

# df['result'].value_counts()
# df.columns

ND_df = df.drop([
    'double-slash_redirection', 
    'favicons', 
    'ports', 
    'https', 
    'auto_email', 
    'iframe_redirection', 
    'on_mouse_over', 
    'right_click',
    'popup_windows', 
    'links_pointing', 
    'image_text_keyword'
], axis=1)

ND_df.drop([0,1,2,3,4], axis=0, inplace=True)
ND_df['result'].value_counts()

result
 1    7044
-1    7044
Name: count, dtype: int64

### SVM

In [33]:
#Atributo de saída
y_st_test = ST_df_test['Result'].values
y_st = ST_df['Result'].values

#Atributos de entrada
X_st_test = ST_df_test.drop('Result', axis=1)
X_st = ST_df.drop('Result', axis=1)

svm_model = svm.SVC(kernel="linear")
svm_model.fit(X_st, y_st)
# print(svm_model.score(X_st, y_st))
pred = svm_model.predict(X_st_test)
# accuracy_score(y_st_test, pred)
precision_score(y_st_test, pred)


ValueError: pos_label=1 is not a valid label. It should be one of ['-1', '1']