In [2]:
# instalar: pip install liac-arff
import arff
import pandas as pd
import numpy as np 
import random
import imblearn
from imblearn.under_sampling import RandomUnderSampler

from sklearn import svm
from sklearn.metrics import precision_score, accuracy_score, classification_report, f1_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from csv import DictReader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

0 @relation phishing

02 @attribute having_IP_Address   { -1,1 } <br>
03 @attribute URL_Length          { 1,0,-1 } <br>
04 @attribute Shortining_Service  { 1,-1 } <br>
05 @attribute having_At_Symbol    { 1,-1 } <br>
06 @attribute double_slash_redirecting { -1,1 } <br>
07 @attribute Prefix_Suffix       { -1,1 } <br>
08 @attribute having_Sub_Domain   { -1,0,1 } <br>
09 @attribute SSLfinal_State      { -1,1,0 } <br>
10 @attribute Domain_registeration_length { -1,1 } <br>
11 @attribute Favicon     { 1,-1 } <br>
12 @attribute port        { 1,-1 } <br>
13 @attribute HTTPS_token { -1,1 } <br>
14 @attribute Request_URL { 1,-1 } <br>
15 @attribute URL_of_Anchor { -1,0,1 } <br>
16 @attribute Links_in_tags { 1,-1,0 } <br>
17 @attribute SFH  { -1,1,0 } <br>
18 @attribute Submitting_to_email { -1,1 } <br>
19 @attribute Abnormal_URL        { -1,1 } <br>
20 @attribute Redirect      { 0,1 } <br>
21 @attribute on_mouseover  { 1,-1 } <br>
22 @attribute RightClick    { 1,-1 } <br>
23 @attribute popUpWidnow   { 1,-1 } <br>
24 @attribute Iframe { 1,-1 } <br>
25 @attribute age_of_domain { -1,1 } <br>
26 @attribute DNSRecord     { -1,1 } <br>
27 @attribute web_traffic   { -1,0,1 } <br>
28 @attribute Page_Rank     { -1,1 } <br>
29 @attribute Google_Index  { 1,-1 } <br>
30 @attribute Links_pointing_to_page { 1,0,-1 } <br>
31 @attribute Statistical_report     { -1,1 } <br>
32 @attribute Result    { -1,1 } <br>

Limpeza dos dados, a tabela a seguir mostra os atributos selecionados do dataset 1:

| **#** | **Attributes**              | **Description**                                                      | **Feature Type** |
|-------|-----------------------------|----------------------------------------------------------------------|------------------|
| 1     | having_IP_Address           | If the domain part has an IP address                                 | Lexical          |
| 2     | URL_Length                  | Number of characters in the URL                                      | Lexical          |
| 3     | Shortining_Service          | Is the URL shortened                                                 | Lexical          |
| 4     | having_At_Symbol            | Is “@” sign present                                                  | Lexical          |
| 5     | Prefix_Suffix               | Does the domain part include “-* symbol  (separates suffix prefix)   | Lexical          |
| 6     | having_Sub_Domain           | Is the number of sub-domains high  (greater than 2)                  | Lexical          |
| 7     | SSLfinal_State              | Has valid SSL certificate (age)                                      | Domain           |
| 8     | Domain_registeration_length | Is the domain age of the website short (shorter than a year)         | Domain           |
| 9     | Request_URL                 | Are the external objects (images, videos) loaded from another domain | Content          |
| 10    | URL_of_Anchor               | Is the URL anchor linking to a webpage                               | Content          |
| 11    | Links_in_tags               | Presence of \<meta\>, \<scripts\>,  \<link\> tags                          | Content          |
| 12    | SFH                         | Does the server form handler(SFH) contain empty strings              | Domain           |
| 13    | Abnormal_URL                | Is the hostname included in WHOIS database                           | Lexical          |
| 14    | age_of_domain               | Is the age of the website short or long                              | Domain           |
| 15    | DNSRecord                   | If the website is recognized by WHOIS database                       | Domain           |
| 16    | web_traffic                 | If the webpage is frequently visited                                 | Domain           |
| 17    | Page_Rank                   | How important the webpage is on the internet                         | Domain           |
| 18    | Google_Index                | If the website is in Google’s index or not                           | Domain           |
| 19    | Statistical_report          | If the host belongs to top phishing  IPs or domains (PhishTank)      | Domain           |

Atributos adicionais no segundo dataset:

| **#** | **Attributes**              | **Description**                                                      | **Feature Type** |
|-------|-----------------------------|----------------------------------------------------------------------|------------------|
| 1     | external_objects            | Any suspicious behaviors related to website elements                               | Content          |
| 2     | anchor_tags                 | Any suspicious behaviors related to the anchor tag                                 | Content          |

Por fim temos os atributos de cada dataset:

| **#** | **Attributes**              | **Presence of these features in the datasets** |
|-------|-----------------------------|------------------------------------------------|
| 1     | having_IP_Address           | Both                                           |
| 2     | URL_Length                  | Both                                           |
| 3     | Shortining_Service          | Both                                           |
| 4     | having_At_Symbol            | Both                                           |
| 5     | Prefix_Suffix               | Both                                           |
| 6     | having_Sub_Domain           | Both                                           |
| 7     | SSLfinal_State              | Both                                           |
| 8     | Domain_registeration_length | Both                                           |
| 9     | Links_in_tags               | Both                                           |
| 10    | SFH                         | Both                                           |
| 11    | Abnormal_URL                | Both                                           |
| 12    | age_of_domain               | Both                                           |
| 13    | DNSRecord                   | Both                                           |
| 14    | web_traffic                 | Both                                           |
| 15    | Statistical_report          | Both                                           |
| 16    | Request_URL                 | First dataset                                  |
| 17    | URL_of_Anchor               | First dataset                                  |
| 18    | Page_Rank                   | First dataset                                  |
| 19    | Google_Index                | First dataset                                  |
| 20    | external_objects            | Second dataset                                 |
| 21    | anchor_tags                 | Second dataset                                 |

In [3]:
# file_path = "trainingdataset.arff" # Dados: 11055

# Abrir o arquivo ARFF
with open("trainingdataset.arff", 'r') as file:
    arff_data = arff.load(file)

# Acessar os dados do arquivo
data = arff_data['data']  # Dados em formato de lista ou matriz
attributes = arff_data['attributes']  # Atributos do arquivo ARFF

ST_df = pd.DataFrame(data, columns=[attr[0] for attr in attributes])

oversample = RandomUnderSampler(sampling_strategy='majority')

ST_df['Result'] = ST_df['Result'].astype(int)

ST_df_best_features = ST_df.drop([
    'double_slash_redirecting', 
    'Favicon', 
    'port', 
    'HTTPS_token', 
    'Submitting_to_email', 
    'Redirect', 
    'on_mouseover', 
    'RightClick',
    'popUpWidnow', 
    'Iframe', 
    'Links_pointing_to_page'
], axis=1)

# for i in range(len(ST_df['Result'])):
# # print(ST_df['Result'][23])
# # for i in ST_df['Result'][24]:
# #     print(len(i))
#     if ST_df['Result'][i] != '1':
#         print("ASLUKDHAKUSHDASD")
#     print(ST_df['Result'])
#     if ST_df['Result'][i] != ST_df_best_features['Result'][i]

# ST_df_best_features

# ST_df['Result']
# ST_df['Result'].value_counts()


In [5]:
# file_path = "trainingdataset.arff" # Dados: 11055
# file_path = "old.arff" # Dados: 2456
ND_df = pd.read_csv('fixed_values_ds.csv')

oversample = RandomUnderSampler(sampling_strategy='majority')

ND_df_best_features = ND_df.drop([
    'double-slash_redirection', 
    'favicons', 
    'ports', 
    'https', 
    'auto_email', 
    'iframe_redirection', 
    'on_mouse_over', 
    'right_click',
    'popup_windows', 
    'links_pointing', 
    'image_text_keyword'
], axis=1)
ND_df_best_features
# n_ND_df['result']

# ND_df['result'].value_counts()

Unnamed: 0,having_ip_address,length_of_url,shortening_services,having_at_symbol,prefix and suffix,sub_domains,ssl_state,domain_registered,external_objects,anchor_tags,links_in_tags,sfh-domain,abnoramal_url,domain_age,dns_record,web_traffic,statistical_report,result
0,-1,-1,-1,-1,-1,1,1,1,-1,-1,-1,0,1,1,1,1,-1,1
1,-1,0,-1,-1,1,0,-1,1,1,1,1,-1,-1,-1,1,1,-1,1
2,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,1,1,-1,1
3,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,1,0,-1,1
4,-1,1,-1,-1,1,1,-1,1,1,-1,1,-1,-1,1,1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14088,-1,-1,-1,-1,-1,0,-1,1,-1,1,-1,-1,-1,1,1,-1,-1,-1
14089,-1,-1,-1,-1,-1,0,-1,1,1,-1,-1,-1,-1,-1,1,-1,-1,-1
14090,-1,-1,-1,-1,-1,0,-1,1,-1,1,0,-1,-1,-1,1,-1,-1,-1
14091,-1,-1,-1,-1,-1,0,-1,1,1,-1,0,0,-1,-1,1,-1,-1,-1


## RF

#### Dataset_1 + All_features

In [7]:
# #Atributo de saída
y_st = ST_df['Result'].values

# #Atributos de entrada
X_st = ST_df.drop('Result', axis=1)

X_st, y_st = oversample.fit_resample(X_st, y_st)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 96.63%

                precision    recall  f1-score   support

 Phishing (-1)     0.9737    0.9601    0.9668      1502
Legitimate (1)     0.9588    0.9729    0.9658      1437

      accuracy                         0.9663      2939
     macro avg     0.9663    0.9665    0.9663      2939
  weighted avg     0.9664    0.9663    0.9663      2939



#### Dataset_1 + Best_features

In [9]:
# #Atributo de saída
y_st = ST_df_best_features['Result'].values

# #Atributos de entrada
X_st = ST_df_best_features.drop('Result', axis=1)

X_st, y_st = oversample.fit_resample(X_st, y_st)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 95.78%

                precision    recall  f1-score   support

 Phishing (-1)     0.9624    0.9547    0.9586      1502
Legitimate (1)     0.9531    0.9610    0.9570      1437

      accuracy                         0.9578      2939
     macro avg     0.9577    0.9579    0.9578      2939
  weighted avg     0.9578    0.9578    0.9578      2939



#### Dataset_2 + All_features

In [10]:
# #Atributo de saída
y_nd = ND_df['result'].values

# #Atributos de entrada
X_nd = ND_df.drop('result', axis=1)

X_nd, y_nd = oversample.fit_resample(X_st, y_st)

X_nd_train, X_nd_test, y_nd_train, y_nd_test = train_test_split(X_nd, y_nd, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_nd_train, y_nd_train)
y_pred = clf.predict(X_nd_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_nd_test, y_pred) * 100))
print(classification_report(y_nd_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 96.33%

                precision    recall  f1-score   support

 Phishing (-1)     0.9672    0.9607    0.9639      1502
Legitimate (1)     0.9592    0.9659    0.9626      1437

      accuracy                         0.9633      2939
     macro avg     0.9632    0.9633    0.9632      2939
  weighted avg     0.9633    0.9633    0.9633      2939



#### Dataset_2 + Best_features

In [12]:
# #Atributo de saída
y_nd = ND_df_best_features['result'].values

# #Atributos de entrada
X_nd = ND_df_best_features.drop('result', axis=1)

X_nd, y_nd = oversample.fit_resample(X_st, y_st)

X_nd_train, X_nd_test, y_nd_train, y_nd_test = train_test_split(X_nd, y_nd, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_nd_train, y_nd_train)
y_pred = clf.predict(X_nd_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_nd_test, y_pred) * 100))
print(classification_report(y_nd_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 95.44%

                precision    recall  f1-score   support

 Phishing (-1)     0.9634    0.9467    0.9550      1502
Legitimate (1)     0.9453    0.9624    0.9538      1437

      accuracy                         0.9544      2939
     macro avg     0.9544    0.9546    0.9544      2939
  weighted avg     0.9546    0.9544    0.9544      2939



### SVM

#### Dataset_1 + All_features

In [14]:
# #Atributo de saída
y_st = ST_df['Result'].values

# #Atributos de entrada
X_st = ST_df.drop('Result', axis=1)

X_st, y_st = oversample.fit_resample(X_st, y_st)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 92.45%

                precision    recall  f1-score   support

 Phishing (-1)     0.9457    0.9041    0.9244      1502
Legitimate (1)     0.9042    0.9457    0.9245      1437

      accuracy                         0.9245      2939
     macro avg     0.9249    0.9249    0.9245      2939
  weighted avg     0.9254    0.9245    0.9245      2939



#### Dataset_1 + Best_features

In [16]:
# #Atributo de saída
y_st = ST_df_best_features['Result'].values

# #Atributos de entrada
X_st = ST_df_best_features.drop('Result', axis=1)

X_st, y_st = oversample.fit_resample(X_st, y_st)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 92.17%

                precision    recall  f1-score   support

 Phishing (-1)     0.9473    0.8968    0.9213      1502
Legitimate (1)     0.8978    0.9478    0.9221      1437

      accuracy                         0.9217      2939
     macro avg     0.9225    0.9223    0.9217      2939
  weighted avg     0.9231    0.9217    0.9217      2939



#### Dataset_2 + All_features

In [18]:
# #Atributo de saída
y_nd = ND_df['result'].values

# #Atributos de entrada
X_nd = ND_df.drop('result', axis=1)

X_nd, y_nd = oversample.fit_resample(X_nd, y_nd)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 92.17%

                precision    recall  f1-score   support

 Phishing (-1)     0.9473    0.8968    0.9213      1502
Legitimate (1)     0.8978    0.9478    0.9221      1437

      accuracy                         0.9217      2939
     macro avg     0.9225    0.9223    0.9217      2939
  weighted avg     0.9231    0.9217    0.9217      2939



#### Dataset_2 + Best_features

In [20]:
# #Atributo de saída
y_st = ND_df_best_features['result'].values

# #Atributos de entrada
X_st = ND_df_best_features.drop('result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 82.50%

                precision    recall  f1-score   support

 Phishing (-1)     0.8250    0.8234    0.8242      2107
Legitimate (1)     0.8249    0.8265    0.8257      2121

      accuracy                         0.8250      4228
     macro avg     0.8250    0.8250    0.8250      4228
  weighted avg     0.8250    0.8250    0.8250      4228



### LR

#### Dataset_1 + All_features

In [22]:
# #Atributo de saída
y_st = ST_df['Result'].values

# #Atributos de entrada
X_st = ST_df.drop('Result', axis=1)

X_st, y_st = oversample.fit_resample(X_st, y_st)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 91.97%

                precision    recall  f1-score   support

 Phishing (-1)     0.9318    0.9095    0.9205      1502
Legitimate (1)     0.9077    0.9304    0.9189      1437

      accuracy                         0.9197      2939
     macro avg     0.9197    0.9199    0.9197      2939
  weighted avg     0.9200    0.9197    0.9197      2939



#### Dataset_1 + Best_features

In [24]:
# #Atributo de saída
y_st = ST_df_best_features['Result'].values

# #Atributos de entrada
X_st = ST_df_best_features.drop('Result', axis=1)

X_st, y_st = oversample.fit_resample(X_st, y_st)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 91.49%

                precision    recall  f1-score   support

 Phishing (-1)     0.9241    0.9081    0.9161      1502
Legitimate (1)     0.9057    0.9221    0.9138      1437

      accuracy                         0.9149      2939
     macro avg     0.9149    0.9151    0.9149      2939
  weighted avg     0.9151    0.9149    0.9149      2939



#### Dataset_2 + All_features

In [26]:
# #Atributo de saída
y_st = ND_df['result'].values

# #Atributos de entrada
X_st = ND_df.drop('result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 84.32%

                precision    recall  f1-score   support

 Phishing (-1)     0.8349    0.8543    0.8445      2107
Legitimate (1)     0.8518    0.8322    0.8419      2121

      accuracy                         0.8432      4228
     macro avg     0.8434    0.8432    0.8432      4228
  weighted avg     0.8434    0.8432    0.8432      4228



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Dataset_2 + Best_features

In [27]:
# #Atributo de saída
y_st = ND_df_best_features['result'].values

# #Atributos de entrada
X_st = ND_df_best_features.drop('result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing (-1)', 'Legitimate (1)'], digits=4))

Accuracy: 82.10%

                precision    recall  f1-score   support

 Phishing (-1)     0.8178    0.8244    0.8211      2107
Legitimate (1)     0.8241    0.8175    0.8208      2121

      accuracy                         0.8210      4228
     macro avg     0.8210    0.8210    0.8210      4228
  weighted avg     0.8210    0.8210    0.8210      4228

