In [20]:
# instalar: pip install liac-arff
import arff
import pandas as pd
import numpy as np 
import random
from sklearn import svm
from sklearn.metrics import precision_score, accuracy_score, classification_report, f1_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from csv import DictReader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

0 @relation phishing

02 @attribute having_IP_Address   { -1,1 } <br>
03 @attribute URL_Length          { 1,0,-1 } <br>
04 @attribute Shortining_Service  { 1,-1 } <br>
05 @attribute having_At_Symbol    { 1,-1 } <br>
06 @attribute double_slash_redirecting { -1,1 } <br>
07 @attribute Prefix_Suffix       { -1,1 } <br>
08 @attribute having_Sub_Domain   { -1,0,1 } <br>
09 @attribute SSLfinal_State      { -1,1,0 } <br>
10 @attribute Domain_registeration_length { -1,1 } <br>
11 @attribute Favicon     { 1,-1 } <br>
12 @attribute port        { 1,-1 } <br>
13 @attribute HTTPS_token { -1,1 } <br>
14 @attribute Request_URL { 1,-1 } <br>
15 @attribute URL_of_Anchor { -1,0,1 } <br>
16 @attribute Links_in_tags { 1,-1,0 } <br>
17 @attribute SFH  { -1,1,0 } <br>
18 @attribute Submitting_to_email { -1,1 } <br>
19 @attribute Abnormal_URL        { -1,1 } <br>
20 @attribute Redirect      { 0,1 } <br>
21 @attribute on_mouseover  { 1,-1 } <br>
22 @attribute RightClick    { 1,-1 } <br>
23 @attribute popUpWidnow   { 1,-1 } <br>
24 @attribute Iframe { 1,-1 } <br>
25 @attribute age_of_domain { -1,1 } <br>
26 @attribute DNSRecord     { -1,1 } <br>
27 @attribute web_traffic   { -1,0,1 } <br>
28 @attribute Page_Rank     { -1,1 } <br>
29 @attribute Google_Index  { 1,-1 } <br>
30 @attribute Links_pointing_to_page { 1,0,-1 } <br>
31 @attribute Statistical_report     { -1,1 } <br>
32 @attribute Result    { -1,1 } <br>

Limpeza dos dados, a tabela a seguir mostra os atributos selecionados do dataset 1:

| **#** | **Attributes**              | **Description**                                                      | **Feature Type** |
|-------|-----------------------------|----------------------------------------------------------------------|------------------|
| 1     | having_IP_Address           | If the domain part has an IP address                                 | Lexical          |
| 2     | URL_Length                  | Number of characters in the URL                                      | Lexical          |
| 3     | Shortining_Service          | Is the URL shortened                                                 | Lexical          |
| 4     | having_At_Symbol            | Is “@” sign present                                                  | Lexical          |
| 5     | Prefix_Suffix               | Does the domain part include “-* symbol  (separates suffix prefix)   | Lexical          |
| 6     | having_Sub_Domain           | Is the number of sub-domains high  (greater than 2)                  | Lexical          |
| 7     | SSLfinal_State              | Has valid SSL certificate (age)                                      | Domain           |
| 8     | Domain_registeration_length | Is the domain age of the website short (shorter than a year)         | Domain           |
| 9     | Request_URL                 | Are the external objects (images, videos) loaded from another domain | Content          |
| 10    | URL_of_Anchor               | Is the URL anchor linking to a webpage                               | Content          |
| 11    | Links_in_tags               | Presence of \<meta\>, \<scripts\>,  \<link\> tags                          | Content          |
| 12    | SFH                         | Does the server form handler(SFH) contain empty strings              | Domain           |
| 13    | Abnormal_URL                | Is the hostname included in WHOIS database                           | Lexical          |
| 14    | age_of_domain               | Is the age of the website short or long                              | Domain           |
| 15    | DNSRecord                   | If the website is recognized by WHOIS database                       | Domain           |
| 16    | web_traffic                 | If the webpage is frequently visited                                 | Domain           |
| 17    | Page_Rank                   | How important the webpage is on the internet                         | Domain           |
| 18    | Google_Index                | If the website is in Google’s index or not                           | Domain           |
| 19    | Statistical_report          | If the host belongs to top phishing  IPs or domains (PhishTank)      | Domain           |

Atributos adicionais no segundo dataset:

| **#** | **Attributes**              | **Description**                                                      | **Feature Type** |
|-------|-----------------------------|----------------------------------------------------------------------|------------------|
| 1     | external_objects            | Any suspicious behaviors related to website elements                               | Content          |
| 2     | anchor_tags                 | Any suspicious behaviors related to the anchor tag                                 | Content          |

Por fim temos os atributos de cada dataset:

| **#** | **Attributes**              | **Presence of these features in the datasets** |
|-------|-----------------------------|------------------------------------------------|
| 1     | having_IP_Address           | Both                                           |
| 2     | URL_Length                  | Both                                           |
| 3     | Shortining_Service          | Both                                           |
| 4     | having_At_Symbol            | Both                                           |
| 5     | Prefix_Suffix               | Both                                           |
| 6     | having_Sub_Domain           | Both                                           |
| 7     | SSLfinal_State              | Both                                           |
| 8     | Domain_registeration_length | Both                                           |
| 9     | Links_in_tags               | Both                                           |
| 10    | SFH                         | Both                                           |
| 11    | Abnormal_URL                | Both                                           |
| 12    | age_of_domain               | Both                                           |
| 13    | DNSRecord                   | Both                                           |
| 14    | web_traffic                 | Both                                           |
| 15    | Statistical_report          | Both                                           |
| 16    | Request_URL                 | First dataset                                  |
| 17    | URL_of_Anchor               | First dataset                                  |
| 18    | Page_Rank                   | First dataset                                  |
| 19    | Google_Index                | First dataset                                  |
| 20    | external_objects            | Second dataset                                 |
| 21    | anchor_tags                 | Second dataset                                 |

In [4]:
def select_random(results):
    leg = list()
    selected = list()

    for i in range(len(results)):
        if results[i] == '1':
            leg.append(i)
 
    while len(set(selected)) != 1259:
        if random.choice(leg) not in selected:
            selected.append(random.choice(leg))
            
    return set(selected)



# file_path = "trainingdataset.arff" # Dados: 11055

# Abrir o arquivo ARFF
with open("trainingdataset.arff", 'r') as file:
    arff_data = arff.load(file)

# Acessar os dados do arquivo
data = arff_data['data']  # Dados em formato de lista ou matriz
attributes = arff_data['attributes']  # Atributos do arquivo ARFF

ST_df = pd.DataFrame(data, columns=[attr[0] for attr in attributes])

res = select_random(list(ST_df['Result']))
ST_df.drop(list(res), axis=0, inplace=True)

ST_df['Result'] = ST_df['Result'].astype(int)
ST_df['Result'] = ST_df['Result'].replace(-1, 0)

ST_df_best_features = ST_df.drop([
    'double_slash_redirecting', 
    'Favicon', 
    'port', 
    'HTTPS_token', 
    'Submitting_to_email', 
    'Redirect', 
    'on_mouseover', 
    'RightClick',
    'popUpWidnow', 
    'Iframe', 
    'Links_pointing_to_page'
], axis=1)

# for i in range(len(ST_df['Result'])):
# # print(ST_df['Result'][23])
# # for i in ST_df['Result'][24]:
# #     print(len(i))
#     if ST_df['Result'][i] != '1':
#         print("ASLUKDHAKUSHDASD")
#     print(ST_df['Result'])
#     if ST_df['Result'][i] != ST_df_best_features['Result'][i]

# ST_df_best_features

# ST_df['Result']
# ST_df['Result'].value_counts()

In [5]:
# file_path = "trainingdataset.arff" # Dados: 11055
# file_path = "old.arff" # Dados: 2456
def select_random(results):
    leg = list()
    selected = list()

    for i in range(len(results)):
        if results[i] == 1:
            leg.append(i)
 
    while len(set(selected)) != 5:
        if random.choice(leg) not in selected:
            selected.append(random.choice(leg))
            
    return set(selected)


ND_df = pd.read_csv('fixed_values_ds.csv')

ND_df['result'] = ND_df['result'].replace(-1, 0)

res = select_random(list(ND_df['result']))

n_ND_df = ND_df.drop(list(res), inplace=False)

ND_df_best_features = n_ND_df.drop([
    'double-slash_redirection', 
    'favicons', 
    'ports', 
    'https', 
    'auto_email', 
    'iframe_redirection', 
    'on_mouse_over', 
    'right_click',
    'popup_windows', 
    'links_pointing', 
    'image_text_keyword'
], axis=1)
ND_df_best_features
# n_ND_df['result']

# ND_df['result'].value_counts()

Unnamed: 0,having_ip_address,length_of_url,shortening_services,having_at_symbol,prefix and suffix,sub_domains,ssl_state,domain_registered,external_objects,anchor_tags,links_in_tags,sfh-domain,abnoramal_url,domain_age,dns_record,web_traffic,statistical_report,result
0,-1,-1,-1,-1,-1,1,1,1,-1,-1,-1,0,1,1,1,1,-1,1
1,-1,0,-1,-1,1,0,-1,1,1,1,1,-1,-1,-1,1,1,-1,1
2,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,1,1,-1,1
3,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,1,0,-1,1
4,-1,1,-1,-1,1,1,-1,1,1,-1,1,-1,-1,1,1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14088,-1,-1,-1,-1,-1,0,-1,1,-1,1,-1,-1,-1,1,1,-1,-1,0
14089,-1,-1,-1,-1,-1,0,-1,1,1,-1,-1,-1,-1,-1,1,-1,-1,0
14090,-1,-1,-1,-1,-1,0,-1,1,-1,1,0,-1,-1,-1,1,-1,-1,0
14091,-1,-1,-1,-1,-1,0,-1,1,1,-1,0,0,-1,-1,1,-1,-1,0


## RF

#### Dataset_1 + All_features

In [8]:
# #Atributo de saída
y_st = ST_df['Result'].values

# #Atributos de entrada
X_st = ST_df.drop('Result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 97.07%

                precision    recall  f1-score   support

  Phishing - 0     0.9726    0.9686    0.9706      1464
Legitimate - 1     0.9689    0.9729    0.9709      1475

      accuracy                         0.9707      2939
     macro avg     0.9708    0.9707    0.9707      2939
  weighted avg     0.9707    0.9707    0.9707      2939



#### Dataset_1 + Best_features

In [9]:
# #Atributo de saída
y_st = ST_df_best_features['Result'].values

# #Atributos de entrada
X_st = ST_df_best_features.drop('Result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 96.60%

                precision    recall  f1-score   support

  Phishing - 0     0.9710    0.9604    0.9657      1464
Legitimate - 1     0.9611    0.9715    0.9663      1475

      accuracy                         0.9660      2939
     macro avg     0.9660    0.9660    0.9660      2939
  weighted avg     0.9660    0.9660    0.9660      2939



#### Dataset_2 + All_features

In [10]:
# #Atributo de saída
y_nd = ND_df['result'].values

# #Atributos de entrada
X_nd = ND_df.drop('result', axis=1)

X_nd_train, X_nd_test, y_nd_train, y_nd_test = train_test_split(X_nd, y_nd, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_nd_train, y_nd_train)
y_pred = clf.predict(X_nd_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_nd_test, y_pred) * 100))
print(classification_report(y_nd_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 93.05%

                precision    recall  f1-score   support

  Phishing - 0     0.9270    0.9340    0.9305      2107
Legitimate - 1     0.9340    0.9269    0.9304      2121

      accuracy                         0.9305      4228
     macro avg     0.9305    0.9305    0.9305      4228
  weighted avg     0.9305    0.9305    0.9305      4228



#### Dataset_2 + Best_features

In [11]:
# #Atributo de saída
y_nd = ND_df_best_features['result'].values

# #Atributos de entrada
X_nd = ND_df_best_features.drop('result', axis=1)

X_nd_train, X_nd_test, y_nd_train, y_nd_test = train_test_split(X_nd, y_nd, test_size=0.3,random_state=42)

clf = RandomForestClassifier().fit(X_nd_train, y_nd_train)
y_pred = clf.predict(X_nd_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_nd_test, y_pred) * 100))
print(classification_report(y_nd_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 86.07%

                precision    recall  f1-score   support

  Phishing - 0     0.8475    0.8807    0.8638      2121
Legitimate - 1     0.8749    0.8405    0.8574      2106

      accuracy                         0.8607      4227
     macro avg     0.8612    0.8606    0.8606      4227
  weighted avg     0.8612    0.8607    0.8606      4227



### SVM

#### Dataset_1 + All_features

In [12]:
# #Atributo de saída
y_st = ST_df['Result'].values

# #Atributos de entrada
X_st = ST_df.drop('Result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 92.21%

                precision    recall  f1-score   support

  Phishing - 0     0.9364    0.9051    0.9205      1464
Legitimate - 1     0.9088    0.9390    0.9236      1475

      accuracy                         0.9221      2939
     macro avg     0.9226    0.9220    0.9220      2939
  weighted avg     0.9225    0.9221    0.9221      2939



#### Dataset_1 + Best_features

In [13]:
# #Atributo de saída
y_st = ST_df_best_features['Result'].values

# #Atributos de entrada
X_st = ST_df_best_features.drop('Result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 91.32%

                precision    recall  f1-score   support

  Phishing - 0     0.9333    0.8893    0.9108      1464
Legitimate - 1     0.8951    0.9369    0.9155      1475

      accuracy                         0.9132      2939
     macro avg     0.9142    0.9131    0.9132      2939
  weighted avg     0.9141    0.9132    0.9132      2939



#### Dataset_2 + All_features

In [14]:
# #Atributo de saída
y_st = ND_df['result'].values

# #Atributos de entrada
X_st = ND_df.drop('result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 87.16%

                precision    recall  f1-score   support

  Phishing - 0     0.8879    0.8495    0.8683      2107
Legitimate - 1     0.8567    0.8934    0.8747      2121

      accuracy                         0.8716      4228
     macro avg     0.8723    0.8715    0.8715      4228
  weighted avg     0.8722    0.8716    0.8715      4228



#### Dataset_2 + Best_features

In [15]:
# #Atributo de saída
y_st = ND_df_best_features['result'].values

# #Atributos de entrada
X_st = ND_df_best_features.drop('result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

svm_model = svm.SVC(kernel='linear').fit(X_st_train, y_st_train)
y_pred = svm_model.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 81.26%

                precision    recall  f1-score   support

  Phishing - 0     0.8341    0.7822    0.8073      2121
Legitimate - 1     0.7936    0.8433    0.8177      2106

      accuracy                         0.8126      4227
     macro avg     0.8138    0.8127    0.8125      4227
  weighted avg     0.8139    0.8126    0.8125      4227



### LR

#### Dataset_1 + All_features

In [16]:
# #Atributo de saída
y_st = ST_df['Result'].values

# #Atributos de entrada
X_st = ST_df.drop('Result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 92.11%

                precision    recall  f1-score   support

  Phishing - 0     0.9284    0.9119    0.9201      1464
Legitimate - 1     0.9141    0.9302    0.9220      1475

      accuracy                         0.9211      2939
     macro avg     0.9212    0.9210    0.9210      2939
  weighted avg     0.9212    0.9211    0.9211      2939



#### Dataset_1 + Best_features

In [17]:
# #Atributo de saída
y_st = ST_df_best_features['Result'].values

# #Atributos de entrada
X_st = ST_df_best_features.drop('Result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 92.00%

                precision    recall  f1-score   support

  Phishing - 0     0.9312    0.9064    0.9187      1464
Legitimate - 1     0.9095    0.9336    0.9214      1475

      accuracy                         0.9200      2939
     macro avg     0.9204    0.9200    0.9200      2939
  weighted avg     0.9203    0.9200    0.9200      2939



#### Dataset_2 + All_features

In [18]:
# #Atributo de saída
y_st = ND_df['result'].values

# #Atributos de entrada
X_st = ND_df.drop('result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 84.32%

                precision    recall  f1-score   support

  Phishing - 0     0.8349    0.8543    0.8445      2107
Legitimate - 1     0.8518    0.8322    0.8419      2121

      accuracy                         0.8432      4228
     macro avg     0.8434    0.8432    0.8432      4228
  weighted avg     0.8434    0.8432    0.8432      4228



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Dataset_2 + Best_features

In [19]:
# #Atributo de saída
y_st = ND_df_best_features['result'].values

# #Atributos de entrada
X_st = ND_df_best_features.drop('result', axis=1)

X_st_train, X_st_test, y_st_train, y_st_test = train_test_split(X_st, y_st, test_size=0.3,random_state=42)

clf = LogisticRegression().fit(X_st_train, y_st_train)
y_pred = clf.predict(X_st_test)

print("Accuracy: %.2f%%\n" % (accuracy_score(y_st_test, y_pred) * 100))
print(classification_report(y_st_test, y_pred, target_names=['Phishing - 0', 'Legitimate - 1'], digits=4))

Accuracy: 81.55%

                precision    recall  f1-score   support

  Phishing - 0     0.8197    0.8105    0.8151      2121
Legitimate - 1     0.8113    0.8205    0.8159      2106

      accuracy                         0.8155      4227
     macro avg     0.8155    0.8155    0.8155      4227
  weighted avg     0.8155    0.8155    0.8155      4227



### ANN

In [None]:
model = keras.Sequential()
model.add(layers.Dense(2, activation="relu"))
model.add(layers.Dense(3, activation="relu"))
model.add(layers.Dense(4))