In [3]:
# Importação das bibliotecas

import os
import zipfile

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Relative path to the zipped phishing URL dataset
dataset_zip = "../dados/phiusiil+phishing+url+dataset.zip"

# Folder that receives the unpacked archive contents
pasta_destino = "../dados/dataset_phishing"

# Open the archive read-only and unpack every member; the `with` block
# closes the archive handle once extraction finishes
with zipfile.ZipFile(dataset_zip, mode="r") as zip_referencia:
    zip_referencia.extractall(path=pasta_destino)

print("Arquivos extraídos para a pasta dados/dataset_phishing")

Arquivos extraídos para a pasta dados/dataset_phishing


In [47]:
# Path of the CSV file inside the extracted dataset folder
dataset_extraido = "../dados/dataset_phishing/PhiUSIIL_Phishing_URL_Dataset.csv"

# Parse the CSV into a pandas DataFrame using the default reader settings
dataframe = pd.read_csv(filepath_or_buffer=dataset_extraido)

# Report the (rows, columns) shape, followed by an extra blank line
dimensoes = dataframe.shape
print("Dimensões dos Dados:", dimensoes, "\n")

# Summarize every column: name, non-null count and dtype
dataframe.info()

Dimensões dos Dados: (235795, 56) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235795 entries, 0 to 235794
Data columns (total 56 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   FILENAME                    235795 non-null  object 
 1   URL                         235795 non-null  object 
 2   URLLength                   235795 non-null  int64  
 3   Domain                      235795 non-null  object 
 4   DomainLength                235795 non-null  int64  
 5   IsDomainIP                  235795 non-null  int64  
 6   TLD                         235795 non-null  object 
 7   URLSimilarityIndex          235795 non-null  float64
 8   CharContinuationRate        235795 non-null  float64
 9   TLDLegitimateProb           235795 non-null  float64
 10  URLCharProb                 235795 non-null  float64
 11  TLDLength                   235795 non-null  int64  
 12  NoOfSubDomain               235795 n

# Informações das Características:

* FILENAME: Nome do arquivo associado ao site (talvez o HTML salvo ou algo similar).
* URL: Endereço completo do site.
* URLLength: Comprimento total do URL (indicador de URLs suspeitos, que costumam ser mais longos).
* Domain: Nome do domínio no URL.
* DomainLength: Comprimento do nome do domínio.
* IsDomainIP: Indica se o domínio é um endereço IP (1) ou um nome de domínio normal (0).
* TLD: Extensão do domínio, como .com, .org.
* URLSimilarityIndex: Similaridade do URL com URLs legítimos conhecidos.
* CharContinuationRate: Taxa de continuidade entre caracteres no URL.
* TLDLegitimateProb: Probabilidade de a extensão do domínio ser legítima.
* URLCharProb: Probabilidade de caracteres no URL corresponderem a URLs legítimos.
* TLDLength: Comprimento da extensão do domínio.
* NoOfSubDomain: Número de subdomínios no URL.
* HasObfuscation: Indica se há ofuscação no URL (exemplo: caracteres substituídos).
* NoOfObfuscatedChar: Número de caracteres ofuscados no URL.
* ObfuscationRatio: Razão de caracteres ofuscados no URL em relação ao total.
* NoOfLettersInURL: Quantidade de letras no URL.
* LetterRatioInURL: Proporção de letras no URL.
* NoOfDegitsInURL: Número de dígitos no URL.
* DegitRatioInURL: Proporção de dígitos no URL.
* NoOfEqualsInURL: Quantidade de símbolos de igualdade (=) no URL.
* NoOfQMarkInURL: Quantidade de interrogações (?) no URL.
* NoOfAmpersandInURL: Número de & no URL.
* NoOfOtherSpecialCharsInURL: Número de outros caracteres especiais no URL.
* SpacialCharRatioInURL: Proporção de caracteres especiais no URL.
* IsHTTPS: Indica se o site usa HTTPS (1 para sim, 0 para não).
* LineOfCode: Número de linhas de código HTML no site.
* LargestLineLength: Comprimento da maior linha de código HTML.
* HasTitle: Indica se o site tem título (1 para sim, 0 para não).
* Title: O texto do título do site.
* DomainTitleMatchScore: Similaridade entre o domínio e o título.
* URLTitleMatchScore: Similaridade entre o URL e o título.
* HasFavicon: Indica se o site tem favicon (1 para sim, 0 para não).
* Robots: Presença de um arquivo robots.txt (1 para sim, 0 para não).
* IsResponsive: Indica se o site é responsivo (adapta-se a diferentes dispositivos).
* NoOfURLRedirect: Número de redirecionamentos do URL.
* NoOfSelfRedirect: Número de redirecionamentos para o próprio site.
* HasDescription: Indica se o site tem uma descrição (1 para sim, 0 para não).
* NoOfPopup: Número de pop-ups presentes no site.
* NoOfiFrame: Número de iframes no site (iframes podem ser usados para ataques).
* HasExternalFormSubmit: Indica se há formulários que enviam dados para domínios externos.
* HasSocialNet: Indica se há links para redes sociais.
* HasSubmitButton: Presença de botões de envio (1 para sim, 0 para não).
* HasHiddenFields: Presença de campos ocultos no formulário.
* HasPasswordField: Indica se há campos de senha no site.
* Bank: Indica se o site aparenta ser relacionado a bancos.
* Pay: Indica se o site aparenta ser relacionado a pagamentos.
* Crypto: Indica se o site aparenta ser relacionado a criptomoedas.
* HasCopyrightInfo: Indica se o site tem informações de copyright.
* NoOfImage: Número de imagens no site.
* NoOfCSS: Número de arquivos CSS usados no site.
* NoOfJS: Número de scripts JavaScript usados no site.
* NoOfSelfRef: Número de referências internas no site.
* NoOfEmptyRef: Número de referências vazias no site.
* NoOfExternalRef: Número de referências externas no site.

# Alguns Detalhes:
## Estrutura de Dados:
    * FILENAME; deve ser ignorada, é apenas o nome do arquivo associado ao site.
    * URL; Endereço completo do site.
    * Domain; Nome do domínio.
    * TLD; extensão do domínio, como .com, .org
    * Title; título do site

## Urls e domínios:
    * URLLength, DomainLength; Comprimento do URL e do domínio.
    * IsDomainIP; Indica se o domínio é representado como endereço IP, sendo 0 = false, 1 = true.
    * URLSimilarityIndex; Similaridade com URLs legítimos.
    * CharContinuationRate; Taxa de continuidade de caracteres.
    * NoOfSubDomain; Número de subdomínios.
    * HasObfuscation, NoOfObfuscatedChar, ObfuscationRatio; Características de ofuscação

## Elementos HTML:
    * NoOfImage, NoOfCSS, NoOfJS; Número de imagens, arquivos CSS e scripts JS no site.
    * HasTitle, HasFavicon, HasDescription; Presença de elementos de descrição e ícones.

## Redirecionamentos:
    * NoOfURLRedirect, NoOfSelfRedirect; Número de redirecionamentos do URL e de redirecionamentos para o próprio site.

## Interação:
    * HasSubmitButton, HasHiddenFields, HasPasswordField; Presença de botão de envio, campos ocultos e campo de senha.

# Alvo:
    * label; 0 = phishing, 1 = legítimo
  

In [9]:
# Lift the column cap so wide frames are rendered without truncation
# (this changes the pandas display option for the rest of the session)
pd.options.display.max_columns = None

# First ten rows of the dataframe
inicio = dataframe.head(n=10)

# Last ten rows of the dataframe
fim = dataframe.tail(n=10)

# Stack both slices so a single table shows only the head and the tail
inicio_fim = pd.concat(objs=[inicio, fim])

# Render the combined preview with the notebook's rich display
display(inicio_fim)

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,ObfuscationRatio,NoOfLettersInURL,LetterRatioInURL,NoOfDegitsInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,LargestLineLength,HasTitle,Title,DomainTitleMatchScore,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit,HasSocialNet,HasSubmitButton,HasHiddenFields,HasPasswordField,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,0.061933,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,1,0.032,1,558,9381,1,à¸‚à¹ˆà¸²à¸§à¸ªà¸” à¸‚à¹ˆà¸²à¸§à¸§à¸±à¸™à¸™à¸µ...,0.0,0.0,0,1,1,0,0,0,0,1,0,0,1,1,0,1,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,0.050207,2,1,0,0,0.0,9,0.391,0,0.0,0,0,0,2,0.087,1,618,9381,1,johannes gutenberg-universitÃ¤t mainz,55.555556,55.555556,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,0.064129,2,2,0,0,0.0,15,0.517,0,0.0,0,0,0,2,0.069,1,467,682,1,voice fm southampton,46.666667,46.666667,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,0.057606,3,1,0,0,0.0,13,0.5,0,0.0,0,0,0,1,0.038,1,6356,26824,1,home page: seminars in fetal and neonatal medi...,0.0,0.0,0,1,1,0,0,0,1,12,0,1,1,1,0,0,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,0.059441,3,1,0,0,0.0,20,0.606,0,0.0,0,0,0,1,0.03,1,6089,28404,1,fundaciÃ³n rewilding argentina,100.0,100.0,0,1,1,1,1,1,0,2,0,1,1,1,0,1,1,0,1,244,15,34,72,1,85,1
5,23107.txt,https://www.globalreporting.org,30,www.globalreporting.org,23,0,org,100.0,1.0,0.079963,0.060614,3,1,0,0,0.0,17,0.567,0,0.0,0,0,0,1,0.033,1,1210,737,1,gri - home,0.0,0.0,0,0,1,0,0,1,1,1,0,1,0,1,0,0,0,0,1,35,1,11,86,0,14,1
6,23034.txt,https://www.saffronart.com,25,www.saffronart.com,18,0,com,100.0,1.0,0.522907,0.063549,3,1,0,0,0.0,12,0.48,0,0.0,0,0,0,1,0.04,1,1024,984,1,0,0.0,0.0,1,0,1,1,1,0,2,4,0,1,0,1,0,0,0,0,1,32,4,14,44,2,17,1
7,696732.txt,https://www.nerdscandy.com,25,www.nerdscandy.com,18,0,com,100.0,1.0,0.522907,0.060486,3,1,0,0,0.0,12,0.48,0,0.0,0,0,0,1,0.04,1,514,399,1,nerds candy,100.0,100.0,1,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,1,24,2,22,36,0,15,1
8,739255.txt,https://www.hyderabadonline.in,29,www.hyderabadonline.in,22,0,in,100.0,1.0,0.005084,0.05698,2,1,0,0,0.0,16,0.552,0,0.0,0,0,0,1,0.034,1,2371,12913,1,hyderabadonline - business listing in hyderaba...,100.0,100.0,0,1,1,0,0,1,0,0,0,1,1,1,1,0,0,0,1,71,4,9,40,1,317,1
9,14486.txt,https://www.aap.org,18,www.aap.org,11,0,org,100.0,1.0,0.079963,0.070497,3,1,0,0,0.0,5,0.278,0,0.0,0,0,0,1,0.056,1,2730,481,1,home,0.0,0.0,1,1,1,0,0,1,0,2,0,1,0,0,0,0,0,0,1,10,1,12,173,6,65,1


In [37]:
# Styling helper: paints missing (NaN) cells red
def cor_nan(valor):
    """Return the CSS rule used to highlight a missing value.

    Meant to be called once per cell with a single scalar value.

    Parameters:
        valor: the cell value to inspect.

    Returns:
        'color: red' when ``pd.isna(valor)`` is true, otherwise an empty
        string (no styling).
    """
    return 'color: red' if pd.isna(valor) else ''

# Summary statistics for every column, whatever its dtype; statistics that
# do not apply to a column come back as NaN, and cor_nan renders those
# cells in red
descricao_dados = (
    dataframe
    .describe(include="all")
    .style
    .map(cor_nan)
)
descricao_dados

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,ObfuscationRatio,NoOfLettersInURL,LetterRatioInURL,NoOfDegitsInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,LargestLineLength,HasTitle,Title,DomainTitleMatchScore,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit,HasSocialNet,HasSubmitButton,HasHiddenFields,HasPasswordField,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
count,235795,235795,235795.0,235795,235795.0,235795.0,235795,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0
unique,235795,235370,,220086,,,695,,,,,,,,,,,,,,,,,,,,,,,197874.0,,,,,,,,,,,,,,,,,,,,,,,,,,
top,521848.txt,https://disclosepack.myportfolio.com/,,ipfs.io,,,com,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,1,2,,1197,,,112554,,,,,,,,,,,,,,,,,,,,,,,32719.0,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,,34.573095,,21.470396,0.002706,,78.430778,0.845508,0.260423,0.055747,2.764456,1.164758,0.002057,0.024861,0.000138,19.428919,0.515946,1.881011,0.028616,0.062241,0.029403,0.025056,2.340198,0.063309,0.782625,1141.900443,12789.532624,0.861261,,50.131427,52.122098,0.361768,0.266541,0.624513,0.133438,0.040107,0.440183,0.221765,1.588638,0.043987,0.45657,0.414301,0.377799,0.102263,0.127089,0.237007,0.023474,0.486775,26.075689,6.333111,10.522305,65.071113,2.377629,49.262516,0.571895
std,,,41.314153,,9.150793,0.051946,,28.976055,0.216632,0.251628,0.010587,0.599739,0.600969,0.045306,1.876249,0.003817,29.09033,0.123315,11.886695,0.070897,0.934704,0.193505,0.836448,3.527603,0.032393,0.412461,3419.950513,152201.099228,0.345675,,49.676981,49.600564,0.480513,0.442151,0.484249,0.340048,0.19621,0.49641,3.87054,5.762561,0.205067,0.498111,0.492602,0.484838,0.302994,0.333074,0.425247,0.151403,0.499826,79.411815,74.866296,22.312192,176.687539,17.641097,161.02743,0.494805
min,,,13.0,,4.0,0.0,,0.155574,0.0,0.0,0.001083,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,22.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,23.0,,16.0,0.0,,57.024793,0.68,0.005977,0.050747,2.0,1.0,0.0,0.0,0.0,10.0,0.435,0.0,0.0,0.0,0.0,0.0,1.0,0.038,1.0,18.0,200.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,,,27.0,,20.0,0.0,,100.0,1.0,0.079963,0.05797,3.0,1.0,0.0,0.0,0.0,14.0,0.519,0.0,0.0,0.0,0.0,0.0,1.0,0.05,1.0,429.0,1090.0,1.0,,75.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2.0,6.0,12.0,0.0,10.0,1.0
75%,,,34.0,,24.0,0.0,,100.0,1.0,0.522907,0.062875,3.0,1.0,0.0,0.0,0.0,20.0,0.594,0.0,0.0,0.0,0.0,0.0,3.0,0.083,1.0,1277.0,8047.0,1.0,,100.0,100.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,29.0,8.0,15.0,88.0,1.0,57.0,1.0
