In [1]:
import pandas as pd
import re
from urllib.parse import urlparse

### 0. Loading the Data
- load the data from the dataset, as it is. It will be processed later

In [19]:
data = pd.read_csv('./dataset/malicious_phish.csv')
data # all the data from the dtsset is now loaded

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing
651190,www.angelfire.com/goth/devilmaycrytonite/,phishing
651191,http://www.garage-pirenne.be/index.php?option=...,defacement
651192,zstoimchev.github.io,benign


### 1. Data Preprocessing:
- Clean the Data: Remove any irrelevant or redundant information from your URLs.
  - remove missing values
  - remove unnecessary characters
  - keep consistent formatting
- Label Encoding: If your labels are not already in numerical format (0 for non-malicious, 1 for malicious), you’ll need to encode them.

In [None]:
# removing missing values
data.dropna(inplace=True)

# removing the protocol from the url
#protocols = ['sftp', 'smtp', 'snmp', 'http', 'https', 'ftp'] # protocols to be removed from the URLs
#pattern = r'(' + '|'.join(f'{protocol}://' for protocol in protocols) + ')' # regular expression to match any protocols in the URLs
#data['url'] = data['url'].apply(lambda url: re.sub(pattern, '', url)) # apply the regular expression and remove the protocol

# removing query parameters
#data['url'] = data['url'].apply(lambda x: x.split('?')[0])

# remove non-alphanumeric characters
#pattern = r'[^A-Za-z0-0_.]' # regular expression
#data['url'] = data['url'].apply(lambda x: re.sub(pattern, '', x)) # applying the RE
# not removing them because big count can suggest malicious URL
data # data without query parameters

# TODO: maybe a good idea is to convert the types from nominal to numeric 

### 2. Feature Extraction:
- Tokenize URLs: Break down each URL into its components (scheme, netloc, path, params, query, fragment).
- Extract Features: Some possible features could be the length of the URL, the number of special characters, or the presence of certain keywords.

In [14]:
# function to parse everything in single call for each url
def parse_url_funct(url):
    res = urlparse(url)
    #netloc = url.split('/')[0] if '://' not in url else res.netloc
    netloc = res.netloc
    path = res.path
    if not res.scheme:
        parts = res.path.lstrip('/').split('/', 1)
        netloc = parts[0]
        path = '/' + parts[1] if len(parts) > 1 else ''
    return pd.Series({
        'scheme': res.scheme,
        'netloc': netloc,
        'path': path,
        'params': res.params,
        'query': res.query,
        'fragment': res.fragment
    })
# calculating the URL components
data[['scheme', 
      'netloc', 
      'path', 
      'params', 
      'query', 
      'fragment']] = data['url'].apply(parse_url_funct)
data

# extracting features, like length, # special characters, keywords, etc...
data['length'] = data['url'].apply(lambda x: len(x))

def count_special_chars(string):
    special_chars = "!@#$%^&*()-_=+[]{}|;:,.<>?/`~"
    return sum(1 for char in string if char in special_chars)
data['specchar'] = data['url'].apply(count_special_chars)


In [22]:
# extracting features, like length, # special characters, keywords, etc...
data['length'] = data['url'].apply(lambda x: len(x))

def count_special_chars(string):
    special_chars = r'[^A-Za-z0-0_.]'
    return sum(1 for char in string if char in special_chars)
data['specchar'] = data['url'].apply(count_special_chars)

In [23]:
data

Unnamed: 0,url,type,length,specchar
0,br-icloud.com.br,phishing,16,3
1,mp3raid.com/music/krizz_kaliko.html,benign,35,7
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,2
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,12
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,21
...,...,...,...,...
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,9
651190,www.angelfire.com/goth/devilmaycrytonite/,phishing,41,4
651191,http://www.garage-pirenne.be/index.php?option=...,defacement,88,12
651192,zstoimchev.github.io,benign,20,3


In [21]:
data

Unnamed: 0,url,type,length,specchar
0,br-icloud.com.br,phishing,16,3
1,mp3raid.com/music/krizz_kaliko.html,benign,35,5
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,5
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,18
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,14
...,...,...,...,...
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,9
651190,www.angelfire.com/goth/devilmaycrytonite/,phishing,41,5
651191,http://www.garage-pirenne.be/index.php?option=...,defacement,88,18
651192,zstoimchev.github.io,benign,20,2


### 3. Split the Dataset: 
- Divide your data into a training set and a testing set. A common ratio is 80% for training and 20% for testing.

### 4. Choose a Model: 
- There are many machine learning algorithms you can choose from. Decision trees, random forest, and logistic regression are a few options.

### 5. Train the Model: 
- Feed your training data into the model and allow it to learn from the features and corresponding labels.

### 6. Evaluate the Model: 
- Use your testing data to evaluate the performance of your model. Common metrics include accuracy, precision, recall, and the F1 score.

### 7. Optimize: 
- Based on your evaluation, you may need to go back and adjust your feature extraction, choose a different model, or fine-tune your model parameters.