In [31]:
import re
from urllib.parse import urlparse

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### 0. Loading the Data
- Load the data from the dataset as-is. It is processed in the later steps.

In [32]:
# Read the raw CSV unchanged; cleaning and encoding happen in later cells.
dataset_path = './dataset/malicious_phish.csv'
data = pd.read_csv(dataset_path)
data  # preview of the freshly loaded, unprocessed dataset

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing
651190,www.angelfire.com/goth/devilmaycrytonite/,phishing
651191,http://www.garage-pirenne.be/index.php?option=...,defacement
651192,zstoimchev.github.io,benign


### 1. Data Preprocessing:
- Clean the Data: Remove any irrelevant or redundant information from your URLs.
  - remove missing values
  - remove unnecessary characters
  - keep consistent formatting
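- Label Encoding: If your labels are not already in numerical format (0 for non-malicious, 1 for malicious), you’ll need to encode them. In this dataset the `type` column holds several text classes (e.g. benign, defacement, phishing), so each class is mapped to its own integer code rather than to a binary 0/1 flag.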

In [33]:
# Drop every row that contains a missing value.
data.dropna(inplace=True)

# Turn the nominal `type` label into integer codes.
# In the preview below 0=benign, 1=defacement and 3=phishing; code 2 belongs to a
# class that is not visible in the truncated preview (presumably "malware" --
# confirm by inspecting `le.classes_`).
le = LabelEncoder()
le.fit(data['type'])
data['type'] = le.transform(data['type'])
data

Unnamed: 0,url,type
0,br-icloud.com.br,3
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),3
651190,www.angelfire.com/goth/devilmaycrytonite/,3
651191,http://www.garage-pirenne.be/index.php?option=...,1
651192,zstoimchev.github.io,0


### 2. Feature Extraction:
- Tokenize URLs: Break down each URL into its components (scheme, netloc, path, params, query, fragment).
- Extract Features: Some possible features could be the length of the URL, the number of special characters, or the presence of certain keywords.

In [34]:
# function to parse everything in single call for each url
def parse_url_funct(url):
    """Split a URL string into its six standard components.

    The dataset mixes fully qualified URLs (``http://host/path``) with bare
    ones (``host/path``). ``urlparse`` only recognises a network location when
    it is introduced by ``//``, so for bare URLs the host ends up at the front
    of the path; in that case the first path segment is promoted to ``netloc``.

    Parameters
    ----------
    url : str
        The raw URL to parse.

    Returns
    -------
    pd.Series
        Values indexed by ``scheme``, ``netloc``, ``path``, ``params``,
        ``query`` and ``fragment``.
    """
    res = urlparse(url)
    netloc = res.netloc
    path = res.path
    # Fall back to the first path segment only when urlparse found neither a
    # scheme nor a host. Checking the scheme alone would overwrite the correctly
    # parsed host of scheme-relative URLs such as "//example.com/a".
    if not res.scheme and not netloc:
        host, slash, remainder = path.lstrip('/').partition('/')
        netloc = host
        # `slash` is empty when there is no path after the host, giving ''.
        path = slash + remainder
    return pd.Series({
        'scheme': res.scheme,
        'netloc': netloc,
        'path': path,
        'params': res.params,
        'query': res.query,
        'fragment': res.fragment
    })
# Parse every URL once and store the six components as new columns.
url_component_cols = ['scheme', 'netloc', 'path', 'params', 'query', 'fragment']
data[url_component_cols] = data['url'].apply(parse_url_funct)
data

# Derived features (length, number of special characters, keywords, ...).
# `length` is the character count of the full URL string.
data['length'] = data['url'].map(len)

def count_special_chars(string):
    """Count the characters that are not ASCII letters, digits, ``_`` or ``.``.

    Parameters
    ----------
    string : str
        Text to inspect (here: a URL).

    Returns
    -------
    int
        Number of characters outside ``[A-Za-z0-9_.]``; 0 for an empty string.
    """
    # The pattern has to be evaluated as a regular expression. A plain
    # `char in pattern` membership test would instead count the literal
    # characters that happen to appear in the pattern text.
    return len(re.findall(r'[^A-Za-z0-9_.]', string))
# `specchar` holds the result of count_special_chars for each URL.
data['specchar'] = data['url'].map(count_special_chars)


### 3. Split the Dataset: 
- Divide your data into a training set and a testing set. A common ratio is 80% for training and 20% for testing.

In [37]:
# Separate the features from the target before splitting.
# The target is the encoded `type` label. The raw `url` string is dropped from the
# features, and `type` is dropped as well so the label does not leak into X.
# NOTE(review): the URL-component columns produced by parse_url_funct are still
# text and will presumably need encoding (or dropping) before model fitting.
X = data.drop(columns=['url', 'type'])  # features
y = data['type']  # target variable

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition.
print(f'Training set shape: {train.shape}')
print(f'Testing set shape: {test.shape}')

Training set shape: (520955, 10)
Testing set shape: (130239, 10)


### 4. Choose a Model: 
- There are many machine learning algorithms you can choose from. Decision trees, random forest, and logistic regression are a few options.

In [None]:

# Instantiate the estimator to evaluate.
# NOTE(review): `YourModel` is a placeholder that is not defined or imported in the
# visible notebook -- replace it with a concrete estimator before running this cell.
model = YourModel()

# Score the estimator with cross-validation (cv=5) on the training split.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)

# Summarize the per-fold scores as mean and standard deviation.
print(f"Accuracy: {scores.mean():.2f} (+/- {scores.std():.2f})")

### 5. Train the Model: 
- Feed your training data into the model and allow it to learn from the features and corresponding labels.

### 6. Evaluate the Model: 
- Use your testing data to evaluate the performance of your model. Common metrics include accuracy, precision, recall, and the F1 score.

### 7. Optimize: 
- Based on your evaluation, you may need to go back and adjust your feature extraction, choose a different model, or fine-tune your model parameters.