In [1]:
import pandas as pd
from urllib.parse import urlparse
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


### 0. Loading the Data
- load the data from the dataset, as it is. It will be processed later

In [2]:
data = pd.read_csv('./dataset/malicious_phish.csv')
#data # all the data from the dtsset is now loaded

### 1. Data Preprocessing:
- Clean the Data: Remove any irrelevant or redundant information from your URLs.
  - remove missing values
  - remove unnecessary characters
  - keep consistent formatting
- Label Encoding: If your labels are not already in numerical format (0 for non-malicious, 1 for malicious), you’ll need to encode them.

In [3]:
# removing missing values
data.dropna(inplace=True)

# converting nominal type into numeric: 0=benign, 1=defacement, 3=phishing
le = LabelEncoder() # Create an instance of LabelEncoder
data['type'] = le.fit_transform(data['type'])
#data

### 2. Feature Extraction:
- Tokenize URLs: Break down each URL into its components (scheme, netloc, path, params, query, fragment).
- Extract Features: Some possible features could be the length of the URL, the number of special characters, or the presence of certain keywords.

In [4]:
# function to parse everything in single call for each url
def parse_url_funct(url):
    res = urlparse(url)
    netloc = res.netloc
    path = res.path
    if not res.scheme:
        parts = res.path.lstrip('/').split('/', 1)
        netloc = parts[0]
        path = '/' + parts[1] if len(parts) > 1 else ''
    return pd.Series({
        'scheme': res.scheme,
        'netloc': netloc,
        'path': path,
        'params': res.params,
        'query': res.query,
        'fragment': res.fragment
    })
# calculating the URL components
data[['scheme', 
      'netloc', 
      'path', 
      'params', 
      'query', 
      'fragment']] = data['url'].apply(parse_url_funct)
data

# extracting features, like length, # special characters, keywords, etc...
data['length'] = data['url'].apply(lambda x: len(x))

def count_special_chars(string):
    special_chars = r'[^A-Za-z0-9_.]'
    return sum(1 for char in string if char in special_chars)
data['specchar'] = data['url'].apply(count_special_chars)
#data

# encoding nominal into numeric tokens
le = LabelEncoder()
for col in ['scheme', 'netloc', 'path', 'params', 'query', 'fragment']:
    data[col] = le.fit_transform(data[col])

### 3. Split the Dataset: 
- Divide your data into a training set and a testing set. A common ratio is 80% for training and 20% for testing.

In [5]:
X = data.drop(['url', 'type'], axis=1)  # assuming 'url' is the target variable (for now)
y = data['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
print(X_train.shape)
print(y.shape)

(520955, 8)
(651194,)


In [7]:
# Split the dataset into training and testing sets
#train, test = train_test_split(data, test_size=0.2, random_state=42)
#
# Print the shapes of the training and testing sets
#print(f'Training set shape: {train.shape}')
#print(f'Testing set shape: {test.shape}')

### 4. Choose a Model: 
- There are many machine learning algorithms you can choose from. Decision trees, random forest, and logistic regression are a few options.

In [8]:
model = RandomForestClassifier() # chosing model - random forest clasif...

### 5. Train the Model: 
- Feed your training data into the model and allow it to learn from the features and corresponding labels.

In [9]:
model.fit(X_train, y_train) # training the model

### 6. Evaluate the Model: 
- Use your testing data to evaluate the performance of your model. Common metrics include accuracy, precision, recall, and the F1 score.

In [11]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     85604
           1       0.96      0.99      0.97     19370
           2       0.97      0.92      0.95      6533
           3       0.92      0.83      0.88     18732

    accuracy                           0.96    130239
   macro avg       0.96      0.93      0.94    130239
weighted avg       0.96      0.96      0.96    130239



### 7. Optimize: 
- Based on your evaluation, you may need to go back and adjust your feature extraction, choose a different model, or fine-tune your model parameters.

In [None]:
data

In [12]:
def is_url_malicious(url):
    # Parse the URL
    parsed_url = parse_url_funct(url)
    
    # Calculate the length and special characters
    parsed_url['length'] = len(url)
    parsed_url['specchar'] = count_special_chars(url)
    
    # Encode the nominal tokens
    for col in ['scheme', 'netloc', 'path', 'params', 'query', 'fragment']:
        parsed_url[col] = le.fit_transform([parsed_url[col]])
    
    # Predict if the URL is malicious
    prediction = model.predict([parsed_url])
    
    # Return True if the URL is predicted to be malicious, False otherwise
    return prediction[0] != 0  # assuming '0' is benign


In [17]:
url = "https://zstoimchev.github.io"
url2 = "telefonservis.mk"
url3 = "dobridaskalov.edu.mk"
url4 = "www.famnit.upr.si"
url5 = "google.com"
if is_url_malicious(url):
    print(f"The URL {url5} is malicious.")
else:
    print(f"The URL {url5} is not malicious.")


The URL google.com is malicious.


