## Importing necessary libraries

In [1]:
import pandas as pd
import tldextract
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Loading the data from the dataset

In [2]:
data = pd.read_csv('dataset/malicious_phish.csv')
print(data)

                                                      url        type
0                                        br-icloud.com.br    phishing
1                     mp3raid.com/music/krizz_kaliko.html      benign
2                         bopsecrets.org/rexroth/cr/1.htm      benign
3       http://www.garage-pirenne.be/index.php?option=...  defacement
4       http://adventure-nicaragua.net/index.php?optio...  defacement
...                                                   ...         ...
651188         www.gamespot.com/xbox360/action/deadspace/    phishing
651189      en.wikipedia.org/wiki/Dead_Space_(video_game)    phishing
651190          www.angelfire.com/goth/devilmaycrytonite/    phishing
651191                               zstoimchev.github.io      benign
651192                                   telefonservis.mk      benign

[651193 rows x 2 columns]


## Preprocessing the data
Removing all non-alpanumeric character and whitespaces from the 'url' column, except underscores

In [3]:
data['url'] = data['url'].str.replace('[^\w\s]','')
print(data)

                                                      url        type
0                                        br-icloud.com.br    phishing
1                     mp3raid.com/music/krizz_kaliko.html      benign
2                         bopsecrets.org/rexroth/cr/1.htm      benign
3       http://www.garage-pirenne.be/index.php?option=...  defacement
4       http://adventure-nicaragua.net/index.php?optio...  defacement
...                                                   ...         ...
651188         www.gamespot.com/xbox360/action/deadspace/    phishing
651189      en.wikipedia.org/wiki/Dead_Space_(video_game)    phishing
651190          www.angelfire.com/goth/devilmaycrytonite/    phishing
651191                               zstoimchev.github.io      benign
651192                                   telefonservis.mk      benign

[651193 rows x 2 columns]


## Still preprocessing
Extracting the domain & suffix from the URL

In [4]:
data['domain'] = data['url'].apply(lambda url: tldextract.extract(url).domain)
print(data['domain'])
data['suffix'] = data['url'].apply(lambda url: tldextract.extract(url).suffix)
print(data['suffix'])

0                   br-icloud
1                     mp3raid
2                  bopsecrets
3              garage-pirenne
4         adventure-nicaragua
                 ...         
651188               gamespot
651189              wikipedia
651190              angelfire
651191                 github
651192          telefonservis
Name: domain, Length: 651193, dtype: object
0         com.br
1            com
2            org
3             be
4            net
           ...  
651188       com
651189       org
651190       com
651191        io
651192        mk
Name: suffix, Length: 651193, dtype: object


## Vectorizing the data
Converting nominal into numeric attributes

In [5]:
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(data['domain'] + ' ' + data['suffix'])

## Splitting the data
Splitting the data by the type

In [6]:
X_train, X_test, y_train, y_test = train_test_split(features, data['type'], test_size=0.2, random_state=42)

## Training the model
Training the model using test size 20% and maximum iteration is 1000

In [7]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## Predicting function
Function to predict if a given URL is malicious or not

In [8]:
def predict(url):
    new_url = url
    new_features = vectorizer.transform([new_url])
    prediction = model.predict(new_features)
    print(prediction)

## Using the trained model
Once we trained our model, we can use it to predict new URLs

In [9]:
url = "youtube.com"
predict(url)

['benign']
