In [1]:
import pandas as pd
import tldextract
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the labelled URL dataset from disk into a DataFrame and show it.
csv_path = 'dataset/malicious_phish.csv'
data = pd.read_csv(csv_path)
print(data)

                                                      url        type
0                                        br-icloud.com.br    phishing
1                     mp3raid.com/music/krizz_kaliko.html      benign
2                         bopsecrets.org/rexroth/cr/1.htm      benign
3       http://www.garage-pirenne.be/index.php?option=...  defacement
4       http://adventure-nicaragua.net/index.php?optio...  defacement
...                                                   ...         ...
651186            xbox360.ign.com/objects/850/850402.html    phishing
651187       games.teamxbox.com/xbox-360/1860/Dead-Space/    phishing
651188         www.gamespot.com/xbox360/action/deadspace/    phishing
651189      en.wikipedia.org/wiki/Dead_Space_(video_game)    phishing
651190          www.angelfire.com/goth/devilmaycrytonite/    phishing

[651191 rows x 2 columns]


In [3]:
# Preprocess the URLs: trim only leading/trailing whitespace.
#
# The previous `str.replace('[^\w\s]', '')` call was version-dependent:
#   * with regex matching disabled (the pandas >= 2.0 default) the pattern is
#     treated as a literal string and nothing is replaced;
#   * with regex matching enabled it strips '.', '/', ':' and '-', which are
#     the delimiters the later tldextract step needs to find domain and suffix.
# Stripping surrounding whitespace behaves the same on every pandas version
# and leaves the URL structure intact.
data['url'] = data['url'].str.strip()
print(data)

                                                      url        type
0                                        br-icloud.com.br    phishing
1                     mp3raid.com/music/krizz_kaliko.html      benign
2                         bopsecrets.org/rexroth/cr/1.htm      benign
3       http://www.garage-pirenne.be/index.php?option=...  defacement
4       http://adventure-nicaragua.net/index.php?optio...  defacement
...                                                   ...         ...
651186            xbox360.ign.com/objects/850/850402.html    phishing
651187       games.teamxbox.com/xbox-360/1860/Dead-Space/    phishing
651188         www.gamespot.com/xbox360/action/deadspace/    phishing
651189      en.wikipedia.org/wiki/Dead_Space_(video_game)    phishing
651190          www.angelfire.com/goth/devilmaycrytonite/    phishing

[651191 rows x 2 columns]


In [4]:
# Parse every URL exactly once and reuse the result for both columns;
# calling tldextract.extract separately for the domain and for the suffix
# would repeat the same parsing work for each row.
parsed_urls = [tldextract.extract(url) for url in data['url']]

data['domain'] = [parts.domain for parts in parsed_urls]  # registered domain name, e.g. 'mp3raid'
print(data['domain'])
data['suffix'] = [parts.suffix for parts in parsed_urls]  # public suffix, e.g. 'com' or 'com.br'
print(data['suffix'])

0                   br-icloud
1                     mp3raid
2                  bopsecrets
3              garage-pirenne
4         adventure-nicaragua
                 ...         
651186                    ign
651187               teamxbox
651188               gamespot
651189              wikipedia
651190              angelfire
Name: domain, Length: 651191, dtype: object
0         com.br
1            com
2            org
3             be
4            net
           ...  
651186       com
651187       com
651188       com
651189       org
651190       com
Name: suffix, Length: 651191, dtype: object


In [24]:
# Vectorize the data: convert the nominal domain/suffix strings into numerical attributes.
# Each row's text is "<domain> <suffix>", joined by a single space.
domain_text = data['domain'] + ' ' + data['suffix']
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(domain_text)

In [25]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    data['type'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter=1000 sets the cap on solver iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)