import Neccessary libraries


In [1]:
import re
import pandas as pd
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

Load the data

In [2]:
df = pd.read_csv('malicious_dataset.csv')

In [3]:

# df.head()
# df.tail()
# df.describe()
# duplicates=df[df.duplicated('url')]
# duplicates
# df.shape
# df.isnull().any()

# df[df['url'].isin([duplicates])]
# df = df.drop_duplicates()
# df[df['url'].duplicated()==1]
# df
# Remove duplicates and null values
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

df.shape



(659953, 24)

Extract features


In [4]:

def use_of_ip(url):
    ipv4_pattern = r'(\d{1,3}\.){3}\d{1,3}'
    ipv6_pattern = r'([a-fA-F0-9:]+:+)+[a-fA-F0-9]+'
    return 1 if re.search(ipv4_pattern, url) or re.search(ipv6_pattern, url) else 0

def abnormal_url(url):
    try:
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname
        if not hostname:
            return 1
        return 0 if hostname in url and url.index(hostname) == url.find(parsed_url.netloc) else 1
    except:
        return 1

def count_character(url, char):
    return url.count(char)

def domain_length(url):
    return len(urlparse(url).netloc)

def short_url(url):
    return 1 if len(url) < 20 else 0

def no_of_embed(url):
    return urlparse(url).path.count('//')

def count_https(url):
    return url.count('https')

def count_http(url):
    return url.count('http')

def suspicious_words(url):
    return 1 if re.search(
        "account|alert|api|auth|bank|bonus|cmd|confirm|credit|dashboard|discount|download|exe|fake|"
        "free|gift|hacked|important|invoice|key|limited|login|malware|money|offer|password|pay|payment|"
        "paypal|prize|promo|recovery|redeem|refund|register|reset|reward|root|secure|setup|signin|shell|"
        "special|support|token|update|urgent|verify|warning|winner", url) else 0

def fd_length(url):
    urlpath = urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except:
        return 0

def legitimate_TLD(url):
    # List of legitimate TLDs
    legitimate_tlds = (
        ".com|.org|.net|.info|.biz|.name|.pro|.edu|.gov|.mil|.us|.uk|.ca|.au|"
        ".in|.de|.fr|.jp|.cn|.ru|.br|.za|.nz|.mx|.sg|.tech|.app|.io|.ai|.dev|"
        ".online|.store|.blog|.design|.law|.health|.hotel|.travel|.bank|"
        ".finance|.insurance|.media|.agency|.realty|.arpa|.pharmacy|.tv|.me|.cc"
    )

    # Use regex to search for the legitimate TLDs in the URL
    return 0 if re.search(r'\b(' + legitimate_tlds + r')\b', url, re.IGNORECASE) else 1      

def extract_features(df):
    df['use_of_ip'] = df['url'].apply(use_of_ip)
    df['abnormal_url'] = df['url'].apply(abnormal_url)
    df['count.'] = df['url'].apply(lambda x: count_character(x, '.'))
    df['count-www'] = df['url'].apply(lambda x: count_character(x, 'www'))
    df['count@'] = df['url'].apply(lambda x: count_character(x, '@'))
    df['count-dir'] = df['url'].apply(lambda x: count_character(x, '/'))
    df['short_url'] = df['url'].apply(short_url)
    df['url_length'] = df['url'].apply(len)
    df['hostname_length'] = df['url'].apply(domain_length)
    df['count-'] = df['url'].apply(lambda x: count_character(x, '-'))
    df['count='] = df['url'].apply(lambda x: count_character(x, '='))
    df['count?'] = df['url'].apply(lambda x: count_character(x, '?'))
    df['count%'] = df['url'].apply(lambda x: count_character(x, '%'))
    df['count-digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in x))
    df['count-letters'] = df['url'].apply(lambda x: sum(c.isalpha() for c in x))
    df['count_embed_domian'] = df['url'].apply(no_of_embed)
    df['count-https'] = df['url'].apply(count_https)
    df['count-http'] = df['url'].apply(count_http)
    df['sus_url'] = df['url'].apply(suspicious_words)
    df['fd_length'] = df['url'].apply(fd_length)
    df['legitimate_TLD'] = df['url'].apply(legitimate_TLD)
    # Only process 'type' during training
    if 'type' in df.columns:
        from sklearn.preprocessing import LabelEncoder
        lb_make = LabelEncoder()
        df["url_type"] = lb_make.fit_transform(df["type"])
    
    return df


df = extract_features(df)

df.head()

Unnamed: 0,url,type,use_of_ip,abnormal_url,count.,count-www,count@,count-dir,short_url,url_length,...,count%,count-digits,count-letters,count_embed_domian,count-https,count-http,sus_url,fd_length,legitimate_TLD,url_type
0,br-icloud.com.br,phishing,0,1,2,0,0,0,1,16,...,0,0,13,0,0,0,0,0,0,3
1,mp3raid.com/music/krizz_kaliko.html,benign,0,1,2,0,0,2,0,35,...,0,1,29,0,0,0,0,5,0,0
2,http://9779.info/%E6%A0%91%E5%8F%B6%E7%B2%98%E...,malware,0,0,1,0,0,4,0,63,...,15,21,21,0,0,1,0,45,0,2
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,0,3,1,0,3,0,88,...,0,7,63,0,0,1,0,9,1,1
4,bopsecrets.org/rexroth/cr/1.htm,benign,0,1,2,0,0,3,0,31,...,0,1,25,0,0,0,0,7,0,0


In [4]:
column = 'type'
values_count = df[column].value_counts()
values_count

type
benign        443906
phishing       97094
defacement     95308
malware        23645
Name: count, dtype: int64

In [5]:
df.isnull().any()

url                   False
type                  False
use_of_ip             False
abnormal_url          False
count.                False
count-www             False
count@                False
count-dir             False
short_url             False
url_length            False
hostname_length       False
count-                False
count=                False
count?                False
count%                False
count-digits          False
count-letters         False
count_embed_domian    False
count-https           False
count-http            False
sus_url               False
fd_length             False
legitimate_TLD        False
url_type              False
dtype: bool

Split the data for test and train

In [6]:
# Model Training
x = df[['use_of_ip', 'abnormal_url', 'count.', 'count-www', 'count@',
        'count-dir', 'count_embed_domian', 'short_url', 'count-https',
        'count-http', 'count%', 'count?', 'count-', 'count=', 'url_length',
        'hostname_length', 'sus_url', 'fd_length', 'count-digits',
        'count-letters', 'legitimate_TLD']]
y = df['url_type']

# Train-Test Split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Train the model

In [7]:

#randomforest
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf_preds = rf.predict(x_test)
rf_acc = accuracy_score(y_test, rf_preds)
rf_acc


# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score

# # XGBoost Model Training
# xgb = XGBClassifier()
# xgb.fit(x_train, y_train)
# xgb_preds = xgb.predict(x_test)
# xgb_acc = accuracy_score(y_test, xgb_preds)

# from sklearn.linear_model import LogisticRegression

# #LogisticRegression
# lr = LogisticRegression(max_iter=10000000)
# lr.fit(x_train, y_train)
# lr_preds = lr.predict(x_test)
# lr_acc = accuracy_score(y_test, lr_preds)


# #lightbgm
# from lightgbm import LGBMClassifier
# from sklearn.metrics import accuracy_score

# # 1. Initialize the LightGBM classifier
# lightgbm = LGBMClassifier()

# # 2. Train the model using the training data
# lightgbm.fit(x_train, y_train)

# # 3. Make predictions on the test data
# lightgbm_preds = lightgbm.predict(x_test)

# # 4. Evaluate the predictions using accuracy score
# accuracy = accuracy_score(y_test, lightgbm_preds)

# # Print the accuracy
# print(f"Accuracy: {accuracy}")


0.9508148282837466

In [8]:

model = rf
y_pred = model.predict(x_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97     88810
           1       0.97      0.99      0.98     18861
           2       0.98      0.91      0.94      4708
           3       0.85      0.83      0.84     19612

    accuracy                           0.95    131991
   macro avg       0.94      0.93      0.93    131991
weighted avg       0.95      0.95      0.95    131991



Export the model

In [9]:
 joblib.dump(rf, 'best_model.pkl')

['best_model.pkl']