In [101]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import ipaddress
import tldextract
from urllib.parse import urlparse

In [78]:
benign_df = pd.read_csv("benign_url.csv")

In [79]:
benign_df['URL'] = benign_df['URL'].str.replace(r'https://', '')
benign_df['URL'] = benign_df['URL'].str.replace(r'http://', '')

In [80]:
malicious_df = pd.read_csv('url-list.txt', header=None)
malicious_df = malicious_df.rename(columns = {0: "URL"})
malicious_df = malicious_df.sample(5000).reset_index(drop=True)

In [81]:
malicious_df.loc[~malicious_df['URL'].str.endswith(('.php', '.html', '/')), 'URL'] = malicious_df.loc[~malicious_df['URL'].str.endswith(('.php', '.html', '/')), 'URL'] + '/'

In [82]:
malicious_df['Lable'] = [1 for i in range(len(malicious_df))]

In [83]:
def urllen(url):
    return len(url)

In [84]:
def dotcount(url):
    return url.count('.')

In [85]:
def delimitercount(url):
    return url.count(';') + url.count('_') + url.count('?') + url.count('=') + url.count('&')

In [86]:
def ipcheck(url):
    try:
        if ipaddress.ip_address(url):
            return 1
    except:
        return 0

In [87]:
def hyphencount(url):
    return url.count('-')

In [88]:
def atcount(url):
    return url.count('@')

In [89]:
def subdircount(url):
    return url.count('/')

In [90]:
def subdomaincount(subdomain):
    if subdomain:
        return len(subdomain.split('.'))
    else:
        return 0

In [91]:
def querycount(query):
    if query:
        return len(query.split('&'))
    else:
        return 0

In [92]:
def preprocess(df):
    for i in range(len(df)):
        url = str(df.loc[i, "URL"])
        ext = tldextract.extract(url)
        path = urlparse(url)

        df.loc[i, "urllen"] = urllen(url)
        df.loc[i, "dotcount"] = dotcount(ext.subdomain)
        df.loc[i, "delimitercount"] = delimitercount(url)
        df.loc[i, "ipcheck"] = ipcheck(ext.domain)
        df.loc[i, "hyphencount"] = hyphencount(path.netloc)
        df.loc[i, "atcount"] = atcount(path.netloc)
        df.loc[i, "subdircount"] = subdircount(path.path)
        df.loc[i, "subdomaincount"] = subdomaincount(ext.subdomain)
        df.loc[i, "querycount"] = querycount(path.query)
        
    return df

In [93]:
benign_df = preprocess(benign_df)

In [94]:
malicious_df = preprocess(malicious_df)

In [99]:
df = pd.concat([benign_df, malicious_df]).reset_index(drop=True)

In [144]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [103]:
df['Lable'].value_counts()

1    5000
0    3494
Name: Lable, dtype: int64

In [104]:
X = df.drop(["URL", "Lable"], axis=1).values
y = df["Lable"].values

X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2)

# Adaboost

In [154]:
ada_model = AdaBoostClassifier().fit(X_train, y_train)

In [155]:
y_pred = ada_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8928781636256622

In [156]:
ada_params = { "n_estimators": [10, 50, 100, 500],
               "learning_rate": [0.01, 0.1, 1.0],}
ada_model = AdaBoostClassifier()
ada_cv_model = GridSearchCV(ada_model, ada_params, cv=10, n_jobs=-1).fit(X_train, y_train)
ada_cv_model.best_params_

{'learning_rate': 1.0, 'n_estimators': 500}

In [157]:
ada_tuned = AdaBoostClassifier(learning_rate=1.0, n_estimators=500).fit(X_train, y_train)
y_pred = ada_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87       713
           1       0.91      0.90      0.91       986

    accuracy                           0.89      1699
   macro avg       0.89      0.89      0.89      1699
weighted avg       0.89      0.89      0.89      1699



In [158]:
accuracy_score(y_test, y_pred)

0.8928781636256622

# DecisionTree

In [159]:
cart_model = DecisionTreeClassifier().fit(X_train, y_train)

In [160]:
y_pred = cart_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8940553266627428

In [161]:
cart_params = {"max_depth": range(1, 5), "min_samples_split": list(range(2, 20))}
cart = DecisionTreeClassifier()
cart_cv_model = GridSearchCV(cart, cart_params, cv=10, n_jobs=-1).fit(X_train, y_train)
cart_cv_model.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [162]:
cart_tuned = DecisionTreeClassifier(max_depth=4, min_samples_split=2).fit(X_train, y_train)
y_pred = cart_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       713
           1       0.91      0.91      0.91       986

    accuracy                           0.90      1699
   macro avg       0.90      0.90      0.90      1699
weighted avg       0.90      0.90      0.90      1699



In [163]:
accuracy_score(y_test, y_pred)

0.898175397292525

# Random Forest

In [164]:
rf_model = RandomForestClassifier().fit(X_train, y_train)

In [165]:
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8946439081812831

In [166]:
rf_params = {'max_depth': list(range(1, 5)), 'max_features': [1, 2, 3], 'n_estimators': [25, 35, 50]}
rf_model = RandomForestClassifier()
rf_cv_model = GridSearchCV(rf_model, rf_params, cv=10, n_jobs=-1).fit(X_train, y_train)
rf_cv_model.best_params_

{'max_depth': 4, 'max_features': 3, 'n_estimators': 35}

In [167]:
rf_tuned = RandomForestClassifier(max_depth= 4, max_features= 3, n_estimators= 25).fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87       713
           1       0.92      0.90      0.91       986

    accuracy                           0.89      1699
   macro avg       0.89      0.89      0.89      1699
weighted avg       0.89      0.89      0.89      1699



In [168]:
accuracy_score(y_test, y_pred)

0.8934667451442024

# Logistic Regression

In [169]:
logistic_model = LogisticRegression(solver="liblinear").fit(X_train, y_train)

In [170]:
y_pred = logistic_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8964096527369041

In [171]:
logistic_params = {"penalty": ["l1", "l2"]}
logistic_model = LogisticRegression(solver="liblinear")
logistic_cv_model = GridSearchCV(logistic_model, logistic_params, cv=10).fit(X_train, y_train)
logistic_cv_model.best_params_

{'penalty': 'l1'}

In [172]:
logistic_tuned = LogisticRegression(solver="liblinear", penalty="l1").fit(X_train, y_train)
y_pred = logistic_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88       713
           1       0.91      0.91      0.91       986

    accuracy                           0.90      1699
   macro avg       0.89      0.89      0.89      1699
weighted avg       0.90      0.90      0.90      1699



In [173]:
accuracy_score(y_test, y_pred)

0.8964096527369041

# XGBoost

In [184]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [185]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8952324896998234

In [186]:
xgb_params = {'n_estimators': [100, 250, 500],
              'max_depth': [3, 5, 10],
              'learning_rate': [0.01, 0.1],
}
xgb_model = XGBClassifier()
xgb_cv_model = GridSearchCV(xgb_model, xgb_params, cv=10, n_jobs=-1).fit(X_train, y_train)
xgb_cv_model.best_params_

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}

In [187]:
xgb_tuned = XGBClassifier(max_depth=5, max_features=50, n_estimators=100).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

Parameters: { "max_features" } are not used.

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       713
           1       0.92      0.90      0.91       986

    accuracy                           0.89      1699
   macro avg       0.89      0.89      0.89      1699
weighted avg       0.90      0.89      0.89      1699



In [188]:
accuracy_score(y_test, y_pred)

0.8946439081812831

# Gradient Boosting

In [179]:
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

In [180]:
y_pred = gbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8934667451442024

In [181]:
gbm_params = {"learning_rate": [0.001, 0.01, 0.1], 
              "n_estimators": [100, 250, 500], 
              "max_depth": [3, 5, 10]}
gbm_model = GradientBoostingClassifier()
gbm_cv_model = GridSearchCV(gbm_model, gbm_params, cv=10, n_jobs=-1).fit(X_train, y_train)
gbm_cv_model.best_params_

{'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 500}

In [182]:
gbm_tuned = GradientBoostingClassifier(max_depth=5, max_features=50, n_estimators=100).fit(X_train, y_train)
y_pred = gbm_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       713
           1       0.91      0.90      0.90       986

    accuracy                           0.89      1699
   macro avg       0.89      0.89      0.89      1699
weighted avg       0.89      0.89      0.89      1699



In [183]:
accuracy_score(y_test, y_pred)

0.8887580929958799

In [189]:
joblib.dump(cart_tuned, 'cart_tuned.pkl')
best = joblib.load('cart_tuned.pkl')
best

In [191]:
columns = df.columns

In [203]:
output = pd.DataFrame(columns=columns)
output.loc[0, "URL"] = 'google.ru/shell.php'

In [204]:
output = preprocess(output)
output.drop(columns=["URL", "Lable"], inplace=True)
output

Unnamed: 0,urllen,dotcount,delimitercount,ipcheck,hyphencount,atcount,subdircount,subdomaincount,querycount
0,19,0,0,0,0,0,1,0,0


In [207]:
result = best.predict(output)
result.item()



1

In [208]:
df.columns

Index(['URL', 'Lable', 'urllen', 'dotcount', 'delimitercount', 'ipcheck',
       'hyphencount', 'atcount', 'subdircount', 'subdomaincount',
       'querycount'],
      dtype='object')