# Import Dependencies

In [14]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import re
import random
from sklearn.metrics import accuracy_score

In [15]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [16]:
#this function is taken from https://github.com/faizann24/Using-machine-learning-to-detect-malicious-URLs
def getTokens(input):
    tokensBySlash = str(input.encode('utf-8')).split('/')
    allTokens = []
    for i in tokensBySlash:
        tokens = str(i).split('-')
        tokensByDot = []
        for j in range(0,len(tokens)):
            tempTokens = str(tokens[j]).split('.')
            tokensByDot = tokensByDot + tempTokens
        allTokens = allTokens + tokens + tokensByDot
    allTokens = list(set(allTokens))
    if 'com' in allTokens:
        allTokens.remove('com')
    return allTokens

#function to remove "http://" from URL
def trim(url):
    return re.match(r'(?:\w*://)?(?:.*\.)?([a-zA-Z-1-9]*\.[a-zA-Z]{1,}).*', url).groups()[0]

# Prepare Dataset

In [17]:
#read from a file
data = pd.read_csv("dataNN.csv",',',error_bad_lines=False)	#reading file
data['url'].values

array(['diaryofagameaddict.com', 'espdesign.com.au', 'iamagameaddict.com',
       ..., 'owens.edu/news-releases/?p=2052',
       '1.safesecureweb.com/egale/index.asp?item=1173',
       'yurika.otakuthon.com/reg/main.pl/en/'], dtype=object)

In [18]:
len(data)

388447

In [20]:
#convert it into numpy array and shuffle the dataset
data = np.array(data)
random.shuffle(data)


In [21]:
#convert text data into numerical data for machine learning models
y = [d[1] for d in data]
corpus = [d[0] for d in data]
vectorizer = TfidfVectorizer(tokenizer=getTokens)
X = vectorizer.fit_transform(corpus)



In [22]:
#split the data set inot train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train Machine Learning Models 

In [23]:
#1 - Logistic Regression
model = LogisticRegression(C=1)
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
print(model.score(X_test,y_test))

0.9927017634187154


In [25]:
#save the model and vectorizer
joblib.dump(model, "mal-logireg1.pkl", protocol=2)
joblib.dump(vectorizer, "vectorizer1.pkl", protocol=2)

['vectorizer1.pkl']

In [26]:
#make prediction
a = "https://www.clearnetwork.com"
aa = vectorizer.transform([trim(a)])
s = model.predict(aa)
s[0] #0 for good

0

# Further experiment

In [27]:
index = int(0.3 * len(data))

In [29]:
from sklearn.utils import shuffle
data = pd.read_csv("dataNN.csv",',',error_bad_lines=False)

data = shuffle(data)
url_train = data['url'][index:].values
label_train = data['label'][index:].values
url_test = data['url'][:index].values
label_test = data['label'][:index].values

In [30]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
        ("vectorizer", TfidfVectorizer(tokenizer=getTokens)),
        ("classifier", LogisticRegression())])

pipeline.fit(url_train, label_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
pipeline.score(url_test, label_test)

0.9707124101120703