In [31]:
# EDA Packages
import pandas as pd

from urllib.parse import urlparse

# sklearn libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# deployment libraries
import pickle


In [32]:
# Load URL Data 
urls_data = pd.read_csv("data/urldata.csv")

In [33]:
type(urls_data)

pandas.core.frame.DataFrame

In [34]:
urls_data.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [35]:
def makeTokens(f):
    tokens_slash = str(f.encode('utf-8')).split('/') # make tokens after splitting by slash
    total_tokens = []
    scheme = ""
    url_parsed = urlparse(f)
    scheme = url_parsed.scheme
    for i in tokens_slash:
        # split tokens by dash character
        tokens = str(i).split('-') 
        tokens_dot = []
        for j in range(0,len(tokens)):
            # split tokens by dot
            temp_tokens = str(tokens[j]).split('.') 
            tokens_dot = tokens_dot + temp_tokens
        total_tokens = list(scheme) + total_tokens + tokens + tokens_dot
    total_tokens = list(set(total_tokens)) #remove redundant tokens
    if 'com' in total_tokens:
        total_tokens.remove('com') #removing .com since it occurs a lot of times and it should not be included in our features
    return total_tokens

In [36]:
# Features and Labels
url_list = urls_data["url"]
y = urls_data["label"]

In [37]:
# using custom tokenizer

vectorizer = TfidfVectorizer(tokenizer=makeTokens)

In [38]:
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)

In [39]:
# reviewing values


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# Model Building
#using logistic regression
logit = LogisticRegression(max_iter=500)
logit.fit(X_train, y_train)

LogisticRegression(max_iter=500)

In [42]:
# Accuracy of Our Model
print("Accuracy ",logit.score(X_test, y_test))

Accuracy  0.9616733854185248


In [43]:
X_predict = ["https://www.section.io/engineering-education/",
"https://www.youtube.com/",
"https://www.traversymedia.com/", 
"https://www.kleinehundezuhause.com ", 
"http://ttps://www.mecymiafinance.com  ",
"https://www.atlanticoceanicoilandgas.com "]

In [44]:
X_predict = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict)

In [45]:
print(New_predict)

['good' 'good' 'good' 'bad' 'bad' 'bad']


In [46]:
# https://db.aa419.org/fakebankslist.php
X_predict1 = [
    "www.buyfakebillsonlinee.blogspot.com", 
    "www.unitedairlineslogistics.com",
    "www.stonehousedelivery.com",
    "http://en.wikipedia.org",
    "http://tobiasfaiss.com/test"
]

In [47]:
X_predict1 = vectorizer.transform(X_predict1)
New_predict1 = logit.predict(X_predict1)
print(New_predict1)

['bad' 'bad' 'bad' 'good' 'bad']


In [48]:
X_predict2 = [
    "https://serialz.com"
]

In [49]:
X_predict2 = vectorizer.transform(X_predict2)
New_predict2 = logit.predict(X_predict2)
print(New_predict2)

['bad']


In [50]:
filename = "url_detection.pkl"

In [51]:
pickle.dump(logit, open(filename, 'wb')) #save model
pickle.dump(vectorizer, open("vectorizer.pkl", "wb")) #Save vectorizer


In [52]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9616733854185248
