In [4]:
import math
import time
import pylab
import pickle
import warnings
import operator
import tldextract
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

pd.options.mode.chained_assignment = None
pylab.rcParams['figure.figsize'] = (14.0, 5.0)
pylab.rcParams['axes.grid'] = True
warnings.filterwarnings("ignore", category = DeprecationWarning)

In [2]:
df = pd.read_csv('data.csv', encoding='utf-8')

print ("size of dataset :", df.shape[0])
df.head()

size of dataset : 420464


Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [3]:
df.label = df.label.replace(to_replace="good", value=0)
df.label = df.label.replace(to_replace="bad", value=1)

print (df.label.value_counts())
y = df.label

0    344821
1     75643
Name: label, dtype: int64


In [4]:
# converting data to vectors

vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf=True, ngram_range=(3,5))
vectorizer.fit(df.url)

X =  vectorizer.transform(df.url)
print ("sparse matrix :", X.shape)

sparse matrix : (420464, 3180519)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #splitting data

good = df.label.value_counts()[0]
bad = df.label.value_counts()[1]
print ("size of class :", good, bad)

size of class : 344821 75643


In [7]:
lgs = LogisticRegression(class_weight={0 : good, 1 : bad})
lgs.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight={0: 344821, 1: 75643}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [8]:
##############
# Evaluation #
##############

predicted = lgs.predict(X_test)

fpr, tpr, _ = metrics.roc_curve(y_test, (lgs.predict_proba(X_test)[:, 1]))
auc = metrics.auc(fpr, tpr)

print ("Bad samples : %d" % bad)
print ("Good samples: %d" % good)
print ("Baseline Constant negative: %.6f" % (good / (good + bad)))
print ("------------")
print ("Accuracy    : %f" % lgs.score(X_test, y_test))  #checking the accuracy
print ("Precision   : %f" % metrics.precision_score(y_test, predicted))
print ("Recall      : %f" % metrics.recall_score(y_test, predicted))
print ("F1-Score    : %f" % metrics.f1_score(y_test, predicted))
print ("AUC         : %f" % auc)

Bad samples : 75643
Good samples: 344821
Baseline Constant negative: 0.820096
------------
Accuracy    : 0.987835
Precision   : 0.984972
Recall      : 0.946071
F1-Score    : 0.965129
AUC         : 0.997829


In [13]:
filename = 'url_malicious_model.sav'

pickle.dump(lgs, open(filename, 'wb'))

In [15]:
filename1 = 'url_tfidf_model.sav'

pickle.dump(vectorizer, open(filename1, 'wb'))

### real rowdata

In [7]:
# load the model from disk
filename = 'url_malicious_model.sav'

loaded_model = pickle.load(open(filename, 'rb'))

In [8]:
# load the model from disk
filename1 = 'url_tfidf_model.sav'

loaded_model1 = pickle.load(open(filename1, 'rb'))

In [9]:
t0 = time.time()

X_real = ["wikipedia.com",
          "google.com/search=faizanahmad",
          "pakistanifacebookforever.com/getpassword.php/",
          "www.radsport-voggel.de/wp-admin/includes/log.exe",
          "ahrenhei.without-transfer.ru/nethost.exe",
          "www.itidea.it/centroesteticosothys/img/_notes/gum.exe",
          "xn--cgopolygon-wy2e.com/login.php"]

X_real = loaded_model1.transform(X_real)
y_real = loaded_model.predict(X_real)

t1 = time.time()
print ("cost time :", t1-t0)
print (y_real)

cost time : 0.04265284538269043
[0 0 0 1 1 1 1]
