In [1]:
# IMPORTING REQUIRED LIBRARIES:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()

import warnings
# NOTE(review): this installs an "ignore" filter for every warning category, so
# warnings raised by any later cell will not be displayed.
warnings.simplefilter("ignore")

In [2]:
# Load the preprocessed URL dataset from its CSV file into a DataFrame.
dataset_csv = "./Preprocessed dataset/preprocessed_dataset.csv"
data = pd.read_csv(dataset_csv)

In [3]:
data.head(5)

Unnamed: 0,URL,Label
0,www.atpa.cl/foros/install/wp-content/cimb/index/,bad
1,cetis43.mx/Modpro/alibaba/index.php?email=abus...,bad
2,service.confirm.paypal.cmd.cgi-bin.2466sd4f3e6...,bad
3,0zz0.com/2012/05/05/18/394995639.gif,bad
4,centralcoastplasterer.com.au/logs/dropbox/auth...,bad


In [4]:
import nltk
import re

from nltk.tokenize import RegexpTokenizer

In [5]:
# Tokenizer that extracts runs of ASCII letters; digits and punctuation are not
# matched by the pattern and therefore never appear in a token.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [6]:
data.URL[4]

'centralcoastplasterer.com.au/logs/dropbox/auth/view/share/'

In [7]:
tokenizer.tokenize(data.URL[4]) # index label 4, i.e. the fifth row of the frame

['centralcoastplasterer',
 'com',
 'au',
 'logs',
 'dropbox',
 'auth',
 'view',
 'share']

In [8]:
# Split every URL into its alphabetic tokens and store the token lists in a new column.
url_tokens = data['URL'].map(tokenizer.tokenize)
data['text_tokenized'] = url_tokens

In [9]:
data

Unnamed: 0,URL,Label,text_tokenized
0,www.atpa.cl/foros/install/wp-content/cimb/index/,bad,"[www, atpa, cl, foros, install, wp, content, c..."
1,cetis43.mx/Modpro/alibaba/index.php?email=abus...,bad,"[cetis, mx, Modpro, alibaba, index, php, email..."
2,service.confirm.paypal.cmd.cgi-bin.2466sd4f3e6...,bad,"[service, confirm, paypal, cmd, cgi, bin, sd, ..."
3,0zz0.com/2012/05/05/18/394995639.gif,bad,"[zz, com, gif]"
4,centralcoastplasterer.com.au/logs/dropbox/auth...,bad,"[centralcoastplasterer, com, au, logs, dropbox..."
...,...,...,...
4995,caselaw.lp.findlaw.com/scripts/getcase.pl?cour...,good,"[caselaw, lp, findlaw, com, scripts, getcase, ..."
4996,trimaris.org/op-award&award=AOA,good,"[trimaris, org, op, award, award, AOA]"
4997,cbc.ca/news/arts/story/2011/05/03/douglas-mont...,good,"[cbc, ca, news, arts, story, douglas, montreal..."
4998,bartonchronicle.com/index.php/obituaries/obitu...,good,"[bartonchronicle, com, index, php, obituaries,..."


In [10]:
# Stemming to find the root words:
from nltk.stem.snowball import SnowballStemmer

In [11]:
# Snowball stemmer configured for English; used below to reduce each token to its stem.
stemmer = SnowballStemmer("english")

In [12]:
# Stem each token of every URL; the new column holds one list of stems per row.
def _stem_tokens(tokens):
    """Return the stems of ``tokens`` in their original order."""
    return [stemmer.stem(token) for token in tokens]

data['text_stemmed'] = data['text_tokenized'].map(_stem_tokens)

In [13]:
# Join each row's stems into one space-separated string.
data['text_sent'] = data['text_stemmed'].map(' '.join)

In [14]:
data

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
0,www.atpa.cl/foros/install/wp-content/cimb/index/,bad,"[www, atpa, cl, foros, install, wp, content, c...","[www, atpa, cl, foro, instal, wp, content, cim...",www atpa cl foro instal wp content cimb index
1,cetis43.mx/Modpro/alibaba/index.php?email=abus...,bad,"[cetis, mx, Modpro, alibaba, index, php, email...","[ceti, mx, modpro, alibaba, index, php, email,...",ceti mx modpro alibaba index php email abus rs...
2,service.confirm.paypal.cmd.cgi-bin.2466sd4f3e6...,bad,"[service, confirm, paypal, cmd, cgi, bin, sd, ...","[servic, confirm, paypal, cmd, cgi, bin, sd, f...",servic confirm paypal cmd cgi bin sd f e sqd e...
3,0zz0.com/2012/05/05/18/394995639.gif,bad,"[zz, com, gif]","[zz, com, gif]",zz com gif
4,centralcoastplasterer.com.au/logs/dropbox/auth...,bad,"[centralcoastplasterer, com, au, logs, dropbox...","[centralcoastplaster, com, au, log, dropbox, a...",centralcoastplaster com au log dropbox auth vi...
...,...,...,...,...,...
4995,caselaw.lp.findlaw.com/scripts/getcase.pl?cour...,good,"[caselaw, lp, findlaw, com, scripts, getcase, ...","[caselaw, lp, findlaw, com, script, getcas, pl...",caselaw lp findlaw com script getcas pl court ...
4996,trimaris.org/op-award&award=AOA,good,"[trimaris, org, op, award, award, AOA]","[trimari, org, op, award, award, aoa]",trimari org op award award aoa
4997,cbc.ca/news/arts/story/2011/05/03/douglas-mont...,good,"[cbc, ca, news, arts, story, douglas, montreal...","[cbc, ca, news, art, stori, dougla, montreal, ...",cbc ca news art stori dougla montreal fundrais...
4998,bartonchronicle.com/index.php/obituaries/obitu...,good,"[bartonchronicle, com, index, php, obituaries,...","[bartonchronicl, com, index, php, obituari, ob...",bartonchronicl com index php obituari obituari...


In [15]:
data.shape

(5000, 5)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Bag-of-words vectorizer created with the library's default settings.
cv = CountVectorizer()
# Alternative kept for reference: the same vectorizer with lowercasing disabled.
#cv = CountVectorizer(lowercase=False)

In [18]:
# Learn the vocabulary from the stemmed URL strings and build the document-term count matrix.
# NOTE(review): the vocabulary is fitted on all rows, before the train/test split made
# further down, so it also contains tokens that occur only in the held-out rows.
feature = cv.fit_transform(data.text_sent)

In [19]:
feature

<5000x12871 sparse matrix of type '<class 'numpy.int64'>'
	with 41166 stored elements in Compressed Sparse Row format>

In [20]:
# Convert the sparse count matrix to a dense NumPy array.
# The guard makes the cell safe to re-run: once `feature` has been rebound to an
# ndarray it no longer has a `toarray` method, and the unguarded call would raise
# AttributeError. The former `feature[:]` slice was redundant and has been dropped.
# NOTE(review): the dense array holds 5000 x 12871 int64 values (about 0.5 GB).
if hasattr(feature, "toarray"):
    feature = feature.toarray()

In [21]:
feature

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
#from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# Encode the class labels as integers: "bad" -> 1, "good" -> 0.
# The dtype guard makes the cell idempotent: without it, executing the cell a second
# time would look the already-encoded integers up in the string-keyed mapping and
# replace every label with NaN.
if not pd.api.types.is_numeric_dtype(data["Label"]):
    data["Label"] = data["Label"].map({"bad":1,"good":0})

In [24]:
# Feature matrix for the models: the vectorized URL token counts.
X=feature

In [25]:
#vectorizer = TfidfVectorizer()
#vectors = vectorizer.fit_transform(data.text_sent)
#vectors.shape

In [26]:
print(X.shape)
print(data.shape)

(5000, 12871)
(5000, 5)


In [27]:
# Target vector: the "Label" column of the dataset.
Y=data["Label"]

In [28]:
Y

0       1
1       1
2       1
3       1
4       1
       ..
4995    0
4996    0
4997    0
4998    0
4999    0
Name: Label, Length: 5000, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [31]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [32]:
Y_train

4227    0
4676    0
800     1
3671    0
4193    0
       ..
4426    0
466     1
3092    0
3772    0
860     1
Name: Label, Length: 4000, dtype: int64

In [33]:
# Show the shape of each piece of the split, in the order train/test features, train/test targets.
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(4000, 12871)
(1000, 12871)
(4000,)
(1000,)


In [34]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
# Logistic-regression classifier created with the library's default hyperparameters.
LR=LogisticRegression()

In [37]:
# Train logistic regression, then display its score on the same training rows.
LR.fit(X_train, Y_train).score(X_train, Y_train)

0.98125

In [38]:
from sklearn.naive_bayes import GaussianNB

In [39]:
# Gaussian naive Bayes classifier created with the library's default settings.
NB=GaussianNB()

In [40]:
# Train naive Bayes, then display its score on the same training rows.
NB.fit(X_train, Y_train).score(X_train, Y_train)

0.9975

In [41]:
from sklearn.svm import SVC

In [42]:
# Support-vector classifier using the RBF kernel.
SVM=SVC(kernel='rbf')

In [None]:
# Train the support-vector classifier, then display its score on the same training rows.
SVM.fit(X_train, Y_train).score(X_train, Y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 1-nearest-neighbour classifier: build and train in one expression, then display
# its score on the training rows.
KNN = KNeighborsClassifier(n_neighbors=1).fit(X_train, Y_train)
KNN.score(X_train, Y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 250 trees.
# The seed is fixed (same value as the train/test split) so the fitted forest, and
# every score or prediction derived from it, is identical between runs; without it
# the bootstrap samples and feature subsets change on each execution.
RF=RandomForestClassifier(n_estimators=250, random_state=42)
RF.fit(X_train,Y_train)
# Score on the training rows.
RF.score(X_train,Y_train)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost ensemble with 50 boosting rounds.
# The seed is fixed (same value as the train/test split) so the fitted ensemble and
# the metrics reported from it are repeatable between runs.
AdB=AdaBoostClassifier(n_estimators=50, random_state=42)
AdB.fit(X_train,Y_train)
# Score on the training rows.
AdB.score(X_train,Y_train)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with 100 stages.
# NOTE: despite the variable name this is scikit-learn's GradientBoostingClassifier.
# The seed is fixed (same value as the train/test split) so the fitted model and
# the metrics reported from it are repeatable between runs.
XgB=GradientBoostingClassifier(n_estimators=100, random_state=42)
XgB.fit(X_train,Y_train)
# Score on the training rows.
XgB.score(X_train,Y_train)

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# LightGBM classifier allowing up to 100 leaves per tree: build and train in one
# expression, then display its score on the training rows.
Lgbm = LGBMClassifier(num_leaves=100).fit(X_train, Y_train)
Lgbm.score(X_train, Y_train)

In [None]:
# Score of every fitted model on the held-out test split, one line per model.
print("****Test_score*****")
fitted_models = (
    ("LR:", LR),
    ("NB:", NB),
    ("SVM:", SVM),
    ("KNN:", KNN),
    ("RF:", RF),
    ("AdB:", AdB),
    ("XgB:", XgB),
    ("Lgbm:", Lgbm),
)
for tag, fitted_model in fitted_models:
    print(tag, fitted_model.score(X_test, Y_test))

In [None]:
# Predict the test split with every fitted model; the targets on the left are
# listed in the same order as the models on the right.
(
    y_LR_pred,
    y_NB_pred,
    y_SVM_pred,
    y_KNN_pred,
    y_RF_pred,
    y_AdB_pred,
    y_XgB_pred,
    y_Lgbm_pred,
) = (
    fitted_model.predict(X_test)
    for fitted_model in (LR, NB, SVM, KNN, RF, AdB, XgB, Lgbm)
)

In [None]:
from sklearn import metrics
from sklearn.metrics import precision_score,recall_score,accuracy_score

In [None]:
# Precision of every model on the test split.
# One (label, predictions) table replaces eight copy-pasted print calls; the labels
# fix the misspelled "Navie_bayes" and use the same "Precision_score" prefix on every line.
precision_inputs = [
    ("LogisticRegression", y_LR_pred),
    ("Naive_bayes", y_NB_pred),
    ("Support vector Machine", y_SVM_pred),
    ("Knearestneighbors", y_KNN_pred),
    ("RandomForestclassifier", y_RF_pred),
    ("Adaboost classifier", y_AdB_pred),
    ("GradientBoostingclassifier", y_XgB_pred),
    ("LightGBM", y_Lgbm_pred),
]
for model_label, y_pred in precision_inputs:
    print(f"Precision_score for {model_label}:", metrics.precision_score(Y_test, y_pred))

In [None]:
# Recall of every model on the test split.
# One (label, predictions) table replaces eight copy-pasted print calls; the labels
# fix the misspelled "Navie_bayes".
recall_inputs = [
    ("LogisticRegression", y_LR_pred),
    ("Naive_bayes", y_NB_pred),
    ("Support vector Machine", y_SVM_pred),
    ("Knearestneighbors", y_KNN_pred),
    ("RandomForestclassifier", y_RF_pred),
    ("Adaboost classifier", y_AdB_pred),
    ("GradientBoostingclassifier", y_XgB_pred),
    ("LightGBM", y_Lgbm_pred),
]
for model_label, y_pred in recall_inputs:
    print(f"RECALL_score for {model_label}:", metrics.recall_score(Y_test, y_pred))

In [None]:
# Accuracy of every model on the test split.
# One (label, predictions) table replaces eight copy-pasted print calls; this also
# fixes the misspelled "ACCUARCY_score" prefix that two of the lines carried and the
# misspelled "Navie_bayes" label.
accuracy_inputs = [
    ("LogisticRegression", y_LR_pred),
    ("Naive_bayes", y_NB_pred),
    ("Support vector Machine", y_SVM_pred),
    ("Knearestneighbors", y_KNN_pred),
    ("RandomForestclassifier", y_RF_pred),
    ("Adaboost classifier", y_AdB_pred),
    ("GradientBoostingclassifier", y_XgB_pred),
    ("LightGBM", y_Lgbm_pred),
]
for model_label, y_pred in accuracy_inputs:
    print(f"ACCURACY_score for {model_label}:", metrics.accuracy_score(Y_test, y_pred))

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# End-to-end pipeline: raw URL string -> alphabetic tokens -> token counts -> logistic regression.
# A fresh LogisticRegression() is used instead of the `LR` object trained above:
# make_pipeline stores the estimator it is given (it does not copy it), so fitting
# the pipeline with `LR` inside would silently refit `LR` on this pipeline's
# vocabulary and break any later use of `LR` with X_train / X_test.
# `LR` was itself built with default settings, so the hyperparameters are unchanged.
pipeline = make_pipeline(
    CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'),
    LogisticRegression(),
)

In [None]:
# Split the raw URLs and their labels for the pipeline.
# The seed is fixed (same value as the earlier feature split) so that the pipeline's
# score and the model pickled below are repeatable; unseeded, every run produced a
# different split. The split proportion is left at the library default, as before.
trainX, testX, trainY, testY = train_test_split(data.URL, data.Label, random_state=42)

In [None]:
# Fit the pipeline on the raw training URLs and their labels.
pipeline.fit(trainX,trainY)

In [None]:
# Score of the fitted pipeline on the held-out URLs.
pipeline.score(testX,testY) 

In [None]:
import pickle
# Persist the fitted pipeline to disk.
# The context manager closes (and therefore flushes) the file deterministically, so
# the pickle is complete on disk before a later cell reads it back; the previous
# bare open() never closed its handle.
with open('./Preprocessed dataset/phishing.pkl','wb') as model_file:
    # pickle.dump() returns None, so `model_predict` is None. The name is kept only
    # because a later cell prints it.
    model_predict = pickle.dump(pipeline, model_file)

In [None]:
# NOTE(review): `model_predict` holds the return value of pickle.dump(), which is
# None, so this prints "None" rather than the model.
print(model_predict)

In [None]:
# Load the pipeline back from disk; the context manager closes the file handle,
# which the previous bare open() never did.
# NOTE: unpickling can execute code embedded in the file, so only load pickle files
# that come from a trusted source (here: the file written by this notebook).
with open('./Preprocessed dataset/phishing.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
print(loaded_model)

In [None]:
# Score the reloaded pipeline on the held-out URLs and print the value.
result = loaded_model.score(testX,testY)
print(result)

In [None]:
# Classify one URL taken from the dataset (the row with index label 4995).
test = [data.loc[4995, "URL"]]
result3 = loaded_model.predict(test)
result3

In [None]:
# Classify a hand-picked batch of URLs with the reloaded pipeline.
prediction = [
    'www.atpa.cl/foros/install/wp-content/cimb/index/',
    # NOTE(review): this entry ends in "... " and looks like it was copied from a
    # truncated DataFrame display rather than being a complete URL; confirm.
    'service.confirm.paypal.cmd.cgi-bin.2466sd4f3e6... ',
    'retailhellunderground.com/',
    'restorevisioncenters.com/html/technology.html',
    "https://www.google.com/search?client=firefox-b-d&q=amazon+prime+membership",
    "https://www.google.com/search?client=firefox-b-d&q=amazon+prime",
]
result = loaded_model.predict(prediction)
result