In [8]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.classification import LinearSVC
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import NGram,HashingTF, IDF
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import lit
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml import Pipeline
from pyspark.sql import Row,SparkSession
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Word2Vec
from sklearn.metrics import confusion_matrix
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import NaiveBayes
from pyspark import SparkConf, SQLContext, SparkContext

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("Detecting-Malicious-URL-App")\
        .getOrCreate()

#Reading in the URL file and storing into dataframe
data_df = spark.read.csv(path='/C:/Users/Timothy/Downloads/Detecting-Malicious-URL-Machine-Learning-master/dataset.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

data_df.groupby('label').count().toPandas()

#data_df = pd.read_csv('Downloads/Detecting-Malicious-URL-Machine-Learning-master/dataset.csv')

#data_df.groupby('label').count()

Unnamed: 0,label,count
0,1,56937
1,0,1000000


undersample to make it balance

In [9]:
malicious = data_df.filter("label = 1") #Filters the malicious URLs
benign = data_df.filter("label = 0") #Filters the benign URLs

sampleRatio = malicious.count() / data_df.count() #Calculates the fraction of malicious URLs in the dataset
print("sampleRatio = ", sampleRatio) #Displays the calculated fraction of malicious URLs
sample_benign = benign.sample(False, sampleRatio) #Takes a sample of benign URLs around the same amount as malicious URLs

sampled = malicious.unionAll(sample_benign) #Combines the sampled benign URLs with the malicious URLs
sampled.groupby('label').count().toPandas() #Groups the new sample by benign/malicious and displays the numbers of each

sampleRatio =  0.053869814378718885


Unnamed: 0,label,count
0,1,56937
1,0,53906


Data Ingestion and Vectorization

In [10]:
#Tokennize the TrainData - sparse the URL string into words
regexTokenizer = RegexTokenizer(inputCol="url", outputCol="Words", pattern="\\W")

#CountVectorizer converts the the words into feature vectors - Thi is used as it gives better results
countVectors = CountVectorizer(inputCol=regexTokenizer.getOutputCol(), outputCol="rawfeatures", vocabSize=10000, minDF=5)

#
idf = IDF(inputCol=countVectors.getOutputCol(), outputCol="features") 

#create the pipline 
pipeline = Pipeline(stages=[regexTokenizer, countVectors, idf ])


# Fit the pipeline to training documents.
# Pass 'sampled' in the param to set Balanced datasets
pipelineFit = pipeline.fit(sampled)

#Transform the pipeline to dataset
# Pass 'sampled' in the param to set Balanced datasets
dataset = pipelineFit.transform(sampled)

#randomly split the dataset to traning and testing 80%, 20% respectively
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("\nTraining Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
print("Total Dataset Count: " + str(dataset.count()))

dataset.show(10)


Training Dataset Count: 88524
Test Dataset Count: 22319
Total Dataset Count: 110843
+--------------------+-----+--------------------+--------------------+--------------------+
|                 url|label|               Words|         rawfeatures|            features|
+--------------------+-----+--------------------+--------------------+--------------------+
|http://br-ofertas...|    1|[http, br, oferta...|(6988,[0,1,2,3,18...|(6988,[0,1,2,3,18...|
|https://semana-da...|    1|[https, semana, d...|(6988,[0,3,6,18,2...|(6988,[0,3,6,18,2...|
|https://scrid-app...|    1|[https, scrid, ap...|(6988,[0,6,844],[...|(6988,[0,6,844],[...|
|http://my-softban...|    1|[http, my, softba...|(6988,[0,1,29,162...|(6988,[0,1,29,162...|
|http://www.my-sof...|    1|[http, www, my, s...|(6988,[0,1,4,29,1...|(6988,[0,1,4,29,1...|
|http://diadesalda...|    1|[http, diadesalda...|(6988,[0,1],[1.0,...|(6988,[0,1],[0.55...|
|https://sites.goo...|    1|[https, sites, go...|(6988,[0,6,27,30,...|(6988,[0,6,27,30,

In [11]:
#Importing modules for Logistic Regression
import seaborn as sns
import statsmodels.api as sm
from pyspark.ml.classification import LogisticRegression, OneVsRest

#Instantiating, training, and testing the model
log_reg = LogisticRegression()
logModel = log_reg.fit(trainingData)
predictions = logModel.transform(testData)
predictions.show(10)

#Calculating and displaying the accuracy of the model
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())
print("Accuracy : ",accuracy)

#Calculating the amount of true positives, true negatives, false positives, and false negatives from the model
df = predictions.select('prediction', 'label')

tp = df[(df.label == 1) & (df.prediction == 1)].count() #True positive
tn = df[(df.label == 0) & (df.prediction == 0)].count() #True negative
fp = df[(df.label == 0) & (df.prediction == 1)].count() #False positive
fn = df[(df.label == 1) & (df.prediction == 0)].count() #False negative

#Calculating precision, recall, and F score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f = 2 * (precision * recall)/(precision + recall)
print("Precision: ", precision)
print("Recall : ", recall)
print("F Score : ",f)                        

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                 url|label|               Words|         rawfeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|       2amsports.com|    1|    [2amsports, com]|    (6988,[0],[1.0])|(6988,[0],[0.5561...|[4.57140644683777...|[0.98976248865805...|       0.0|
|TRIANGLESERVICESL...|    1|[triangleservices...|    (6988,[0],[1.0])|(6988,[0],[0.5561...|[4.57140644683777...|[0.98976248865805...|       0.0|
|above.e-rezerwacj...|    1|[above, e, rezerw...|(6988,[42,94],[1....|(6988,[42,94],[4....|[4.37659996444787...|[0.98758797676265...|       0.0|
|     ad.getfond.info|    1| [ad, getfond, info]|(6988,[34,527],[1...|(6988,[34,527],[4...|[-2.0614051677728...|[0.11290501544829.

In [12]:
#Importing Support Vector Machine
from pyspark.ml.classification import LinearSVC

#Instantiating, training, and testing the model
lsvc = LinearSVC()
lsvcModel = lsvc.fit(trainingData)
svcpredict = lsvcModel.transform(testData)
svcpredict.show(10)

#Calculating and displaying the accuracy of the model
accuracy = svcpredict.filter(svcpredict.label == svcpredict.prediction).count() / float(svcpredict.count())
print("Accuracy : ",accuracy)

#Calculating the amount of true positives, true negatives, false positives, and false negatives from the model
df = svcpredict.select('prediction', 'label')

tp = df[(df.label == 1) & (df.prediction == 1)].count() #True positive
tn = df[(df.label == 0) & (df.prediction == 0)].count() #True negative
fp = df[(df.label == 0) & (df.prediction == 1)].count() #False positive
fn = df[(df.label == 1) & (df.prediction == 0)].count() #False negative

#Calculating precision, recall, and F score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f = 2 * (precision * recall)/(precision + recall)
print("Precision: ", precision)
print("Recall : ", recall)
print("F Score : ",f)    

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|                 url|label|               Words|         rawfeatures|            features|       rawPrediction|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|       2amsports.com|    1|    [2amsports, com]|    (6988,[0],[1.0])|(6988,[0],[0.5561...|[1.05899563413891...|       0.0|
|TRIANGLESERVICESL...|    1|[triangleservices...|    (6988,[0],[1.0])|(6988,[0],[0.5561...|[1.05899563413891...|       0.0|
|above.e-rezerwacj...|    1|[above, e, rezerw...|(6988,[42,94],[1....|(6988,[42,94],[4....|[1.05551590276667...|       0.0|
|     ad.getfond.info|    1| [ad, getfond, info]|(6988,[34,527],[1...|(6988,[34,527],[4...|[-1.0451741689724...|       1.0|
|adserving.favorit...|    1|[adserving, favor...|(6988,[0,447],[1....|(6988,[0,447],[0....|[1.47156802141571...|       0.0|
|       