Skip to content

Commit fc11f01

Browse files
authored
Create code
1 parent c782c31 commit fc11f01

File tree

1 file changed

+85
-0
lines changed

1 file changed

+85
-0
lines changed

code

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
import pandas as pd
import numpy as np
import random

# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load Url Data
# Expected columns (from usage below): "url" (string) and "label" — TODO confirm schema.
urls_data = pd.read_csv("url_features.csv")

# Quick sanity checks. Bare expressions like `type(urls_data)` only display
# output in a notebook; in a script they are no-ops, so print explicitly.
print(type(urls_data))
print(urls_data.head())
18+
19+
def makeTokens(f):
    """Tokenize a URL into slash-, dash-, and dot-separated components.

    Used as the custom `tokenizer` for TfidfVectorizer. Returns the
    de-duplicated list of tokens, with the ubiquitous 'com' removed
    because it appears in nearly every URL and carries no signal.

    Bug fix: the original called str(f.encode('utf-8')), which stringifies
    a bytes object and pollutes the first/last tokens with a literal
    "b'" prefix and "'" suffix. Tokenize the string directly instead.
    """
    total_Tokens = []
    # Split on '/' to separate the domain from path segments.
    for slash_token in str(f).split('/'):
        # Split each segment on '-' (common in hostnames and slugs).
        dash_tokens = slash_token.split('-')
        total_Tokens.extend(dash_tokens)
        for dash_token in dash_tokens:
            # Split on '.' to pull apart domain labels and file extensions.
            total_Tokens.extend(dash_token.split('.'))
    total_Tokens = list(set(total_Tokens))  # remove redundant tokens
    if 'com' in total_Tokens:
        # 'com' occurs in almost every URL and should not be a feature.
        total_Tokens.remove('com')
    return total_Tokens
33+
34+
# Target labels and raw URL strings from the loaded dataset.
y = urls_data["label"]
url_list = urls_data["url"]

# Using Default Tokenizer
# vectorizer = TfidfVectorizer()

# Using Custom Tokenizer
vectorizer = TfidfVectorizer(tokenizer=makeTokens)

# TF-IDF feature matrix over the URL tokens.
X = vectorizer.fit_transform(url_list)

# 80/20 train/test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Building
# using logistic regression
print("TRAINING PHASE")
logit = LogisticRegression()
logit.fit(X_train, y_train)
print("Accuracy ", logit.score(X_test, y_test))
# print("coefficient :\n", logit.coef_)
# print("Intercept:\n", logit.intercept_)

# Fixed: `PRINT` is a NameError — Python's builtin is lowercase `print`.
print("TESTING PHASE")
X_predict = ["google.com/search=jcharistech",
             "google.com/search=faizanahmad",
             "pakistanifacebookforever.com/getpassword.php/",
             "www.radsport-voggel.de/wp-admin/includes/log.exe",
             "ahrenhei.without-transfer.ru/nethost.exe ",
             "www.itidea.it/centroesteticosothys/img/_notes/gum.exe"]

# Must use transform (not fit_transform) so the new URLs are projected
# into the vocabulary learned during training.
X_predict = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict)

print("THE GIVEN URLS ARE : ", New_predict)

"""X_predict1 = ["www.buyfakebillsonlinee.blogspot.com",
"www.unitedairlineslogistics.com",
"www.stonehousedelivery.com",
"www.silkroadmeds-onlinepharmacy.com" ]

X_predict1 = vectorizer.transform(X_predict1)
New_predict1 = logit.predict(X_predict1)
print(New_predict1)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(url_list)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logit = LogisticRegression() #using logistic regression
logit.fit(X_train, y_train)

print("Accuracy ",logit.score(X_test, y_test))"""

0 commit comments

Comments
 (0)