| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | +import random |
| 4 | + |
| 5 | + |
| 6 | +# Machine Learning Packages |
| 7 | +from sklearn.feature_extraction.text import CountVectorizer |
| 8 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 9 | +from sklearn.linear_model import LogisticRegression |
| 10 | +from sklearn.model_selection import train_test_split |
| 11 | + |
# Load the URL dataset (expects at least the columns "url" and "label").
urls_data = pd.read_csv("url_features.csv")

# Quick sanity checks. These were bare expressions (notebook leftovers);
# in a plain script a bare expression produces no output, so make them
# explicit prints.
print(type(urls_data))
print(urls_data.head())
def makeTokens(f):
    """Tokenize a URL for TF-IDF feature extraction.

    Splits the URL on '/', then '-', then '.', keeping both the
    dash-level tokens and their dot-level pieces, de-duplicates the
    result, and drops the ubiquitous 'com' token.

    Parameters
    ----------
    f : str
        The URL to tokenize.

    Returns
    -------
    list of str
        Unique tokens; order is unspecified because a set is used
        for de-duplication.
    """
    # BUG FIX: the original used str(f.encode('utf-8')), which turns the
    # URL into the literal string "b'...'" and pollutes the tokens with
    # "b'" prefixes and trailing quotes. Tokenize the string directly.
    tokens_by_slash = str(f).split('/')  # tokens after splitting by slash
    total_tokens = []
    for part in tokens_by_slash:
        dash_tokens = str(part).split('-')  # tokens after splitting by dash
        dot_tokens = []
        for tok in dash_tokens:
            dot_tokens.extend(str(tok).split('.'))  # tokens after splitting by dot
        # Keep both the dash-level tokens and their dot-level pieces,
        # matching the original feature set.
        total_tokens = total_tokens + dash_tokens + dot_tokens
    total_tokens = list(set(total_tokens))  # remove redundant tokens
    if 'com' in total_tokens:
        # 'com' occurs in almost every URL, so it carries no signal.
        total_tokens.remove('com')
    return total_tokens
| 33 | + |
# Separate the supervision target from the raw URL strings.
y = urls_data["label"]
url_list = urls_data["url"]

# TF-IDF features over the URLs, tokenized with the custom URL tokenizer.
# (Swap in TfidfVectorizer() with no arguments to use the default
# word-boundary tokenizer instead.)
vectorizer = TfidfVectorizer(tokenizer=makeTokens)
X = vectorizer.fit_transform(url_list)
# Hold out 20% of the data for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model building: a plain logistic-regression classifier.
print("TRAINING PHASE")
logit = LogisticRegression()
logit.fit(X_train, y_train)

# Report held-out accuracy.
print("Accuracy ", logit.score(X_test, y_test))
# print("coefficient :\n", logit.coef_)
# print("Intercept:\n", logit.intercept_)
| 55 | + |
# Sanity-check predictions on a handful of hand-picked URLs
# (a mix of benign search URLs and known-bad-looking ones).
# BUG FIX: was PRINT(...), which raises NameError at runtime.
print("TESTING PHASE")
X_predict = ["google.com/search=jcharistech",
"google.com/search=faizanahmad",
"pakistanifacebookforever.com/getpassword.php/",
"www.radsport-voggel.de/wp-admin/includes/log.exe",
"ahrenhei.without-transfer.ru/nethost.exe ",
"www.itidea.it/centroesteticosothys/img/_notes/gum.exe"]

# Transform with the already-fitted vectorizer (do NOT refit) and predict.
X_predict = vectorizer.transform(X_predict)
New_predict = logit.predict(X_predict)

print("THE GIVEN URLS ARE : ", New_predict)
| 68 | + |
| 69 | +"""X_predict1 = ["www.buyfakebillsonlinee.blogspot.com", |
| 70 | +"www.unitedairlineslogistics.com", |
| 71 | +"www.stonehousedelivery.com", |
| 72 | +"www.silkroadmeds-onlinepharmacy.com" ] |
| 73 | + |
| 74 | +X_predict1 = vectorizer.transform(X_predict1) |
| 75 | +New_predict1 = logit.predict(X_predict1) |
| 76 | +print(New_predict1) |
| 77 | + |
| 78 | +vectorizer = TfidfVectorizer() |
| 79 | +X = vectorizer.fit_transform(url_list) |
| 80 | +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) |
| 81 | + |
| 82 | +logit = LogisticRegression() #using logistic regression |
| 83 | +logit.fit(X_train, y_train) |
| 84 | + |
| 85 | +print("Accuracy ",logit.score(X_test, y_test))""" |
0 commit comments