In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# !pip install --upgrade numpy

In [3]:
import numpy as np
import pandas as pd
from utils import run_ML
from sklearn.metrics import f1_score
from itertools import groupby

In [4]:
# !pip install tldextract
# !pip install torch_geometric

In [5]:
data_dir = "data/URLdatasetX2_1.csv"
df = pd.read_csv(data_dir,index_col=0)

In [6]:
df.shape, df.head(2)

((2802, 2),
                                           url        type
 0       http://www.crestonwood.com/router.php  legitimate
 1  http://vamoaestudiarmedicina.blogspot.com/  legitimate)

In [7]:
# smalldata = df.sample(n = 20000, random_state=1)
# smalldata = df.sample(n = 300, random_state=1) # take random 300 samples
smalldata = df

In [8]:
# get labels of urls
labels = smalldata.iloc[:,-1].values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

### Conventional Models

In [9]:
from utils import extract_features

In [10]:
# Example usage:
url = "http://www.example.com/path/to/==file.html"
url_features = extract_features(url)
print(url_features)

{'domain': 'www.example.com', 'num_subdomains': 2, 'contains_ip': 0, 'path_length': 20, 'num_path_segments': 3, 'uses_https': 0, 'file_extension': 'html', 'count_special_characters': 11, 'count_non_alphanumeric_characters': 11, 'TLD': 'com', 'count_obfuscated_characters': 0, 'letter_ratio_in_url': 0.7380952380952381, 'digit_ratio_in_url': 0.0, 'count_equals_in_url': 2, 'NoOfAmpersandInURL': 0, 'CharContinuationRate': 0.11904761904761904, 'ratio_obfuscated_characters': 0.0, 'NoOfQMarkInURL': 0}


In [11]:
# print(url_features.keys())

In [12]:
# get numerical and catergorical features
phish_url = []
for link in list(smalldata.iloc[:,0]):
    url_features = extract_features(link)
    phish_url.append(list(url_features.values())[1:])

In [13]:
phish_url_df = pd.DataFrame(phish_url, columns = list(url_features.keys())[1:])

In [14]:
# phish_url_df.head(2)

In [15]:
phish_url_df.iloc[:,5] = pd.Categorical(phish_url_df.iloc[:,5]).codes
phish_url_df.iloc[:,8] = pd.Categorical(phish_url_df.iloc[:,8]).codes

In [16]:
phish_url_df.head(2)

Unnamed: 0,num_subdomains,contains_ip,path_length,num_path_segments,uses_https,file_extension,count_special_characters,count_non_alphanumeric_characters,TLD,count_obfuscated_characters,letter_ratio_in_url,digit_ratio_in_url,count_equals_in_url,NoOfAmpersandInURL,CharContinuationRate,ratio_obfuscated_characters,NoOfQMarkInURL
0,2,0,11,1,0,47,7,7,38,0,0.810811,0.0,0,0,0.135135,0.0,0
1,2,0,1,1,0,0,6,6,38,0,0.857143,0.0,0,0,0.047619,0.0,0


In [17]:
# test on URLs features
run_ML(phish_url_df, labels, "URLdatasetX2", "manual")

Run:  0 , fold:  0
Train freq:  [1630, 611]
kNN, LightGBM, Run:  0 , fold:  1
Train freq:  [1617, 624]
kNN, LightGBM, Run:  0 , fold:  2
Train freq:  [1602, 640]
kNN, LightGBM, Run:  0 , fold:  3
Train freq:  [1645, 597]
kNN, LightGBM, Run:  0 , fold:  4
Train freq:  [1622, 620]
kNN, LightGBM, ['kNN', 'LightGBM']
[0.73 0.86]


In [18]:
## test on numerical URLs features
from utils import extract_numerical_features
phish_url = []
for link in list(smalldata.iloc[:,0]):
    url_features = extract_numerical_features(link)
    phish_url.append(list(url_features.values()))
run_ML(np.array(phish_url), labels, "URLdatasetX2", "manual_numerical")

Run:  0 , fold:  0
Train freq:  [1630, 611]
kNN, LightGBM, Run:  0 , fold:  1
Train freq:  [1617, 624]
kNN, LightGBM, Run:  0 , fold:  2
Train freq:  [1602, 640]
kNN, LightGBM, Run:  0 , fold:  3
Train freq:  [1645, 597]
kNN, LightGBM, Run:  0 , fold:  4
Train freq:  [1622, 620]
kNN, LightGBM, ['kNN', 'LightGBM']
[0.73 0.83]


In [19]:
np.random.seed(0)
n_samples = len(smalldata.index)
train_idx = list(np.random.choice(list(range(n_samples)), int(0.8*n_samples), replace=False))
test_idx = list(set(list(range(n_samples))).difference(set(train_idx)))
data_df = np.array(phish_url)
import lightgbm as lgb
model = lgb.LGBMClassifier(verbose=-1)
model.fit(data_df[train_idx], labels[train_idx])
y_predict=model.predict(data_df[test_idx]) 
print(f1_score(y_predict, labels[test_idx], average='macro'))

0.8240798811685096


# Graph NMF

### Extract graph features from URLs for PyG

In [20]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

In [21]:
# return root and hyperlinks features
def get_graph_features(idx):
    url = smalldata.iloc[idx,0]
    root_feature = extract_numerical_features(url) # dict
    hyperlink_data = [list(root_feature.values())]
    try:    
        # find all hyperlinks
        reqs = requests.get(url)
        soup = BeautifulSoup(reqs.text, 'html.parser')
        urls = []
        count = 0;
        for link in soup.find_all('a'):
            # print(link.get('href'))
            weblink = link.get('href')
            if (weblink is not None) and ('http' in weblink):
                urls.append(weblink)
            count += 1
            if count > 50:
                break
        # extract numerical features in from hyperlinks
        if len(urls) > 0:
            for link in urls:
                try:
                    url_features = extract_numerical_features(link)
                    datalinkssss = list(url_features.values())
                except ValueError as ve:
                    # datalinkssss = list(np.zeros(15))#raw_graph_features
                    error_here = 1;
                hyperlink_data.append(datalinkssss)
        else:
            # hyperlink_data.append(list(np.zeros(15)))#raw_graph_features
            error_here = 1;
    
    except ConnectionError as e:
        # print("No rep", end = ',')
        # hyperlink_data.append(list(np.zeros(15))) #raw_graph_features
        error_here = 1; #v2
    return (idx,  hyperlink_data)

In [26]:
# n_test_samples = int(df.shape[0]) # how many link we want to test
# from joblib import Parallel, delayed
# results = Parallel(n_jobs=8)(delayed(get_graph_features)(i) for i in range(n_test_samples)) # test on 100 links

In [27]:
import pickle
data_file = "data/raw_graph_features_v2.pickle"
# with open(data_file, "wb") as fp:   #Pickling
#     pickle.dump(results, fp)
with open(data_file, "rb") as fp:   # Unpickling
     results = pickle.load(fp)

In [28]:
n_samples = len(df.index)
np.random.seed(0)
train_idx = list(np.random.choice(list(range(n_samples)), int(0.8*n_samples), replace=False))
test_idx = list(set(list(range(n_samples))).difference(set(train_idx)))
train_idx[:10]

[855, 615, 70, 352, 118, 124, 1620, 298, 1992, 2262]

In [29]:
from sklearn.decomposition import NMF
n_components = 10
nmf = NMF(
    n_components=n_components,
    random_state=1,
    init='random',
    beta_loss="kullback-leibler",
    alpha_W=0.00005,
    alpha_H=0.005,
    l1_ratio=1,
    solver = 'mu',
    max_iter = 5000
)

In [30]:
graph_NMF_data = np.zeros((n_samples, 3*n_components))
for i in range(len(results)):
    idx, graph_feature = results[i]
    if len(graph_feature) >= n_components:
        data_ = np.array(graph_feature)
        W = nmf.fit_transform(data_)
        # graph_NMF_data[idx, :] = np.hstack((W.min(axis=0), W.max(axis=0), W.mean(axis=0))) # 0.85-0.87
        graph_NMF_data[idx, :] = np.hstack((W.min(axis=0), W.max(axis=0), W.mean(axis=0))) # 0.85-0.87

# ## PCA
# from sklearn.decomposition import PCA
# pca = PCA(n_components=n_components)
# graph_NMF_data = np.zeros((n_samples, 3*n_components))
# for i in range(len(results)):
#     idx, graph_feature = results[i]
#     if len(graph_feature) >= n_components:
#         data_ = np.array(graph_feature)
#         W = pca.fit_transform(data_)
#         graph_NMF_data[idx, :] = np.hstack((W.min(axis=0), W.max(axis=0), W.mean(axis=0)))

In [31]:
graph_nmf_concat = np.concatenate((data_df, graph_NMF_data),axis=1)

In [32]:
# Approach 1
import lightgbm as lgb
model = lgb.LGBMClassifier(verbose=-1)
model.fit(graph_nmf_concat[train_idx], labels[train_idx])
y_predict=model.predict(graph_nmf_concat[test_idx]) 
print(f1_score(y_predict, labels[test_idx], average='macro'))

0.8498746382211391


In [33]:
graph_data = np.zeros((n_samples, 15*3))
for i in range(len(results)):
    idx, graph_feature = results[i]
    if len(graph_feature) > 1:
        data_ = np.array(graph_feature[1:])
        graph_data[idx, :] = np.hstack((data_.min(axis=0), data_.max(axis=0), data_.mean(axis=0)))

In [34]:
graph_concat = np.concatenate((data_df, graph_data),axis=1)

In [35]:
model = lgb.LGBMClassifier(verbose=-1)
model.fit(graph_concat[train_idx], labels[train_idx])
y_predict=model.predict(graph_concat[test_idx]) 
print(f1_score(y_predict, labels[test_idx], average='macro'))

0.8629210920701114


In [36]:
graph_concat_3 = np.concatenate((data_df, graph_data, graph_nmf_concat),axis=1)

In [37]:
# Approach 2
model = lgb.LGBMClassifier(verbose=-1)
model.fit(graph_concat_3[train_idx], labels[train_idx])
y_predict=model.predict(graph_concat_3[test_idx]) 
print(f1_score(y_predict, labels[test_idx], average='macro'))

0.8700528696302376


In [38]:
## Graph autoencoder for every graph then concat

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# List of URLs
urls = list(smalldata['url'])

# Tokenization and N-grams Generation
# You can adjust ngram_range to extract different n-grams (e.g., (1, 1) for unigrams, (2, 2) for bigrams, etc.)
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5))
X_counts = vectorizer.fit_transform(urls)

# TF-IDF Transformation
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_counts)

# Extracted Features
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (N-grams):", feature_names)
print("TF-IDF Matrix:")
X_tfidf_data = X_tfidf.toarray()

Feature Names (N-grams): ['!' '!t' '!to' ... '~mr' '~mri' '~mric']
TF-IDF Matrix:


In [40]:
X_tfidf_data.shape

(2802, 123908)

In [41]:
run_ML(X_tfidf_data, labels, "URLdatasetX2", "tfIDF")

Run:  0 , fold:  0


MemoryError: Unable to allocate 2.07 GiB for an array with shape (2241, 123908) and data type float64