# CL_K-means_BC
This is the code for the paper entitled "[**Enhancing Online Security: A Novel Machine Learning Framework for Robust Detection of Known and Unknown Malicious URLs**]" 
Authors: Li Shiyun (lishiyu@kean.edu), Omar Dib(odib@kean.edu)  
Organization: Wenzhou Kean University

If you find this repository useful in your research, please cite:  


## Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

## Read the sampled CICIDS2017 dataset
The CICIDS2017 dataset is publicly available at: https://www.unb.ca/cic/datasets/ids-2017.html  
Due to the large size of this dataset, the sampled subsets of CICIDS2017 is used. The subsets are in the "data" folder.  
If you want to use this code on other datasets (e.g., CAN-intrusion dataset), just change the dataset name and follow the same steps. The models in this code are generic models that can be used in any intrusion detection/network traffic datasets.

In [3]:
#Read dataset
df = pd.read_csv('./data/CICIDS2017.csv') 
# The results in this code is based on the original CICIDS2017 dataset. Please go to cell [21] if you work on the sampled dataset. 

In [4]:
df

Unnamed: 0,url,type,use_of_ip,url_property,abnormal_url,count.,count-www,count@,count_dir,count_embed_domian,...,count?,count-,count=,url_length,hostname_length,sus_url,fd_length,tld_length,count-digits,count-letters
0,br-icloud.com.br,phishing,0,1,0,2,0,0,0,0,...,0,1,0,16,0,0,0,-1,0,13
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,0,2,0,0,2,0,...,0,0,0,35,0,0,5,-1,1,29
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,0,2,0,0,3,0,...,0,0,0,31,0,0,7,-1,1,25
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,1,1,3,1,0,1,0,...,1,1,4,88,21,0,9,2,7,63
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,1,1,2,0,0,1,0,...,1,1,3,235,23,0,9,3,22,199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,0,1,0,3,0,0,3,0,...,0,0,0,39,0,0,7,-1,12,21
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,0,1,0,2,0,0,4,0,...,0,2,0,44,0,0,8,-1,7,29
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,0,1,0,2,1,0,4,0,...,0,0,0,42,0,0,7,-1,3,33
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,0,1,0,2,0,0,2,0,...,0,0,0,45,0,0,4,-1,0,36


In [5]:
df.type.value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

## data split for biased classifiers

In [6]:
import pandas as pd
import numpy as np

# Filter the dataset to include only the 'benign', 'phishing', and 'malware' classes
train_df = df[df['type'].isin(['benign', 'defacement', 'malware'])]

# Get the 'benign' URLs
benign_urls = train_df[train_df['type'] == 'benign']['url']

# Randomly sample 94110 'benign' URLs
test_benign_urls = benign_urls.sample(n=94110, random_state=42)

# Create the test set
test_df = df[df['url'].isin(test_benign_urls) | (df['type'] == 'phishing')]

# Create the training set
train_df = train_df[~train_df['url'].isin(test_benign_urls)]

In [7]:
#Predictor Variables
X_train_biased = train_df[['use_of_ip','abnormal_url', 'count.', 'count-www', 'count@',
       'count_dir', 'count_embed_domian', 'short_url', 'count-https',
       'count-http', 'count%', 'count?', 'count-', 'count=', 'url_length',
       'hostname_length', 'sus_url', 'fd_length', 'tld_length', 'count-digits',
       'count-letters']]

X_test_biased = test_df[['use_of_ip','abnormal_url', 'count.', 'count-www', 'count@',
       'count_dir', 'count_embed_domian', 'short_url', 'count-https',
       'count-http', 'count%', 'count?', 'count-', 'count=', 'url_length',
       'hostname_length', 'sus_url', 'fd_length', 'tld_length', 'count-digits',
       'count-letters']]

#Target Variable
y_train_biased = train_df['url_property']

y_test_biased = test_df['url_property']

In [8]:
# Z-score normalization
features_test_X = X_test_biased.dtypes[X_test_biased.dtypes != 'object'].index
X_test_biased[features_test_X] = X_test_biased[features_test_X].apply(
    lambda x: (x - x.mean()) / (x.std()))
# Fill empty values by 0
X_test_biased = X_test_biased.fillna(0)

### Preprocessing (normalization and padding values)

In [9]:
# Z-score normalization
features = df.dtypes[df.dtypes != 'object'].index
df[features] = df[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# Fill empty values by 0
df = df.fillna(0)

In [10]:
labelencoder = LabelEncoder()
df.iloc[:, 1] = labelencoder.fit_transform(df.iloc[:, 1])

In [11]:
df.type.value_counts()

type
0    428103
1     96457
3     94111
2     32520
Name: count, dtype: int64

In [12]:
df = df.drop(['url_property'],axis=1)

In [13]:
df.to_csv('./data/CICIDS2017_sample_km.csv',index=0)

## Anomaly-based IDS

### Generate the port-scan datasets for unknown attack detection

In [14]:
df=pd.read_csv('./data/CICIDS2017_sample_km.csv')

In [15]:
df.type.value_counts()

type
0    428103
1     96457
3     94111
2     32520
Name: count, dtype: int64

In [16]:
df1 = df[df['type'] != 1]
df1['type'][df1['type'] > 0] = 1
df1.to_csv('./data/CICIDS2017_sample_km_without_defacement.csv',index=0)

In [17]:
df2 = df[df['type'] == 1]
df2['type'][df2['type'] == 1] = 1
df2.to_csv('./data/CICIDS2017_sample_km_defacement.csv',index=0)

### Read the generated datasets for unknown attack detection

In [18]:
df1 = pd.read_csv('./data/CICIDS2017_sample_km_without_defacement.csv')
df2 = pd.read_csv('./data/CICIDS2017_sample_km_defacement.csv')

In [19]:
features = df1.drop(['type'],axis=1).dtypes[df1.dtypes != 'object'].index
df1[features] = df1[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df2[features] = df2[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df1 = df1.fillna(0)
df2 = df2.fillna(0)

In [20]:
df1.type.value_counts()

type
0    428103
1    126631
Name: count, dtype: int64

In [21]:
df2.type.value_counts()

type
1    96457
Name: count, dtype: int64

In [22]:
df2p = df1[df1['type'] == 0]
df2pp = df2p.sample(n=None, frac=96457/428103, replace=False, weights=None, random_state=None, axis=0)
df2 = pd.concat([df2, df2pp])

# Remove the selected rows from df1
df1 = df1.drop(df2pp.index)

In [23]:
df1.type.value_counts()

type
0    331646
1    126631
Name: count, dtype: int64

In [24]:
df2.type.value_counts()

type
1    96457
0    96457
Name: count, dtype: int64

In [25]:
df2

Unnamed: 0,url,type,use_of_ip,abnormal_url,count.,count-www,count@,count_dir,count_embed_domian,short_url,...,count?,count-,count=,url_length,hostname_length,sus_url,fd_length,tld_length,count-digits,count-letters
0,http://www.garage-pirenne.be/index.php?option=...,1,0.000396,0.000423,0.181424,0.655655,-0.004554,-0.661530,0.000000,-0.166470,...,0.867129,-0.262667,0.675820,0.039647,0.341876,-0.163185,-0.124144,-0.776486,0.360470,-0.117240
1,http://adventure-nicaragua.net/index.php?optio...,1,0.000396,0.000423,-0.988493,-1.518748,-0.004554,-0.661530,0.000000,-0.166470,...,0.867129,-0.262667,0.272379,3.549246,0.684898,-0.163185,-0.124144,-0.028379,2.606372,4.011416
2,http://www.pashminaonline.com/pure-pashminas,1,0.000396,0.000423,-0.988493,0.655655,-0.004554,-0.661530,0.000000,-0.166470,...,-1.149912,-0.262667,-0.937943,-1.010846,0.513387,-0.163185,0.787566,-0.028379,-0.687617,-0.906542
3,http://www.ikenmijnkunst.nl/index.php/expositi...,1,0.000396,0.000423,0.181424,0.655655,-0.004554,0.940087,0.000000,-0.166470,...,-1.149912,-0.262667,-0.937943,-0.533349,0.170366,-0.163185,-0.124144,-0.776486,-0.088710,-0.511891
4,http://www.lebensmittel-ueberwachung.de/index....,1,0.000396,0.000423,1.351341,0.655655,-0.004554,0.139278,0.000000,-0.166470,...,-1.149912,-0.262667,-0.937943,-0.604974,2.228495,-0.163185,-0.124144,-0.776486,-0.537891,-0.511891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8806,beemp3.com/index.php?q=hey+say+jump&st=song,0,-0.115421,-0.444579,-0.053317,-0.337779,-0.044340,-0.876193,-0.041951,-0.267925,...,2.082603,-0.503818,1.671762,-0.288689,-0.376392,-0.282022,0.023927,-0.379739,-0.379435,-0.284547
244166,evri.com/person/macgillivray-milne-0x557b15,0,-0.115421,-0.444579,-0.699413,-0.337779,-0.044340,-0.251417,-0.041951,-0.267925,...,-0.407253,0.156501,-0.300372,-0.288689,-0.376392,-0.282022,-0.213305,-0.379739,0.025082,-0.318067
369610,dojos.com/sansookungfu/index.htm,0,-0.115421,-0.444579,-0.053317,-0.337779,-0.044340,-0.251417,-0.041951,-0.267925,...,-0.407253,-0.503818,-0.300372,-0.540635,-0.376392,-0.282022,0.261159,-0.379739,-0.460338,-0.452149
467392,pastehtml.com/view/bxeea71ky.html,0,-0.115421,-0.444579,-0.053317,-0.337779,-0.044340,-0.251417,-0.041951,-0.267925,...,-0.407253,-0.503818,-0.300372,-0.517731,-0.376392,-0.282022,-0.371459,-0.379739,-0.298531,-0.485670


In [26]:
df = pd.concat([df1, df2], ignore_index=True)

In [27]:
X = df.drop(['type','url'],axis=1) .values
y = df.iloc[:, 1].values.reshape(-1,1)
y=np.ravel(y)
pd.Series(y).value_counts()

0    428103
1    223088
Name: count, dtype: int64

In [28]:
y

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
print(y.dtype)

int64


In [30]:
X.dtype

dtype('float64')

### Feature engineering (IG, FCBF, and KPCA)

#### Feature selection by information gain (IG)

In [31]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X, y)

In [32]:
# calculate the sum of importance scores
f_list = sorted(zip(map(lambda x: round(x, 4), importances), features), reverse=True)
Sum = 0
fs = []
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [33]:
# select the important features from top to bottom until the accumulated importance reaches 90%
f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2>=0.9:
        break        

In [34]:
fs

['hostname_length',
 'abnormal_url',
 'count-http',
 'count-www',
 'count_dir',
 'tld_length',
 'count.',
 'count-',
 'count-https',
 'use_of_ip',
 'count?',
 'count-letters',
 'sus_url',
 'fd_length',
 'short_url',
 'count=',
 'count-digits',
 'url_length',
 'count%']

In [35]:
X_fs = df[fs].values

In [36]:
X_fs.shape

(651191, 19)

In [37]:
X_fs

array([[-0.3763922 , -0.44457923, -0.43718317, ..., -0.46033787,
        -0.90710258, -0.12494503],
       [-0.3763922 , -0.44457923, -0.43718317, ..., -0.37943453,
        -0.56353968, -0.12494503],
       [ 1.22118181,  2.24931381,  2.07075749, ..., -0.37943453,
         1.42912516, -0.12494503],
       ...,
       [-0.3763922 , -0.44457923, -0.43718317, ..., -0.46033787,
        -0.54063548, -0.12494503],
       [-0.3763922 , -0.44457923, -0.43718317, ..., -0.2985312 ,
        -0.51773129, -0.12494503],
       [-0.3763922 , -0.44457923, -0.43718317, ..., -0.46033787,
        -0.67806065, -0.12494503]])

#### Feature selection by Fast Correlation Based Filter (FCBF)

The module is imported from the GitHub repo: https://github.com/SantiagoEG/FCBF_module

In [38]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBFK(k = 12)
#fcbf.fit(X_fs, y)

In [39]:
X_fss = fcbf.fit_transform(X_fs,y)

In [40]:
X_fss.shape

(651191, 12)

In [41]:
X_fss

array([[-0.11542079, -0.17239686, -0.44457923, ..., -0.40725307,
        -0.3763922 , -0.30037198],
       [-0.11542079, -0.17239686, -0.44457923, ..., -0.40725307,
        -0.3763922 , -0.30037198],
       [-0.11542079, -0.17239686,  2.24931381, ..., -0.40725307,
         1.22118181, -0.30037198],
       ...,
       [-0.11542079, -0.17239686, -0.44457923, ..., -0.40725307,
        -0.3763922 , -0.30037198],
       [-0.11542079, -0.17239686, -0.44457923, ..., -0.40725307,
        -0.3763922 , -0.30037198],
       [-0.11542079, -0.17239686, -0.44457923, ..., -0.40725307,
        -0.3763922 , -0.30037198]])

### Train-test split after feature selection

In [42]:
X_train = X_fss[:len(df1)]
y_train = y[:len(df1)]
X_test = X_fss[len(df1):]
y_test = y[len(df1):]

### Solve class-imbalance by SMOTE

In [43]:
pd.Series(y_train).value_counts()

0    331646
1    126631
Name: count, dtype: int64

In [44]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(n_jobs=-1,sampling_strategy={1:331646})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [45]:
pd.Series(y_train).value_counts()

1    331646
0    331646
Name: count, dtype: int64

In [46]:
pd.Series(y_test).value_counts()

1    96457
0    96457
Name: count, dtype: int64

### Apply the cluster labeling (CL) k-means method

In [47]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN,MeanShift
from sklearn.cluster import SpectralClustering,AgglomerativeClustering,AffinityPropagation,Birch,MiniBatchKMeans,MeanShift 
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import classification_report
from sklearn import metrics

In [48]:
import time

def CL_kmeans(X_train, X_test, y_train, y_test, n, b=100):
    start_time = time.time()
    
    km_cluster = MiniBatchKMeans(n_clusters=n, batch_size=b)
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count = 0
    a = np.zeros(n)
    b = np.zeros(n)
    for v in range(0, n):
        for i in range(0, len(y_train)):
            if result[i] == v:
                if y_train[i] == 1:
                    a[v] = a[v] + 1
                else:
                    b[v] = b[v] + 1
    list1 = []
    list2 = []
    for v in range(0, n):
        if a[v] <= b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0, len(y_test)):
        if result2[v] in list1:
            result2[v] = 0
        elif result2[v] in list2:
            result2[v] = 1
        else:
            print("-1")
    
    end_time = time.time()
    total_time = end_time - start_time
    print(f"Total execution time: {total_time:.2f} seconds")
    
    single_prediction_time = total_time / len(X_test)
    print(f"Single data point prediction time: {single_prediction_time:.6f} seconds")
    
    print(classification_report(y_test, result2))
    cm = confusion_matrix(y_test, result2)
    acc = metrics.accuracy_score(y_test, result2)
    print(str(acc))
    print(cm)

In [49]:
CL_kmeans(X_train, X_test, y_train, y_test, 8)

Total execution time: 0.83 seconds
Single data point prediction time: 0.000004 seconds
              precision    recall  f1-score   support

           0       0.48      0.89      0.62     96457
           1       0.22      0.03      0.05     96457

    accuracy                           0.46    192914
   macro avg       0.35      0.46      0.34    192914
weighted avg       0.35      0.46      0.34    192914

0.4591579667623915
[[85558 10899]
 [93437  3020]]


### Hyperparameter optimization of CL-k-means
Tune "k"

In [50]:
#Hyperparameter optimization by BO-GP
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn import metrics

space  = [Integer(2, 50, name='n_clusters')]
@use_named_args(space)
def objective(**params):
    km_cluster = MiniBatchKMeans(batch_size=100, **params)
    n=params['n_clusters']
    
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    cm=metrics.accuracy_score(y_test,result2)
    precision = metrics.precision_score(y_test, result2)
    recall = metrics.recall_score(y_test, result2)
    f1 = metrics.f1_score(y_test, result2)

    # Print the confusion matrix
    print("Confusion matrix for n_clusters=%d:" % n)
    print(metrics.confusion_matrix(y_test, result2))
    
    print(str(n)+" Accuracy: %.4f, Precision: %.4f, Recall: %.4f, F1-score: %.4f" % (cm, precision, recall, f1))
    return (1-cm)
from skopt import gp_minimize
import time
t1=time.time()
res_gp = gp_minimize(objective, space, n_calls=20, random_state=0)
t2=time.time()
print(t2-t1)
print("Best score=%.4f" % (1-res_gp.fun))
print("""Best parameters: n_clusters=%d""" % (res_gp.x[0]))

Confusion matrix for n_clusters=30:
[[85986 10471]
 [46232 50225]]
30 Accuracy: 0.7061, Precision: 0.8275, Recall: 0.5207, F1-score: 0.6392
Confusion matrix for n_clusters=43:
[[86031 10426]
 [93556  2901]]
43 Accuracy: 0.4610, Precision: 0.2177, Recall: 0.0301, F1-score: 0.0528
Confusion matrix for n_clusters=43:
[[86141 10316]
 [93663  2794]]
43 Accuracy: 0.4610, Precision: 0.2131, Recall: 0.0290, F1-score: 0.0510
Confusion matrix for n_clusters=43:
[[86393 10064]
 [93758  2699]]
43 Accuracy: 0.4618, Precision: 0.2115, Recall: 0.0280, F1-score: 0.0494
Confusion matrix for n_clusters=32:
[[85950 10507]
 [46382 50075]]
32 Accuracy: 0.7051, Precision: 0.8266, Recall: 0.5191, F1-score: 0.6377
Confusion matrix for n_clusters=20:
[[85383 11074]
 [40923 55534]]
20 Accuracy: 0.7305, Precision: 0.8337, Recall: 0.5757, F1-score: 0.6811
Confusion matrix for n_clusters=16:
[[86534  9923]
 [94451  2006]]
16 Accuracy: 0.4590, Precision: 0.1682, Recall: 0.0208, F1-score: 0.0370
Confusion matrix for

In [51]:
#Hyperparameter optimization by BO-TPE
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

def objective(params):
    params = {
        'n_clusters': int(params['n_clusters']), 
    }
    km_cluster = MiniBatchKMeans(batch_size=100, **params)
    n=params['n_clusters']
    
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else: 
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    score=metrics.accuracy_score(y_test,result2)
    
    # Calculate and print additional metrics
    cm = confusion_matrix(y_test, result2)
    tn, fp, fn, tp = cm.ravel()
    precision = precision_score(y_test, result2)
    recall = recall_score(y_test, result2)
    f1 = f1_score(y_test, result2)
    print(f"n_clusters: {n}, Accuracy: {score}, Precision: {precision}, Recall: {recall}, F1-score: {f1}")
    print("Confusion Matrix:")
    print(cm)
    
    return {'loss':1-score, 'status': STATUS_OK}

space = {
    'n_clusters': hp.quniform('n_clusters', 2, 50, 1),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("Random Forest: Hyperopt estimated optimum {}".format(best))

n_clusters: 8, Accuracy: 0.4582197248514882, Precision: 0.17421180274858528, Recall: 0.022341561524824534, F1-score: 0.03960414235437897
Confusion Matrix:                                     
[[86242 10215]                                        
 [94302  2155]]
n_clusters: 31, Accuracy: 0.4588262127165473, Precision: 0.20923932937989603, Recall: 0.029629783219465668, F1-score: 0.05190889607323187
Confusion Matrix:                                                               
[[85656 10801]                                                                  
 [93599  2858]]
n_clusters: 32, Accuracy: 0.46031910592284647, Precision: 0.2042803059568879, Recall: 0.027411178037882164, F1-score: 0.048336380255941495
Confusion Matrix:                                                               
[[86158 10299]                                                                  
 [93813  2644]]
n_clusters: 46, Accuracy: 0.46924536321884364, Precision: 0.25642499384185896, Recall: 0.032377121411613

In [52]:
CL_kmeans(X_train, X_test, y_train, y_test, 37)

Total execution time: 4.39 seconds
Single data point prediction time: 0.000023 seconds
              precision    recall  f1-score   support

           0       0.48      0.89      0.62     96457
           1       0.20      0.03      0.05     96457

    accuracy                           0.46    192914
   macro avg       0.34      0.46      0.33    192914
weighted avg       0.34      0.46      0.33    192914

0.4587432742050862
[[85937 10520]
 [93896  2561]]


### Apply the CL-k-means model with biased classifiers

In [53]:
# CL-kmeans
n=37
b=100
km_cluster = MiniBatchKMeans(n_clusters=n,batch_size=b)
result = km_cluster.fit_predict(X_train)
result2 = km_cluster.fit_predict(X_test)

count=0
a=np.zeros(n)
b=np.zeros(n)
for v in range(0,n):
    for i in range(0,len(y_train)):
        if result[i]==v:
            if y_train[i]==1:
                a[v]=a[v]+1
            else:
                b[v]=b[v]+1

# 计算每个聚类的 confidence
confidence = np.zeros(n)
for v in range(0, n):
    for i in range(0, len(y_train)):
        total = a[v] + b[v]
        if result[i] == v:
            if total > 0:
                confidence[v] = max(a[v], b[v]) / total
            else:
                confidence[v] = 0

# 根据 confidence 阈值将聚类划分为"可信"和"不可信"
threshold_confidence = 0.93  # 可以根据需要调整阈值
list1 = []
list2 = []
list3 = []
list4 = []

added_indices = set()

for v in range(0, n):
    if v in added_indices:
        continue
    if confidence[v] > threshold_confidence:
        if a[v] > b[v]:
            list1.append(v)
        else:
            list2.append(v)
    elif a[v] > b[v]:
        list3.append(v)
    else:
        list4.append(v)

    added_indices.add(v)

X_new_testp = []
y_new_testp = []
X_new_testn = []
y_new_testn = []
for i in range(0, len(y_test)):
    if result2[i] in list1:
        result2[i] = 1
    elif result2[i] in list2:
        result2[i] = 0
    elif result2[i] in list3:
        X_new_testp.append(X_test[i])
        y_new_testp.append(y_test[i])
    elif result2[i] in list4:
        X_new_testn.append(X_test[i])
        y_new_testn.append(y_test[i])
    else:
        print("-1")  # 未分类

X_new_testp = pd.DataFrame(X_new_testp)
y_new_testp = pd.DataFrame(y_new_testp)
X_new_testn = pd.DataFrame(X_new_testn)
y_new_testn = pd.DataFrame(y_new_testn)

#X_testp = new_testp.drop(['type','url'],axis=1)
#y_testp = new_testp.iloc[:, -1].values.reshape(-1,1)
#y_testp=np.ravel(y_testp)
#threshold = 0.5
#y_testp = (y_testp > threshold).astype(int)
    
#X_testn = new_testn.drop(['type','url'],axis=1)  
#y_testn = new_testn.iloc[:, -1].values.reshape(-1,1)
#y_testn=np.ravel(y_testn)
#threshold = 0.5
#y_testn = (y_testn > threshold).astype(int)
    
#Biased classifier construction
count=0
print(len(y_train))
a=np.zeros(n)
b=np.zeros(n)
FNL=[]
FPL=[]
for v in range(0,n):
    al=[]
    bl=[]
    for i in range(0,len(y_train)):   
        if result[i]==v:        
            if y_train[i]==1:        #label 1
                a[v]=a[v]+1
                al.append(i)
            else:             #label 0
                b[v]=b[v]+1
                bl.append(i)
    if a[v]<=b[v]:
        FNL.extend(al)
    else:
        FPL.extend(bl)
    #print(str(v)+"="+str(a[v]/(a[v]+b[v])))
    
X_dffp = X_train[FPL, :]
y_dffp = y_train[FPL]
X_dffn = X_train[FNL, :]
y_dffn = y_train[FNL]
X_dfva0=X_train[y_train==0]
y_dfva0=y_train[y_train==0]
X_dfva1=X_train[y_train==1]
y_dfva1=y_train[y_train==1]

X_dffpp = X_dfva1[np.random.choice(X_dfva1.shape[0], size=len(FPL), replace=False)]
X_dffnp = X_dfva0[np.random.choice(X_dfva0.shape[0], size=len(FPL), replace=False)]
#X_dffpp=X_dfva1.sample(n=None, frac=len(FPL)/X_dfva1.shape[0], replace=False, weights=None, random_state=None, axis=0)
y_dffpp = y_dfva1[:len(X_dffpp)]
#X_dffnp=X_dfva0.sample(n=None, frac=len(FNL)/X_dfva0.shape[0], replace=False, weights=None, random_state=None, axis=0)
y_dffnp = y_dfva0[:len(X_dffnp)]

Xp = np.concatenate([X_dffp, X_dffpp], axis=0)
yp = np.concatenate([y_dffp, y_dffpp], axis=0)
Xn = np.concatenate([X_dffn, X_dffnp], axis=0)
yn = np.concatenate([y_dffn, y_dffnp], axis=0)

# first biased classifier
rfp = RandomForestClassifier(random_state = 0)
rfp.fit(Xp,yp)

result3 = rfp.predict(X_new_testp)
#result3 = result3.astype(y_new_testp.dtype)
print(classification_report(y_new_testp, result3))
acc=metrics.accuracy_score(y_new_testp,result3)
print(acc)
cm=confusion_matrix(y_new_testp,result3)
print(cm)

# second biased classifier
rfn = RandomForestClassifier(random_state = 0)
rfn.fit(Xn,yn)

result4 = rfn.predict(X_new_testn)
#result4 = result4.astype(y_new_testn.dtype)
print(classification_report(y_new_testn, result4))
acc=metrics.accuracy_score(y_new_testn,result4)
print(acc)
cm=confusion_matrix(y_new_testn,result4)
print(cm)

#print(Xp.shape)
#print(Xn.shape)

#dffnn_f=pd.concat([dffn, dffnp])
    
#Xnn = dffn_f.drop(['type','url'],axis=1)  
#ynn = dffn_f.iloc[:, -1].values.reshape(-1,1)
#ynn=np.ravel(ynn)
#ynn = (ynn > threshold).astype(int)

#rfnn = RandomForestClassifier(random_state = 0)
#rfnn.fit(Xnn,ynn)

result5 = km_cluster.predict(X_test)

count=0
a=np.zeros(n)
b=np.zeros(n)
for v in range(0,n):
    for i in range(0,len(y_train)):
        if result[i]==v:
            if y_train[i]==1:
                a[v]=a[v]+1
            else:
                b[v]=b[v]+1
list5=[]
list6=[]
for v in range(0,n):
    if a[v]<=b[v]:
        list5.append(v)
    else: 
        list6.append(v)
for v in range(0,len(y_test)):
    if result5[v] in list5:
        result5[v]=0
    elif result5[v] in list6:
        result5[v]=1
    else:
        print("-1")
print(classification_report(y_test, result5))
cm=confusion_matrix(y_test,result5)
acc=metrics.accuracy_score(y_test,result5)
print(str(acc))
print(cm)

663292
              precision    recall  f1-score   support

           0       0.98      0.68      0.80     10700
           1       0.91      0.99      0.95     34474

    accuracy                           0.92     45174
   macro avg       0.94      0.84      0.87     45174
weighted avg       0.92      0.92      0.91     45174

0.9191791738610705
[[ 7228  3472]
 [  179 34295]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      5819
           1       0.98      0.97      0.97     13216

    accuracy                           0.96     19035
   macro avg       0.95      0.96      0.95     19035
weighted avg       0.96      0.96      0.96     19035

0.9609666403992645
[[ 5516   303]
 [  440 12776]]
              precision    recall  f1-score   support

           0       0.49      0.24      0.32     96457
           1       0.50      0.74      0.59     96457

    accuracy                           0.49    192914
   macro avg       0.

In [54]:
#Xp = np.concatenate([X_dffp, X_dffpp], axis=0)
yp1 = np.concatenate([y_dffp, y_dffpp], axis=0)
#Xn = np.concatenate([X_dffn, X_dffnp], axis=0)
#yp = np.concatenate([y_dffn, y_dffnp], axis=0)

In [55]:
yp.shape

(68266,)

In [56]:
y_dffpp.shape

(34133,)

In [57]:
X_dffp.shape

(34133, 12)

In [58]:
X_dffpp.shape

(34133, 12)

In [59]:
y_dffp.shape

(34133,)

In [60]:
Xp.shape

(68266, 12)

In [61]:
yp.shape

(68266,)

In [62]:
Xp.shape

(68266, 12)

In [63]:
list1

[1, 4, 5, 11, 15, 17, 21, 25, 34, 36]

In [64]:
list2

[0, 3, 12, 13, 16, 22, 32]

In [65]:
list3

[2, 7, 8, 9, 18, 19, 20, 23, 27, 28, 29, 31, 33]

In [66]:
list4

[6, 10, 14, 24, 26, 30, 35]

In [67]:
print(len(FNL))

105918


In [68]:
print(type(result2))
print(len(result2))

<class 'numpy.ndarray'>
192914


In [69]:
print(result2[:1000])

[ 1 28 23  1 27  0  1 18 28 18 35  9 23  9  2  9 18  0  7  1  0  0  1 23
  9  0  0  0 23  1 20  1 10 23  0  1  1  1 29 35  1 27  1  1  1 10 23  0
 29 28  1  0  6  1 29 27  1  9 27 10 23  1  1 35 35  1  1  1 23 33  1  9
  2 35 19  1 23  1 35  1  1  0 18  9 14  1  9 29  1 19 10  1  1 35  0  1
  1  0 27  0 29  1  0  0  1  1 35 33 35  8  0  8 35  9  0  2 29  1  1 35
 35  1 35  1 23  1  1  0  1  1  1 18  1 19 28 23 10  1  1  1  1 29 23  6
  1  1  0 27 19 35  1  1  1 28 10  1 35  0  0  1 23  9  1  1  0  0  0 10
 18 33  9  0 35  1  1  1  1  1 35 10  1 27  8  1  1  1 18 23  1 18  1  0
 35  0  1  1  0  1  1  1  1 18  1 19  1  9  1  1 23  1  9  1  1 26  0  1
  0  1  1 35 10  1 23  0  1 28  1  1  1  1  0  9  1  1  1  1 27  0 35  1
  1 18 10  1 35  1 33  1  1 19  1  1  1 20  1 29  1  1  1  1  9 29  1 29
  1 33  6  1  1  1  1  1 19  1 10  1  1 18  1 35  1  9  1 29  1  0  1  1
  0  1 35 35  1 18  1 10  1 10  9  1  1  1  1  1 23 33 18 18 35  1  0 19
  1  1 18  1 23  9 27 23  1 28  1  1 28  1 26 35 10

In [70]:
print(result[:1000])

[30 30  8 30 14 30 30 30 30 30 10 30 30 30 32 30 27 23 30 30 32 14 15 33
 30 30 30 30 30 30  2 30 30 30 30 30 30 14 14 30 30 30 30 30 32 30 30 30
 30 28 32 30  0 30 21 30 30  6 10 30 30 30 30 30 30 30 21 30 30 30 14  2
 19 14 30 30 10 30 30 30 10  6 27 30 30 21 30 30 10 30 30 14  8 30 19 30
  0 10 32 30 30 30 30 30 30 30 30 30 32  8 30 14 30 27 30 30 32 30 30 30
 30 30 30 30 30 32 18 30 30 30 19 30 30 30 30 30  8 30 32 22 30 14 32 30
 32 30  0 14 20 14 30 19 30 30 30 30 30 21 30  7 30 32 14 30 32 32 30 30
 30 30 32 30 30 30 32 26 30 32 10 30 32 32 30 30 32 30 32 30 30 14 30 32
 30 30 30 30 30 18 18 10 30 30 30 30 23 30 32 30 19 27 30 30 30 30 30 30
 30 32 14 30 27 19 30 30 30 30 30 30 30 19 30 32 30 30 30 30 30 14 30 30
 30  8 22 30 30  8 30 30  7 30 32  0 30 30 26 26 30 30 30 30 30 30 30 30
 30 30 30 30 30 30 23  7 18 14 30 30 30 30 14 19 30 30 22 32 32  8 30 30
 32  0 30 30 30 32 30  7 30 30 30 32 14 30 30 18 14 30 30 22 30 30 33 30
 19 30 30 30 30 32 30 27 30 30 30 30 30 14  8 30 30

In [71]:
import pandas as pd

y_test_series = pd.Series(y_test)
y_test_series.value_counts()

1    96457
0    96457
Name: count, dtype: int64

In [72]:
X_testp

NameError: name 'X_testp' is not defined

In [None]:
Xp

In [None]:
import pandas as pd

# 假设 X_testp 是一个 Pandas DataFrame
unique_rows = X_testp.drop_duplicates()
duplicate_rows = X_testp[X_testp.duplicated()]

print(f"Number of unique rows: {len(unique_rows)}")
print(f"Number of duplicate rows: {len(duplicate_rows)}")

In [None]:
df2

In [None]:
Xp

In [None]:
X_testn

In [None]:
Xn

In [None]:
X_testn

In [None]:
# 假设你已经有了 X_testp 这个 pandas DataFrame
print(X_testp.columns)

In [None]:
# 假设 X_testn 和 Xn 是 pandas DataFrame, y_testn 是 numpy array
# 我们先定义要比较的列名
compare_cols = ['use_of_ip', 'abnormal_url', 'count.', 'count-www', 'count@',
               'count_dir', 'count_embed_domian', 'short_url', 'count-https',
               'count-http', 'count%']

# 找出 X_testn 和 Xn 中相同的行
common_rows_p = X_testp[X_testp.apply(lambda row: any(Xp[compare_cols].eq(row[compare_cols]).all(axis=1)), axis=1)].index.tolist()

# 确保 common_rows 中的索引值都存在于 y_testn 中
common_rows_p = [index for index in common_rows_p if index < len(y_testn)]

# 从 X_testn 中删除这些相同的行
X_testp_cleaned = X_testp.drop(common_rows_p)

# 从 y_testn 中删除这些相同的行
y_testp_cleaned = np.delete(y_testp, common_rows_p)