#  Load Packages and Dataset

In [1]:
import sys
# Put the project root on the import path so the project-local modules used
# further down (config, utils.*) can be imported.
# NOTE(review): hard-coded absolute Windows path -- edit it to point at your own checkout.
sys.path.append(r"F:\Lecture\Project\Project for Network IDS")

In [6]:
# Import the packages the notebook needs.
# The config star-import stays first so that the explicit imports below keep
# taking precedence over any same-named symbol exported by config.
from config import *

import csv
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from tqdm import tqdm

%matplotlib inline

In [7]:
# Display the dataset path as a sanity check before loading.
# NOTE(review): LIVE_DATASET_PATH is presumably provided by the config star-import -- confirm.
LIVE_DATASET_PATH

'....\\dataset\\used\\live_data.csv'

In [8]:
# Load the dataset to inspect which features it has and how large it is.
data = pd.read_csv(LIVE_DATASET_PATH)
print(data.shape)
data.head(n=10)  # first 10 records

FileNotFoundError: [Errno 2] No such file or directory: '....\\dataset\\used\\live_data.csv'

In [None]:
# Class balance: bar chart of the label counts, annotated with each class's
# share of all records.
total = float(len(data))
ax = sns.countplot(x="label", data=data)
for p in ax.patches:
    # Write the percentage just above the bar (0.3 right of its left edge).
    ax.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x()+0.3, p.get_height()+5))

# Put 11 evenly spaced ticks on the y axis, from 0 up to the total row count ...
ax.yaxis.set_ticks(np.linspace(0, total, 11))
# ... and relabel them as percentages without moving them. A concrete list is
# passed because set_yticklabels is documented to take a sequence, not an iterator.
ax.set_yticklabels(['{:.1f}%'.format(pct) for pct in 100*ax.yaxis.get_majorticklocs()/total])
plt.show()

# Data Preprocessing

## Features

In [None]:
# Load the table that describes every feature (comma-separated, cp1252-encoded).
data_features = pd.read_csv(FEATURE_DATASET_PATH, encoding='cp1252')
print(data_features.shape)
data_features.head(n=49)

In [None]:
# Names of the categorical (category/object dtype) columns in the raw data.
cat_feature = data.select_dtypes(include=['category', 'object']).columns
cat_feature

In [None]:
# Names of the numerical columns, leaving out the 'id' and 'label' columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_features = (
    data
    .drop(columns=['id', 'label'])
    .select_dtypes(include=numerics)
    .columns
)
num_features

## Split Training and Test Data

In [None]:
from sklearn.model_selection import train_test_split

# Features are the first 43 columns; the target is the 'label' column.
# NOTE(review): 43 is a positional cut-off -- confirm it excludes every target column.
X = data.iloc[:, :43]
y = data["label"]

# 70/30 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

## Dropping Highly Correlated Features

### Feature Correlation

In [None]:
# Some features are strongly correlated with each other; compute the pairwise
# correlation of the training features so the redundant ones can be found and dropped.
# Only numeric/bool columns are passed to corr(): with string columns still present,
# DataFrame.corr() raises on pandas >= 2.0 (older versions silently skipped them).
df_corr = X_train.select_dtypes(include=['number', 'bool']).corr()

# Make sure the output folder exists before saving the figure.
os.makedirs('figures', exist_ok=True)
plt.figure(figsize=(30,20))
sns.heatmap(df_corr, annot=True, cmap=plt.cm.viridis)
plt.savefig('figures/correlation_matrix.png')
plt.show()

### Drop Features Correlation Above 0.95

In [None]:
# Draw the correlation matrix of the training features with the project helper.
# NOTE(review): visualize_cor_matrix comes from the utils.graph_utils star-import; it
# presumably saves the figure under the `fig_name` prefix -- confirm in the helper.
from utils.graph_utils import *
fig_name = "figures/cor_matrix"
visualize_cor_matrix(fig_name, X_train)

In [None]:
# List every pair of features whose correlation magnitude exceeds 0.95.
# The absolute value is used so the listing matches the drop step, which also
# thresholds |corr|; strongly negative pairs would otherwise be missed here.
high_corr_var = np.where(df_corr.abs() > 0.95)

# np.where returns (row_positions, col_positions); zip(*...) unpacks them into
# (row, col) pairs. Keeping only row < col skips the diagonal and reports each
# symmetric pair once.
high_corr_var_pairs = [
    (df_corr.columns[x], df_corr.columns[y])
    for x, y in zip(*high_corr_var)
    if x < y
]
high_corr_var_pairs

In [None]:
# Remove highly correlated features.
# Correlation is computed on numeric/bool columns only: with string columns present,
# DataFrame.corr() raises on pandas >= 2.0 (older versions silently skipped them).
corr_matrix = X_train.select_dtypes(include=['number', 'bool']).corr().abs()
# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (always 1.0) is ignored.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# A column is dropped when it correlates above 0.95 with any column that comes before it.
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
X_train = X_train.drop(columns=to_drop)

# Print summary of dropped features
print(f"Dropped {len(to_drop)} highly correlated features:")
for feature in to_drop:
    print(f"- {feature}")

# Print summary of remaining features
print(f"\n{len(X_train.columns)} features remaining:")
print(X_train.columns.tolist())


## Feature Mapping (categorical to numerical)

In [None]:
# Find the categorical columns left in the training features; they are label-encoded
# next, which leaves the shape of the frame unchanged.
cat_feature = X_train.select_dtypes(include=['category', 'object']).columns
cat_feature

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical training column with integer codes; every column is
# fitted on its own, and the fitted encoders are not kept.
X_train[cat_feature] = X_train[cat_feature].apply(
    lambda column: LabelEncoder().fit_transform(column)
)
X_train.head()

## ARM for feature selection

In [None]:
# Next we use association rule mining (ARM) for feature selection.
# A feature set is only considered if it clears minimum thresholds on support and confidence.
# In this case we fix the support at 30% and the confidence at 70%.
# Any set of features that meets these thresholds is treated as frequent and kept.

In [None]:
# To reduce the time complexity, the training data is shuffled and divided into
# 42 (nearly) equal parts.
# The shuffle is seeded so the parts -- and the rules mined from them -- are
# reproducible across kernel restarts.
shuffled = X_train.sample(frac=1, random_state=42)
# Split row positions rather than the frame itself: np.array_split on a DataFrame
# is deprecated in recent pandas, while slicing by position yields the same parts.
data_42 = [
    shuffled.iloc[rows]
    for rows in np.array_split(np.arange(len(shuffled)), 42)
]
len(data_42)

In [None]:
from utils.arm import *

# Mine frequent itemsets / rules on every part and collect the columns they mention.
# NOTE(review): create_arm_data and create_arm_rule come from the utils.arm star-import;
# the comments on them below follow the original notebook's description -- confirm in the helpers.
col_ruled_sets = []   # one set of column names per part
rules_list = []       # one rules DataFrame per part
for i, part in enumerate(data_42, start=1):
    print("===Started dataset "+ str(i) +"====")
    # Drop the id column before mining.
    part = part.drop(['id'], axis=1)
    print(part.shape)
    # Create the binary-mode data expected by apriori.
    part_binary = create_arm_data(part)
    # Use the apriori algorithm to find frequent itemsets (support >= 30%, at most 2 items).
    result = apriori(part_binary, min_support=0.3, use_colnames=True, max_len=2)
    # Create the rules from the frequent itemsets.
    arm_rules = create_arm_rule(result)
    rules_list.append(arm_rules)
    final_columns = arm_rules['rules_sorted'].unique()
    # Each 'rules_sorted' entry is a comma-separated string of column names;
    # gather every column mentioned by any rule of this part.
    col_final = {col for row in final_columns for col in row.split(",")}
    print(col_final)
    col_ruled_sets.append(col_final)
    print("===Completed dataset "+ str(i) +"====")

# Concatenate all the rules into a single DataFrame
rules_df = pd.concat(rules_list, ignore_index=True)

# Round the float64 columns to 8 decimal places (vectorised instead of row-by-row).
float_cols = rules_df.select_dtypes(include='float64').columns
rules_df[float_cols] = rules_df[float_cols].round(8)

# Write the rules to a CSV file, creating the output folder if it is missing.
os.makedirs('csvs', exist_ok=True)
rules_df.to_csv('csvs/association_rules.csv', index=False)

In [None]:
# Merge the per-part column sets into one set of selected features.
# Associative rule mining looks for interesting relationships between the items
# (features) of the dataset: it first identifies frequent itemsets -- items that
# frequently appear together -- and then derives association rules that describe
# the relationships between those items.
col_set = set().union(*col_ruled_sets)
print(len(col_set))
col_set

In [None]:
# Persist the selected feature names.
# They are written in sorted order because iteration order of a set of strings
# changes between Python processes, which made the file differ from run to run.
os.makedirs("txts", exist_ok=True)  # the output folder may not exist yet
with open("txts/feature_selected.txt", "w") as file:
    file.write(str(sorted(col_set)))

Apply a Bloom filter between the rules generated from the original data and those generated from the live data (if any exist from a previous round)

For the initial round we still apply the Bloom filter: the code is designed so that it stores the same rules generated from the original data, so the filter has no effect in the first round.

# ML Process

## Construct Dataset with Selected Features

Prepare Extracted train data  from Original Data D

2.1. Using the rules repository R obtained in step 1.3, 
extract the relevant features from the Original Data D and 
create a new Train data D' containing only these features.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical test columns with the SAME mapping the training split got.
# Fitting a fresh LabelEncoder on the test split (as before) assigns codes from the
# categories that happen to occur in the test rows, so one category could receive
# different integers in train and test.
# The mapping is rebuilt from the raw training rows in `data` (X_train itself already
# holds integer codes); LabelEncoder produces the same classes as when X_train was encoded.
# Raw test values are also read from `data`, which keeps this cell idempotent.
X_test = X_test.copy()
for col in cat_feature:
    encoder = LabelEncoder().fit(data.loc[X_train.index, col])
    # Position within the training classes == LabelEncoder code.
    # Categories never seen in training become -1.
    X_test[col] = pd.Categorical(
        data.loc[X_test.index, col], categories=encoder.classes_
    ).codes.astype('int64')

# Sorted so the column order is deterministic (set iteration order of strings
# changes between Python processes).
selected_features = sorted(col_set)
df_train = X_train[selected_features]
df_test = X_test[selected_features]

In [None]:
# Display the training frame restricted to the selected features.
df_train

In [None]:
# Display the test frame restricted to the selected features.
df_test

Train the Machine and Standardize the data

4.1. Train a machine learning model on the preprocessed D' obtained in step 2.1.

In [None]:
from sklearn import preprocessing

# Min-max scale the selected features. The scaler is fitted on the training data
# only and then applied to both splits, so no test information leaks into it.
x = df_train.values
x_test = df_test.values
std_scaler = preprocessing.MinMaxScaler()  # note: min-max scaling despite the variable name
std_scaler.fit(x)
x_scaled = std_scaler.transform(x)
x_scaled_test = std_scaler.transform(x_test)

# Rebuild the frames with their original row index. Without it the frames get a
# fresh 0..n-1 index and no longer line up with y_train / y_test, so any
# label-based selection (e.g. df_test.loc[y_test.index]) would pick the wrong rows.
# Column labels are left as default integer positions, as before.
df_train = pd.DataFrame(x_scaled, index=df_train.index)
df_test = pd.DataFrame(x_scaled_test, index=df_test.index)

In [None]:
# Peek at the first rows of the scaled training frame.
df_train.head()

In [None]:
# Sanity check: features and labels must have the same number of rows in each split.
print("train data shape", df_train.shape, y_train.shape)
print("test data shape", df_test.shape, y_test.shape)

## Models

In [None]:
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
# from sklearn.datasets import make_classification
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Normalizer

# import xgboost as xgb
# from sklearn.calibration import CalibratedClassifierCV
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import LogisticRegression, SGDClassifier
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier, GradientBoostingClassifier, AdaBoostClassifier
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier

In [None]:
from pyod.models import lof, cblof, cof, pca, iforest, knn, mcd, ocsvm, abod, hbos, inne
# from pyod.models.feature_bagging import FeatureBagging 
from sklearn.metrics import log_loss, accuracy_score, f1_score, confusion_matrix, roc_curve, auc
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Value handed to every detector as its `contamination` argument.
outliers_fraction = 0.05

# Detectors to benchmark, keyed by the display name used in the result tables.
# Currently disabled (kept here for reference only): ABOD, Feature Bagging over
# LOF(n_neighbors=35), KNN, and Average KNN (method='mean').
classifiers = {}
classifiers['Cluster-based Local Outlier Factor (CBLOF)'] = cblof.CBLOF(
    contamination=outliers_fraction, check_estimator=False, random_state=1
)
classifiers['Histogram-base Outlier Detection (HBOS)'] = hbos.HBOS(
    contamination=outliers_fraction
)
classifiers['Isolation Forest'] = iforest.IForest(
    contamination=outliers_fraction, random_state=1
)
classifiers['Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles'] = inne.INNE(
    contamination=outliers_fraction, random_state=1
)

# Per-model metric lists, filled in by the training loop.
metrics = defaultdict(list)

In [None]:
# Acc of train: 0.41369
# F1_weighted of train: 0.40623
# Acc of test: 0.68055
# F1_weighted of test: 0.55322

In [None]:
# Optional quick run on a subsample (disabled):
# df_train = df_train.iloc[:1000]
# y_test = pd.concat([y_test[y_test == 0].head(100), y_test[y_test == 1].head(100)])
# df_test = df_test.loc[y_test.index]

# Fit every detector on the training features (unsupervised) and score it on the test split.
# Each model's metrics are stored as [accuracy, F1, AUC, FPR %, FNR %, FAR %].
for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(df_train)
    y_predict = clf.predict(df_test)            # hard 0/1 labels
    y_score = clf.decision_function(df_test)    # continuous outlier scores (higher = more anomalous)
    # The unused prediction on the whole training set was removed: it was never
    # read and is expensive for several detectors.

    # compute metrics
    # labels=[0, 1] pins the matrix to 2x2 so the unpacking below cannot fail when a
    # detector predicts a single class.
    # NOTE(review): assumes y_test is coded 0 = normal, 1 = attack -- confirm.
    cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    fpr = fp/(fp+tn)*100
    fnr = fn/(fn+tp)*100
    far = (fpr+fnr)/2
    # ROC/AUC is computed from the continuous scores; with hard labels the curve has a
    # single operating point and the "AUC" collapses to balanced accuracy.
    fpr_te, tpr_te, t_te = roc_curve(y_test, y_score)
    auc_value = auc(fpr_te, tpr_te)
    # Assign rather than append, so re-running this cell replaces a model's metrics
    # instead of growing the list (which breaks the evaluation cells).
    metrics[clf_name] = [
        accuracy_score(y_test, y_predict),
        f1_score(y_test, y_predict),
        auc_value,
        fpr,
        fnr,
        far,
    ]
    print(f"Training of {i} {clf_name} finished")
metrics

# Model Evaluation

In [None]:
#Model Evaluation
# Print the per-model metrics as a text table.
import prettytable
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model", "F1 Score", "AUC","FPR %","FNR %","FAR %"]
for model_name, model_metrics_list in metrics.items():
    # Each list holds [accuracy, F1, AUC, FPR, FNR, FAR]. The AUC is unpacked into
    # `auc_value`, not `auc`: rebinding `auc` would shadow sklearn.metrics.auc and
    # make a re-run of the training cell fail with "'float' object is not callable".
    # [-6:] takes the most recent run in case the list accumulated several runs.
    acc, f1, auc_value, fpr, fnr, far = model_metrics_list[-6:]
    x.add_row([model_name, "{0:.4f}".format(f1), "{0:.4f}".format(auc_value), "{:.2f}".format(fpr), "{:.2f}".format(fnr), "{:.2f}".format(far)])

print(x)

In [None]:
import csv

# Export the evaluation metrics (everything except accuracy) to a CSV file.
csv_file_path = "csvs/model_evaluation.csv"
table_data = [
    ["Model", "F1 Score", "AUC", "FPR %", "FNR %", "FAR %"]
]

for model_name, model_metrics_list in metrics.items():
    # Each list holds [accuracy, F1, AUC, FPR, FNR, FAR]; the last five values line
    # up with the header. Using [-5:] instead of [1:] keeps rows aligned even if the
    # list accumulated more than one run.
    table_data.append([model_name] + [round(value, 4) for value in model_metrics_list[-5:]])

# Create the output folder if it does not exist yet.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
with open(csv_file_path, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(table_data)

print(f"Data has been written to {csv_file_path}")