In [None]:
# Select the inline matplotlib backend so figures are drawn in the notebook output.
%matplotlib inline


In [None]:
import concurrent
import concurrent.futures
import pickle
import re
import statistics
import warnings

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex
import tqdm
from itables import init_notebook_mode, show
from keras import models
from keras.layers import Dense, Input, Reshape
from keras.models import Model
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering,
                             KMeans)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                             auc, classification_report, davies_bouldin_score,
                             roc_auc_score, roc_curve, silhouette_score,
                             v_measure_score)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import (LabelEncoder, QuantileTransformer,
                                   label_binarize, normalize)

from featgenerator.config import Config
from featgenerator.exif_feat import ExifFeatures
from featgenerator.featurizer import Featurizer
from featgenerator.floss_general_feat import FlossFeatures
from featgenerator.floss_regex import FlossRegexFeatures
from featgenerator.lief_features import (LiefFeatures,
                                         get_features_from_function_lists)
from featgenerator.malcat import MalcatFeatures
from featgenerator.util import ClusteringMetrics, MinHashLSHForest
from featgenerator import feature_transformation


In [None]:
from featgenerator import doc_features
from featgenerator.util import ClusteringMetrics

In [None]:
from itables import init_notebook_mode

# Activate itables for this notebook; all_interactive=True asks it to render
# every displayed DataFrame as an interactive table.
init_notebook_mode(all_interactive=True)

In [None]:
# Re-import featgenerator.util so edits to the module on disk are picked up
# without restarting the kernel.
from importlib import reload
from featgenerator import util

# Rebind `util` to the freshly reloaded module. Names that were imported
# earlier via `from featgenerator.util import ...` are NOT refreshed by this;
# access reloaded code through `util.<name>`.
util = reload(util)



In [None]:
# Project configuration object; it provides the path of the adversary mapping CSV.
conf = Config()

# Read the mapping and duplicate its 'sha256' column under the name 'hash'.
adversary_dataset = pd.read_csv(conf.get_adversary_mapping()).assign(
    hash=lambda mapping: mapping['sha256'].copy()
)

In [None]:

# Build the combined feature table. The call returns a DataFrame-like object
# and a second value, unpacked here as joined_data_dict.
joined_df, joined_data_dict = feature_transformation.get_combined_features(
    # switches set to True for this run
    exif_features=True,
    malcat_features=True,
    flossregex_features=True,
    document_features=True,
    # switches set to False for this run
    floss_features=False,
    lief_features=False,
    exported_functions_features=False,
    configuration_version=False,
)

In [None]:
# Process the raw strings dataset for the hashes present in joined_df.
# Inputs: the configured root directory, the configured FLOSS file, and the hash column.
embedding_df = util.Util().process_raw_strings_dataset(
    conf.get_root_dir(),
    conf.get_floss_file(),
    joined_df['hash'],
)

In [None]:
# Attach the hashes (as strings) to the embedding frame. astype() already
# returns a new Series, so no additional copy is required.
# NOTE(review): this assignment aligns on the index, not on row position —
# confirm embedding_df and joined_df share the same index and row order.
embedding_df['hash'] = joined_df['hash'].astype(str)

In [None]:
# Inner join on 'hash': keep only the rows present in both frames.
joined_inner = pd.merge(joined_df, embedding_df, on="hash", how="inner")

In [None]:
# Prepare and encode the features with 'Campaign_Tag' as the target column,
# with the string embedding excluded and 20 epochs requested.
(
    autoencoder,
    X_encoded,
    combined_features,
    all_features,
) = feature_transformation.prepare_and_encode_features(
    joined_df,
    embedding_df,
    adversary_dataset,
    target_column="Campaign_Tag",
    include_embedding=False,
    num_epochs=20,
)

In [None]:
# Candidate cluster counts for the sweep: 5, 15, 25, 35, 45, 55.
# The elements stay numpy integers, as produced by np.arange.
n_clusters = [count for count in np.arange(5, 60, 10)]

In [None]:
# Sweep agglomerative clustering over the candidate cluster counts, passing
# the hash / Campaign_Tag columns and the name of the label column.
modelling = util.Modelling()
all_params, best_param, best_truth_matrix = modelling.find_best_agglo(
    combined_features,
    n_clusters,
    all_features[['hash', 'Campaign_Tag']],
    'Campaign_Tag',
)

In [None]:
# Tabulate everything the sweep returned in all_params and display it with itables.
show(pd.DataFrame(data=all_params))

In [None]:
# Final clustering: Ward linkage on euclidean distances with 25 clusters.
# NOTE(review): 25 is hard-coded here rather than read from best_param —
# confirm it matches the outcome of the sweep.
agglomerative = AgglomerativeClustering(
    n_clusters=25,
    linkage="ward",
    metric="euclidean",
)
# fit() returns the estimator itself; labels_ holds one cluster id per row.
y_pred = agglomerative.fit(combined_features).labels_

In [None]:
# Name of the ground-truth column used for evaluation. Declared first so the
# metrics call below uses it instead of repeating the literal.
evaluation_column = 'Campaign_Tag'

# Feature table with the predicted cluster id attached as 'labels'.
merg_res = all_features.copy()
merg_res['labels'] = y_pred

# Independent copy handed to the metrics helper, so merg_res stays untouched
# whatever the helper does with its argument.
merged_adversary_experiment_final = merg_res.copy()

clustering_metrics = util.ClusteringMetrics()
truth_matrix, precision, recall = clustering_metrics.metrics_using_labels(
    merged_adversary_experiment_final,
    evaluation_column=evaluation_column,
)

# Fill missing entries with the string '0' (a string, not the number 0).
# Rebinding instead of inplace=True keeps the mutation explicit.
truth_matrix = truth_matrix.fillna('0')

In [None]:
# Look up a single sample by its hash; transposed so the columns read as rows.
all_features.loc[
    all_features['hash'].eq('2046bb14d9f63612a6d19d5224727f3607be4e8f8ad13e9efe34620fda2d9d99')
].T

In [None]:
# Rows of the truth matrix whose Campaign_Tag is 'C0006'.
truth_matrix.loc[truth_matrix['Campaign_Tag'].eq('C0006')]

In [None]:
# All samples whose Campaign_Tag is 'StarBlizzardJan2024'.
all_features.loc[all_features['Campaign_Tag'].eq('StarBlizzardJan2024')]

In [None]:
# Copy of the feature table with the predicted cluster id added as 'labels'.
merg_res = all_features.assign(labels=y_pred)


In [None]:
# Display the feature table together with its 'labels' column.
merg_res