In [None]:
import concurrent
import concurrent.futures
import os
import pickle
import re
import statistics
import warnings

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex
import tqdm
from featgenerator.config import Config
from featgenerator.exif_feat import ExifFeatures
from featgenerator.featurizer import Featurizer
from featgenerator.floss_general_feat import FlossFeatures
from featgenerator.floss_regex import FlossRegexFeatures
from featgenerator.lief_features import LiefFeatures, get_features_from_function_lists
from featgenerator.malcat import MalcatFeatures
from featgenerator import feature_transformation
from featgenerator.util import ClusteringMetrics, DataProcessor, MinHashLSHForest
from itables import init_notebook_mode, show
from keras import models
from keras.layers import Dense, Input, Reshape
from keras.models import Model
from keras.optimizers import Adam
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, KMeans
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.metrics import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    auc,
    classification_report,
    davies_bouldin_score,
    roc_auc_score,
    roc_curve,
    silhouette_score,
    v_measure_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import (
    LabelEncoder,
    QuantileTransformer,
    label_binarize,
    normalize,
)
from featgenerator.util import Util
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from importlib import reload
from featgenerator import util
from featgenerator import util
import os

%matplotlib inline

In [None]:
# Configure PyTorch's CUDA allocator through PYTORCH_CUDA_ALLOC_CONF
# (max_split_size_mb set to 1024).
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:1024'

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Switch itables to interactive rendering for every DataFrame output.
# init_notebook_mode is already imported in the imports cell at the top.
init_notebook_mode(all_interactive=True)

# Re-import the local featgenerator modules so that edits made to them on disk
# are picked up without restarting the kernel.
util = reload(util)
feature_transformation = reload(feature_transformation)

## Before loading config make sure you have the right root_dir in the config file

In [None]:
# Project configuration object; it supplies the adversary mapping location.
conf = Config()

# Load the adversary mapping and expose its 'sha256' column under the
# additional name 'hash'.
adversary_dataset = pd.read_csv(conf.get_adversary_mapping()).assign(
    hash=lambda frame: frame['sha256']
)

In [None]:

# get_combined_features returns the joined frame plus a dictionary with one
# entry per feature family. Inspect the dictionary keys to see what is there:
# >>> joined_data_dict.keys()
# dict_keys(['floss_features', 'exif_features', 'malcat_features', 'lief_features', 'flossregex_features', 'exported_functions_features', 'configuration_version'])
# and index it to pull a single family, e.g. the exported function features:
# >>> exdf = joined_data_dict['exported_functions_features']

# Feature families requested for this run; only the general FLOSS features
# are switched off.
feature_selection = {
    'floss_features': False,
    'exif_features': True,
    'malcat_features': True,
    'lief_features': True,
    'flossregex_features': True,
    'exported_functions_features': True,
    'configuration_version': True,
}
joined_df, joined_data_dict = feature_transformation.get_combined_features(**feature_selection)

# Shorthand handle on the LIEF entry of the per-family dictionary.
lcf = joined_data_dict['lief_features']

In [None]:
# Process the raw strings dataset: the FLOSS file and root directory come from
# the config, and the hashes of joined_df are passed along as third argument.
embedding_df = Util().process_raw_strings_dataset(
    conf.get_root_dir(),
    conf.get_floss_file(),
    joined_df['hash'],
)

In [None]:
# Attach joined_df's hashes (cast to str) to the embedding frame. pandas aligns
# the values on the index, so this assumes both frames share the same index
# — TODO confirm.
embedding_df = embedding_df.assign(hash=joined_df['hash'].astype(str))

In [None]:
# Inner join on 'hash': keep only the rows whose hash occurs in both frames.
joined_inner = pd.merge(joined_df, embedding_df, on=["hash"], how="inner")

In [None]:
# Prepare and encode the features against the 'Campaign_Tag' target. The
# embedding frame is passed in but include_embedding is off for this run;
# num_epochs is fixed at 20.
(
    autoencoder,
    X_encoded,
    combined_features,
    all_features,
) = feature_transformation.prepare_and_encode_features(
    joined_df,
    embedding_df,
    adversary_dataset,
    target_column="Campaign_Tag",
    include_embedding=False,
    num_epochs=20,
)

In [None]:
# Candidate cluster counts for the sweep: every second value from 5 up to 59.
n_clusters = [*np.arange(start=5, stop=60, step=2)]

In [None]:
# Run the agglomerative search over the candidate cluster counts, handing it
# the hash / Campaign_Tag columns and the name of the label column.
modelling = util.Modelling()
(
    all_params,
    best_param,
    best_truth_matrix,
) = modelling.find_best_agglo(
    combined_features,
    n_clusters,
    all_features[['hash', 'Campaign_Tag']],
    'Campaign_Tag',
)

In [None]:
# Interactive table of everything the search returned in all_params.
show(pd.DataFrame(data=all_params))

In [None]:
# Final model: Ward-linkage agglomerative clustering with the Euclidean metric.
# NOTE(review): n_clusters=55 is hard-coded rather than read from best_param
# — confirm it matches the outcome of the search above.
agglomerative = AgglomerativeClustering(linkage="ward", metric="euclidean", n_clusters=55)

# Fit once and read the per-sample cluster assignment off the fitted estimator.
y_pred = agglomerative.fit(combined_features).labels_

In [None]:
# Ground-truth column the predicted clusters are evaluated against. Defined
# first so the metrics call below uses it instead of repeating the literal.
evaluation_column = 'Campaign_Tag'

# Attach the predicted cluster label to every sample. assign() returns a new
# frame, so all_features itself is left untouched.
merged_adversary_experiment_final = all_features.assign(labels=y_pred)

# Independent copy of the labelled frame under its short name; it is taken
# before the metrics call so nothing done there can affect it.
merg_res = merged_adversary_experiment_final.copy()

clustering_metrics = util.ClusteringMetrics()
truth_matrix, precision, recall = clustering_metrics.metrics_using_labels(
    merged_adversary_experiment_final,
    evaluation_column=evaluation_column,
)

# Missing entries become the string '0' (a string, exactly as before).
# Rebinding instead of inplace=True keeps the cell free of in-place mutation.
truth_matrix = truth_matrix.fillna('0')

In [None]:
# Display the truth matrix as this cell's output.
truth_matrix

In [None]:
# Rows of the truth matrix whose Campaign_Tag equals 'UNC4990Jan2024'.
truth_matrix.loc[truth_matrix['Campaign_Tag'].eq('UNC4990Jan2024')]