In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d.axes3d import Axes3D

import warnings
warnings.filterwarnings("ignore")

In [None]:
import os

import pymongo

# The connection string is read from the MONGODB_URI environment variable so
# real credentials do not have to live in the notebook. When the variable is
# unset, the local workshop default is used.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://mongodb:mongodb@localhost:27017")
client = pymongo.MongoClient(MONGODB_URI)

# Print every database the server reports, as a connectivity check.
for db in client.list_databases():
    print(db)

In [None]:
# Handle to the workshop database; every later query goes through `mydb`.
mydb = client["workshop_db"]

print("List of collections\n--------------------")
for collection_name in mydb.list_collection_names():
    print(collection_name)

In [None]:
# Select documents that have a non-empty `fields.input`, projecting only that
# field. `$exists` takes a boolean, so a real `True` is passed instead of the
# string "true".
# NOTE(review): `$where` is evaluated as server-side JavaScript; confirm it is
# enabled on the target deployment.
query = {
    "fields.input": {"$exists": True},
    "$where": "this.fields.input.length > 0",
}
projection = {"fields.input": 1}

# Keep the first element of each document's `fields.input`.
command_lst = [doc["fields"]["input"][0] for doc in mydb.ds.find(query, projection)]

In [None]:
# Show the first ten extracted commands as a quick sanity check.
command_lst[:10]

In [None]:
# De-duplicate the commands. Sorting fixes the row order: iterating a bare set
# of strings can change order between kernel restarts, which would reorder
# every downstream matrix and plot label.
command_lst = sorted(set(command_lst))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A token is a run of word characters, '-' and '/', so flags and paths stay in
# one piece. The quantifier is '+' rather than '*': '*' also matches the empty
# string, which would add a meaningless '' token to the vocabulary.
vectorizer = TfidfVectorizer(token_pattern=r"[-/\w]+")
tf_model = vectorizer.fit_transform(command_lst)

# Dense frame: one row per command, one column per token.
tf_model_df = pd.DataFrame(tf_model.toarray(), columns=vectorizer.get_feature_names_out())
tf_model_df.head(10)

In [None]:
# (rows, columns) of the dense TF-IDF frame.
tf_model_df.shape

In [None]:
# KMeans is already imported in the notebook's first cell.
# `algorithm='auto'` is not passed: that value was deprecated in scikit-learn
# 1.1 and is rejected by later releases, while the default works on every
# version. The other removed arguments were the library defaults.
mod = KMeans(n_clusters=2, n_init=10, random_state=123)

# fit_transform returns each sample's distance to every cluster centre.
res = mod.fit_transform(tf_model_df)

fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(res[:, 0], res[:, 1])
ax.set_title("Unsupervised Clustering")
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward-linkage hierarchy over the dense TF-IDF rows.
linked = linkage(tf_model_df, 'ward')

fig, ax = plt.subplots(figsize=(15, 15))
dendrogram(
    linked,
    p=4,
    truncate_mode="level",  # only the top levels of the tree are drawn
    orientation='left',
    labels=command_lst,
    distance_sort='descending',
    show_leaf_counts=True,
    ax=ax,
)

# Save to disk before showing the figure.
resolution_value = 200
fig.savefig("myImage.png", format="png", dpi=resolution_value)
plt.show()

In [None]:
from wordcloud import WordCloud

# Join all commands into one space-separated text and build the cloud from it.
all_commands_text = " ".join(command_lst)
word_cloud = WordCloud(background_color='white', collocations=False).generate(all_commands_text)

In [None]:
# Render the word cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(word_cloud, interpolation='bilinear')
ax.axis("off")
plt.show()

# Using SLP

In [None]:
from slp import ShellTokenizer, ShellEncoder

In [None]:
# Tokenizer from the slp package, created with verbose output turned off.
t = ShellTokenizer(verbose=False)

In [None]:
# Delete every double quote and '#' character from each command.
_quote_hash_deleter = str.maketrans("", "", '"#')
command_lst = [command.translate(_quote_hash_deleter) for command in command_lst]

In [None]:
# Replace whatever follows `hive-passwd ` with the placeholder PASS, so that
# commands differing only in that value become identical strings.
# NOTE(review): the range `A-z` also covers the ASCII characters between 'Z'
# and 'a' ([ \ ] ^ _ `), and the match stops at the first character outside
# the class. Confirm this is the intended masking.
_hive_passwd_re = re.compile(r'hive-passwd [0-9A-z]*')
command_lst = [_hive_passwd_re.sub('hive-passwd PASS', command) for command in command_lst]

In [None]:
# De-duplicate again, since the clean-up steps can make commands identical.
# Sorting fixes the row order: iterating a bare set of strings can change
# order between kernel restarts, which would reorder the encoded matrices and
# the dendrogram labels.
command_lst = sorted(set(command_lst))

In [None]:
# Tokenize the cleaned, de-duplicated commands. Both return values are passed
# to ShellEncoder in the next cell.
# NOTE(review): presumably the tokenized corpus and per-token counts; confirm
# against the slp documentation.
X_corpus, X_counter = t.tokenize(command_lst)

In [None]:
# Build the encoder from the tokenizer output, with verbose output turned off.
# NOTE(review): top_tokens=100 presumably limits the vocabulary to the 100 most
# frequent tokens; confirm against the slp documentation.
encoder = ShellEncoder(X_corpus, X_counter, top_tokens=100, verbose=False)

In [None]:
# Three encodings of the same corpus, keyed by encoding name.
X = {
    "tfidf": encoder.tfidf(),
    "one-hot": encoder.onehot(),
    "labels": encoder.labels(pad_width=100),
}

# Print each encoding's shape. A plain loop replaces the list comprehension
# that was used only for its print side effect.
for encoding in X.values():
    print(encoding.shape)

In [None]:
# Initialize KMeans
n_clusters = 3  # Replace with the number of clusters you want
# `algorithm='auto'` is not passed: that value was deprecated in scikit-learn
# 1.1 and is rejected by later releases. random_state and n_init are set so
# that reruns give the same clusters; the values match the first KMeans cell.
mod = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=123)

# Fit and transform
# fit_transform returns the distances to the cluster centers
res = mod.fit_transform(X["tfidf"])

# Plotting: distance to the first centre against distance to the second,
# coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(res[:, 0], res[:, 1], c=mod.labels_, marker='o', s=75)
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward-linkage hierarchy over the SLP TF-IDF encoding, densified because the
# encoding is stored in a form that exposes .toarray().
linked = linkage(X['tfidf'].toarray(), 'ward')

# Tall figure so each command gets its own leaf label. This dendrogram is only
# displayed, not saved, so the commented-out savefig and its unused dpi
# variable were removed.
fig, ax = plt.subplots(figsize=(15, 30))
dendrogram(linked,
           orientation='left',
           labels=command_lst,
           distance_sort='descending',
           show_leaf_counts=True,
           ax=ax)
plt.show()

In [None]:
# Initialize KMeans
n_clusters = 3  # Replace with the number of clusters you want
# `algorithm='auto'` is not passed: that value was deprecated in scikit-learn
# 1.1 and is rejected by later releases. random_state and n_init are set so
# that reruns give the same clusters; the values match the first KMeans cell.
mod = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=123)

# Fit and transform
# fit_transform returns the distances to the cluster centers
res = mod.fit_transform(X["one-hot"])

# Plotting: distance to the first centre against distance to the second,
# coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(res[:, 0], res[:, 1], c=mod.labels_, marker='o', s=75)
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward-linkage hierarchy over the densified one-hot encoding.
onehot_dense = pd.DataFrame(X['one-hot'].toarray())
linked = linkage(onehot_dense, 'ward')

# Tall figure so each command gets its own leaf label.
fig, ax = plt.subplots(figsize=(15, 30))
dendrogram(
    linked,
    labels=command_lst,
    orientation='left',
    distance_sort='descending',
    show_leaf_counts=True,
    ax=ax,
)

# Save to disk before showing the figure.
resolution_value = 500
fig.savefig("myImage2.png", format="png", dpi=resolution_value)
plt.show()

In [None]:
# Initialize KMeans
n_clusters = 3  # Replace with the number of clusters you want
# `algorithm='auto'` is not passed: that value was deprecated in scikit-learn
# 1.1 and is rejected by later releases. random_state and n_init are set so
# that reruns give the same clusters; the values match the first KMeans cell.
mod = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=123)

# Fit and transform
# fit_transform returns the distances to the cluster centers
res = mod.fit_transform(X["labels"])

# Plotting: distance to the first centre against distance to the second,
# coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(res[:, 0], res[:, 1], c=mod.labels_, marker='o', s=75)
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward-linkage hierarchy over the densified label encoding.
# NOTE(review): .toarray() assumes encoder.labels() returns a sparse matrix
# like the other encodings; confirm against the slp documentation.
labels_dense = pd.DataFrame(X['labels'].toarray())
linked = linkage(labels_dense, 'ward')

# Tall figure so each command gets its own leaf label.
fig, ax = plt.subplots(figsize=(15, 30))
dendrogram(
    linked,
    labels=command_lst,
    orientation='left',
    distance_sort='descending',
    show_leaf_counts=True,
    ax=ax,
)

# Save to disk before showing the figure.
resolution_value = 500
fig.savefig("myImage3.png", format="png", dpi=resolution_value)
plt.show()