# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve, recall_score, auc, roc_auc_score, roc_curve, precision_score, f1_score
from mlxtend.plotting import plot_confusion_matrix
from scikitplot.metrics import plot_cumulative_gain, plot_precision_recall, plot_roc_curve

import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence three warning categories for every cell that runs after this one.
# NOTE(review): ConvergenceWarning is ignored as well, so a solver that stops
# before converging would go unnoticed — confirm this is intended.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
# Root folder of the dataset: one sub-folder per class, each holding JavaScript files.
# The location can be overridden with the OBFUSCATED_JS_DIR environment variable so the
# notebook is not tied to one machine; the original path remains the default.
data_dir = os.environ.get("OBFUSCATED_JS_DIR", "/mnt/hdd/Datasets/Obfuscated_JS/")

In [None]:
# Each sub-directory of data_dir is one class. Plain files (e.g. hidden OS metadata)
# are skipped because the loader below lists the contents of every entry, and the
# names are sorted so the loading order does not depend on the filesystem.
folders = sorted(
    entry
    for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)
folders

In [None]:
# Read every script of every class folder; the folder name is used as the label.
data = []
labels = []

for folder in folders:
    folder_path = os.path.join(data_dir, folder)
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            # Nested directories cannot be opened as text; skip them.
            continue
        # Decode explicitly as UTF-8: the platform default encoding varies between
        # machines, and undecodable bytes are replaced rather than aborting the load.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            js = f.read().replace("\n", "")
        data.append(js)
        labels.append(folder)

In [None]:
# One row per script: its text ("js") and the folder it was read from ("label").
df = pd.DataFrame(dict(js=data, label=labels))
df.head()

In [None]:
# Number of samples per label, drawn as a bar chart.
label_counts = df["label"].value_counts()
label_counts.plot.bar()

In [None]:
# Share of each label as a pie chart. `explode` needs one offset per slice, so it is
# derived from the number of labels instead of being fixed at two entries: the first
# slice stays in place and every other slice is pulled out by 0.1.
label_counts = df["label"].value_counts()
explode = [0 if position == 0 else 0.1 for position in range(len(label_counts))]
label_counts.plot(kind="pie", autopct="%.1f%%", shadow=True, explode=explode)

# Preprocess

In [None]:
def label_encoder(column):
    """Encode a label column as integers and report the class order.

    Fits a new ``LabelEncoder`` on ``column``, prints ``column.name`` together
    with the encoder's ``classes_``, and returns the transformed values.
    """
    encoder = LabelEncoder()
    encoder.fit(column)
    print(column.name, encoder.classes_)
    encoded = encoder.transform(column)
    return encoded

In [None]:
# Replace the folder-name labels with the integer codes returned by label_encoder.
# This overwrites the "label" column in place; the printed classes_ line shows
# which original name corresponds to which code.
df["label"] = label_encoder(df["label"])

In [None]:
# Display names used for reports, plots and the manual predictions below; the list
# is indexed by the encoded label value.
# NOTE(review): assumes code 0 is the non-obfuscated class and code 1 the obfuscated
# one — verify against the classes_ order printed by label_encoder.
class_names = ["not-obfuscated", "obfuscated"]

In [None]:
# Features are the raw script text; the target is the encoded label column.
X, y = df["js"], df["label"]

In [None]:
# Turn each script into a hashed feature vector built from n-grams of length 1 to 3.
# `hv` is kept because the manual test at the end of the notebook reuses it.
hv = HashingVectorizer(ngram_range=(1, 3))
X_vect = hv.fit_transform(X)

In [None]:
# Persist the vectorizer; the context manager guarantees the file is flushed and closed.
with open("hashing.pkl", "wb") as pkl_file:
    pickle.dump(hv, pkl_file)

In [None]:
# Re-weight the hashed counts with TF-IDF. The transformer is fitted here on every
# row of X_vect, i.e. on the whole dataset.
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_vect)

In [None]:
# Persist the TF-IDF transformer; the context manager guarantees the file is closed.
with open("tfidf.pkl", "wb") as pkl_file:
    pickle.dump(tfidf, pkl_file)

# Training

In [None]:
# Split the hashed counts first and learn the IDF weights from the training rows only,
# so that no statistics of the held-out rows leak into the features the models see.
# Same test_size / random_state as before, applied to the same number of rows.
X_train_vect, X_test_vect, y_train, y_test = train_test_split(
    X_vect, y, test_size=0.2, random_state=42
)

tfidf = TfidfTransformer().fit(X_train_vect)
X_train = tfidf.transform(X_train_vect)
X_test = tfidf.transform(X_test_vect)

# Save the transformer that matches the trained models (overwrites any earlier tfidf.pkl).
with open("tfidf.pkl", "wb") as pkl_file:
    pickle.dump(tfidf, pkl_file)

In [None]:
# Report the shape of each part of the split.
for part_name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{part_name}:", part.shape)

# Logistic Regression

In [None]:
# Fit logistic regression and record how long training takes.
# perf_counter is a monotonic clock meant for measuring intervals; time.time()
# follows the wall clock and can jump if the system time is adjusted.
logreg = LogisticRegression()
start = time.perf_counter()
logreg.fit(X_train, y_train)
end = time.perf_counter()
logreg_time = end - start
print("Logistic Regression Train Time:", logreg_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is closed.
with open("logreg.pkl", "wb") as pkl_file:
    pickle.dump(logreg, pkl_file)

In [None]:
# 3-fold cross-validation of a fresh, unfitted logistic regression on the training split.
logreg_scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=3,
)
print("Logistic Regression Cross-Validation Scores:", logreg_scores)

In [None]:
# Predictions on both splits, then accuracy on each to compare fit against generalisation.
logreg_pred_train, logreg_pred_test = logreg.predict(X_train), logreg.predict(X_test)

logreg_train_score = accuracy_score(y_train, logreg_pred_train)
logreg_test_score = accuracy_score(y_test, logreg_pred_test)

print("Logistic Regression Train Score:", logreg_train_score)
print("Logistic Regression Test Score:", logreg_test_score)

In [None]:
# Test-split metrics for the logistic-regression predictions.
logreg_precision_score = precision_score(y_true=y_test, y_pred=logreg_pred_test)
logreg_f1_score = f1_score(y_true=y_test, y_pred=logreg_pred_test)
logreg_recall_score = recall_score(y_true=y_test, y_pred=logreg_pred_test)
logreg_accuracy_score = accuracy_score(y_true=y_test, y_pred=logreg_pred_test)

for metric_name, metric_value in (
    ("Precision", logreg_precision_score),
    ("F1", logreg_f1_score),
    ("Recall", logreg_recall_score),
    ("Accuracy", logreg_accuracy_score),
):
    print(f"Logistic Regression {metric_name} Score:", metric_value)

In [None]:
# Per-class precision / recall / F1 table for the test split.
logreg_report = classification_report(
    y_true=y_test,
    y_pred=logreg_pred_test,
    target_names=class_names,
)
print(logreg_report)

In [None]:
# Confusion matrix of the test predictions, shown with absolute and normalised counts.
logreg_cm = confusion_matrix(y_true=y_test, y_pred=logreg_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=logreg_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
ax.set_title("Logistic Regression Confusion Matrix")
plt.show()

In [None]:
# Class probabilities for the test split; consumed by the curve plots that follow.
logreg_test_proba = logreg.predict_proba(X_test)

In [None]:
# Precision-recall curve from the predicted probabilities.
curve_ax = plot_precision_recall(y_test, logreg_test_proba)
curve_ax.set_title("Logistic Regression Precision-Recall Curve")
plt.show()

In [None]:
# ROC curve from the predicted probabilities.
curve_ax = plot_roc_curve(y_test, logreg_test_proba)
curve_ax.set_title("Logistic Regression ROC Curve")
plt.show()

In [None]:
# Cumulative gains chart from the predicted probabilities.
curve_ax = plot_cumulative_gain(y_test, logreg_test_proba)
curve_ax.set_title("Logistic Regression Cumulative Gains")
plt.show()

# Random Forest

In [None]:
# Fit a random forest and record how long training takes.
# random_state pins the forest's randomness so a re-run reproduces the same model
# (42 matches the seed already used for the train/test split); perf_counter is the
# monotonic clock intended for measuring intervals.
rf = RandomForestClassifier(random_state=42)
start = time.perf_counter()
rf.fit(X_train, y_train)
end = time.perf_counter()
rf_time = end - start
print("Random Forest Train Time:", rf_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is closed.
with open("rf.pkl", "wb") as pkl_file:
    pickle.dump(rf, pkl_file)

In [None]:
# 3-fold cross-validation of a fresh random forest on the training split.
# The estimator is seeded so the reported scores are reproducible across runs.
rf_scores = cross_val_score(RandomForestClassifier(random_state=42), X_train, y_train, cv=3)
print("Random Forest Cross-Validation Scores:", rf_scores)

In [None]:
# Predictions on both splits, then accuracy on each to compare fit against generalisation.
rf_pred_train, rf_pred_test = rf.predict(X_train), rf.predict(X_test)

rf_train_score = accuracy_score(y_train, rf_pred_train)
rf_test_score = accuracy_score(y_test, rf_pred_test)

print("Random Forest Train Score:", rf_train_score)
print("Random Forest Test Score:", rf_test_score)

In [None]:
# Test-split metrics for the random-forest predictions.
rf_precision_score = precision_score(y_true=y_test, y_pred=rf_pred_test)
rf_f1_score = f1_score(y_true=y_test, y_pred=rf_pred_test)
rf_recall_score = recall_score(y_true=y_test, y_pred=rf_pred_test)
rf_accuracy_score = accuracy_score(y_true=y_test, y_pred=rf_pred_test)

for metric_name, metric_value in (
    ("Precision", rf_precision_score),
    ("F1", rf_f1_score),
    ("Recall", rf_recall_score),
    ("Accuracy", rf_accuracy_score),
):
    print(f"Random Forest {metric_name} Score:", metric_value)

In [None]:
# Per-class precision / recall / F1 table for the test split.
rf_report = classification_report(
    y_true=y_test,
    y_pred=rf_pred_test,
    target_names=class_names,
)
print(rf_report)

In [None]:
# Confusion matrix of the test predictions, shown with absolute and normalised counts.
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=rf_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
ax.set_title("Random Forest Confusion Matrix")
plt.show()

In [None]:
# Class probabilities for the test split; consumed by the curve plots that follow.
rf_test_proba = rf.predict_proba(X_test)

In [None]:
# Precision-recall curve from the predicted probabilities.
curve_ax = plot_precision_recall(y_test, rf_test_proba)
curve_ax.set_title("Random Forest Precision-Recall Curve")
plt.show()

In [None]:
# ROC curve from the predicted probabilities.
curve_ax = plot_roc_curve(y_test, rf_test_proba)
curve_ax.set_title("Random Forest ROC Curve")
plt.show()

In [None]:
# Cumulative gains chart from the predicted probabilities.
curve_ax = plot_cumulative_gain(y_test, rf_test_proba)
curve_ax.set_title("Random Forest Cumulative Gains")
plt.show()

# Decision Tree

In [None]:
# Fit a decision tree and record how long training takes.
# random_state pins the tree's randomness so a re-run reproduces the same model
# (42 matches the seed already used for the train/test split); perf_counter is the
# monotonic clock intended for measuring intervals.
dt = DecisionTreeClassifier(random_state=42)
start = time.perf_counter()
dt.fit(X_train, y_train)
end = time.perf_counter()
dt_time = end - start
print("Decision Tree Train Time:", dt_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is closed.
with open("dt.pkl", "wb") as pkl_file:
    pickle.dump(dt, pkl_file)

In [None]:
# 3-fold cross-validation of a fresh decision tree on the training split.
# The estimator is seeded so the reported scores are reproducible across runs.
dt_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X_train, y_train, cv=3)
print("Decision Tree Cross-Validation Scores:", dt_scores)

In [None]:
# Predictions on both splits, then accuracy on each to compare fit against generalisation.
dt_pred_train, dt_pred_test = dt.predict(X_train), dt.predict(X_test)

dt_train_score = accuracy_score(y_train, dt_pred_train)
dt_test_score = accuracy_score(y_test, dt_pred_test)

print("Decision Tree Train Score:", dt_train_score)
print("Decision Tree Test Score:", dt_test_score)

In [None]:
# Test-split metrics for the decision-tree predictions.
dt_precision_score = precision_score(y_true=y_test, y_pred=dt_pred_test)
dt_f1_score = f1_score(y_true=y_test, y_pred=dt_pred_test)
dt_recall_score = recall_score(y_true=y_test, y_pred=dt_pred_test)
dt_accuracy_score = accuracy_score(y_true=y_test, y_pred=dt_pred_test)

for metric_name, metric_value in (
    ("Precision", dt_precision_score),
    ("F1", dt_f1_score),
    ("Recall", dt_recall_score),
    ("Accuracy", dt_accuracy_score),
):
    print(f"Decision Tree {metric_name} Score:", metric_value)

In [None]:
# Per-class precision / recall / F1 table for the test split.
dt_report = classification_report(
    y_true=y_test,
    y_pred=dt_pred_test,
    target_names=class_names,
)
print(dt_report)

In [None]:
# Confusion matrix of the test predictions, shown with absolute and normalised counts.
dt_cm = confusion_matrix(y_true=y_test, y_pred=dt_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=dt_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
ax.set_title("Decision Tree Confusion Matrix")
plt.show()

In [None]:
# Class probabilities for the test split; consumed by the curve plots that follow.
dt_test_proba = dt.predict_proba(X_test)

In [None]:
# Precision-recall curve from the predicted probabilities.
curve_ax = plot_precision_recall(y_test, dt_test_proba)
curve_ax.set_title("Decision Tree Precision-Recall Curve")
plt.show()

In [None]:
# ROC curve from the predicted probabilities.
curve_ax = plot_roc_curve(y_test, dt_test_proba)
curve_ax.set_title("Decision Tree ROC Curve")
plt.show()

In [None]:
# Cumulative gains chart from the predicted probabilities.
curve_ax = plot_cumulative_gain(y_test, dt_test_proba)
curve_ax.set_title("Decision Tree Cumulative Gains")
plt.show()

# XGB

In [None]:
# Fit the XGBoost classifier and record how long training takes.
# perf_counter is a monotonic clock meant for measuring intervals; time.time()
# follows the wall clock and can jump if the system time is adjusted.
xgb = XGBClassifier()
start = time.perf_counter()
xgb.fit(X_train, y_train)
end = time.perf_counter()
xgb_time = end - start
print("XGBClassifier Train Time:", xgb_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is closed.
with open("xgb.pkl", "wb") as pkl_file:
    pickle.dump(xgb, pkl_file)

In [None]:
# 3-fold cross-validation of a fresh, unfitted XGBoost classifier on the training split.
xgb_scores = cross_val_score(
    estimator=XGBClassifier(),
    X=X_train,
    y=y_train,
    cv=3,
)
print("XGBClassifier Cross-Validation Scores:", xgb_scores)

In [None]:
# Predictions on both splits, then accuracy on each to compare fit against generalisation.
xgb_pred_train, xgb_pred_test = xgb.predict(X_train), xgb.predict(X_test)

xgb_train_score = accuracy_score(y_train, xgb_pred_train)
xgb_test_score = accuracy_score(y_test, xgb_pred_test)

print("XGBClassifier Train Score:", xgb_train_score)
print("XGBClassifier Test Score:", xgb_test_score)

In [None]:
# Test-split metrics for the XGBoost predictions.
xgb_precision_score = precision_score(y_true=y_test, y_pred=xgb_pred_test)
xgb_f1_score = f1_score(y_true=y_test, y_pred=xgb_pred_test)
xgb_recall_score = recall_score(y_true=y_test, y_pred=xgb_pred_test)
xgb_accuracy_score = accuracy_score(y_true=y_test, y_pred=xgb_pred_test)

for metric_name, metric_value in (
    ("Precision", xgb_precision_score),
    ("F1", xgb_f1_score),
    ("Recall", xgb_recall_score),
    ("Accuracy", xgb_accuracy_score),
):
    print(f"XGBClassifier {metric_name} Score:", metric_value)

In [None]:
# Per-class precision / recall / F1 table for the test split.
xgb_report = classification_report(
    y_true=y_test,
    y_pred=xgb_pred_test,
    target_names=class_names,
)
print(xgb_report)

In [None]:
# Confusion matrix of the test predictions, shown with absolute and normalised counts.
xgb_cm = confusion_matrix(y_true=y_test, y_pred=xgb_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=xgb_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
ax.set_title("XGB Confusion Matrix")
plt.show()

In [None]:
# Class probabilities for the test split; consumed by the curve plots that follow.
xgb_test_proba = xgb.predict_proba(X_test)

In [None]:
# Precision-recall curve from the predicted probabilities.
curve_ax = plot_precision_recall(y_test, xgb_test_proba)
curve_ax.set_title("XGB Precision-Recall Curve")
plt.show()

In [None]:
# ROC curve from the predicted probabilities.
curve_ax = plot_roc_curve(y_test, xgb_test_proba)
curve_ax.set_title("XGB ROC Curve")
plt.show()

In [None]:
# Cumulative gains chart from the predicted probabilities.
curve_ax = plot_cumulative_gain(y_test, xgb_test_proba)
curve_ax.set_title("XGB Cumulative Gains")
plt.show()

In [None]:
# Short model names and their test accuracies, in matching order, for the comparison chart.
# NOTE: this rebinds `labels`, which earlier in the notebook held the per-file folder
# labels collected while loading the dataset.
labels = ["LR", "DT", "RF", "XGB"]
scores = [logreg_test_score, dt_test_score, rf_test_score, xgb_test_score]

In [None]:
def plot_model_plot(labels, scores):
    """Draw a bar chart of model scores on a new figure.

    ``labels`` supplies the x-axis categories and ``scores`` the bar heights;
    each bar is annotated with its value. The figure is not shown here — the
    caller is expected to display it.
    """
    plt.figure()
    bar_axes = sns.barplot(x=labels, y=scores)
    bar_axes.set(title="Trained Models Accuracy")
    # Write the numeric value on top of every bar.
    for bar_group in bar_axes.containers:
        bar_axes.bar_label(bar_group)

In [None]:
# Render the accuracy comparison of the four trained models.
plot_model_plot(labels=labels, scores=scores)
plt.show()

# Test

In [None]:
# Readable, single-line JavaScript sample used below as a manual check of the classifier.
normal_js = """document.addEventListener('DOMContentLoaded', function () { var checkButton = document.getElementById('checkButton');var contentInput = document.getElementById('content');var resultElement = document.getElementById('result');checkButton.addEventListener('click', function () {var content = contentInput.value;var data = { content: content }; fetch('http://localhost:5000/check-spam', {method: 'POST',headers: {'Content-Type': 'application/json'},body: JSON.stringify(data)}).then(response => response.json()).then(result => {if (result.spam) {resultElement.textContent = 'SPAM';} else {resultElement.textContent = 'NOT SPAM';}}).catch(error => {console.error('Error:', error);});});});"""

In [None]:
# Obfuscated JavaScript sample used below as a manual check of the classifier.
# NOTE(review): appears to be an obfuscated form of the readable sample (same string
# constants); the literal contains one embedded newline — confirm that is intended.
obfuscated_js = """var _0x2e798d=_0x6bac;function _0x6bac(_0x545aba,_0x4f5617){var _0x4af7cb=_0x4af7();return _0x6bac=function(_0x6bacb7,_0x2583a0){_0x6bacb7=_0x6bacb7-0x14d;var _0x5e6e39=_0x4af7cb[_0x6bacb7];return _0x5e6e39;},_0x6bac(_0x545aba,_0x4f5617);}function _0x4af7(){var _0x520823=['30524263MYtvJR','getElementById','content','1824sXZsMa','value','addEventListener','4817704xcwRmz','1714ruszmI','SPAM','DOMContentLoaded','then','1706079MjbDTe','NOT\x20SPAM','http://localhost:5000/check-spam','click','2464976OdVnyn','4008njBbAn','json','error','result','Error:','textContent','spam','74997dCjyST','380jLuyKF','stringify','POST','540qXTwmX','14YmYPED'];_0x4af7=function(){return _0x520823;};return _0x4af7();}(function(_0x187d2c,_0x497914){var _0x237a90=_0x6bac,_0x6fe22e=_0x187d2c();while(!![]){try{var _0x3c0b49=parseInt(_0x237a90(0x14d))/0x1*(parseInt(_0x237a90(0x151))/0x2)+parseInt(_0x237a90(0x155))/0x3+parseInt(_0x237a90(0x159))/0x4+-parseInt(_0x237a90(0x165))/0x5*(parseInt(_0x237a90(0x15a))/0x6)+-parseInt(_0x237a90(0x166))/0x7*(-parseInt(_0x237a90(0x150))/0x8)+-parseInt(_0x237a90(0x161))/0x9*(parseInt(_0x237a90(0x162))/0xa)+-parseInt(_0x237a90(0x167))/0xb;if(_0x3c0b49===_0x497914)break;else _0x6fe22e['push'](_0x6fe22e['shift']());}catch(_0x22766b){_0x6fe22e['push'](_0x6fe22e['shift']());}}}(_0x4af7,0xc0940),document['addEventListener'](_0x2e798d(0x153),function(){var _0xca7897=_0x2e798d,_0x1dfe1b=document[_0xca7897(0x168)]('checkButton'),_0x4fb26b=document[_0xca7897(0x168)](_0xca7897(0x169)),_0x8440fe=document[_0xca7897(0x168)](_0xca7897(0x15d));_0x1dfe1b[_0xca7897(0x14f)](_0xca7897(0x158),function(){var _0x419232=_0xca7897,_0x3325dc=_0x4fb26b[_0x419232(0x14e)],_0x7f54fc={'content':_0x3325dc};fetch(_0x419232(0x157),{'method':_0x419232(0x164),'headers':{'Content-Type':'application/json'},'body':JSON[_0x419232(0x163)](_0x7f54fc)})[_0x419232(0x154)](_0x12d223=>_0x12d223[_0x419232(0x15b)]())[_0x419232(0x154)](_0x29e918=>{var 
_0x707f05=_0x419232;_0x29e918[_0x707f05(0x160)]?_0x8440fe[_0x707f05(0x15f)]=_0x707f05(0x152):_0x8440fe[_0x707f05(0x15f)]=_0x707f05(0x156);})['catch'](_0x5d1f8d=>{var _0xc8b180=_0x419232;console[_0xc8b180(0x15c)](_0xc8b180(0x15e),_0x5d1f8d);});});}));"""

In [None]:
# Push the readable sample through the same hashing -> TF-IDF steps as the training
# data, then classify it with the XGBoost model.
test_normal = tfidf.transform(hv.transform([normal_js]))
result_array = xgb.predict(test_normal)
predicted_index = result_array[0]
print("Result: ", class_names[predicted_index])

In [None]:
# Push the obfuscated sample through the same hashing -> TF-IDF steps as the training
# data, then classify it with the XGBoost model.
test_obfuscated = tfidf.transform(hv.transform([obfuscated_js]))
result_array = xgb.predict(test_obfuscated)
predicted_index = result_array[0]
print("Result: ", class_names[predicted_index])