# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import time
import pickle
import missingno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, f1_score, 
    precision_recall_curve, precision_score, recall_score, roc_auc_score
)
from mlxtend.plotting import plot_confusion_matrix
from scikitplot.metrics import plot_precision_recall, plot_roc_curve

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
# Location of the dataset CSV on the local machine.
DATASET_PATH = "/mnt/hdd/Datasets/HybridAppsDataset.csv"

# Read the whole CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

In [None]:
# Remove the two columns that are not used in the rest of the notebook,
# modifying the frame in place.
df.drop(columns=["Unnamed: 0", "app_hash"], inplace=True)

In [None]:
# Preview the first rows again after the column drop.
df.head()

In [None]:
# Show the (rows, columns) dimensions of the frame.
df.shape

In [None]:
# Visualise per-column completeness with missingno's bar chart.
missingno.bar(df, color="red")

In [None]:
# Print the column dtypes and non-null counts.
df.info()

# Exploratory Data Analysis (EDA)

In [None]:
def plot_count_by_label(data, column, hue="label"):
    """Draw a count plot of ``column`` split by ``hue``, with a count on each bar.

    Args:
        data: DataFrame that contains both ``column`` and ``hue``.
        column: Name of the feature shown on the x-axis; also used as the title.
        hue: Name of the column used to split the bars (defaults to "label").
    """
    plt.figure()
    ax = sns.countplot(data=data, x=column, hue=hue)
    # One container per hue level; annotate every bar with its count.
    for container in ax.containers:
        ax.bar_label(container)
    plt.title(column)
    plt.show()


# Features plotted against the label, in display order.
COUNT_PLOT_FEATURES = [
    "webview_tab",
    "js_enabled",
    "js_inf_defined",
    "acc_sys_call",
    "obf_js_permit",
    "inf_droid_Code_obf",
    "out_url",
    "gsafe_brow",
    "https",
    "js_input_val",
    "web_redirect",
]

for column in COUNT_PLOT_FEATURES:
    plot_count_by_label(df, column)

In [None]:
# Histogram (with KDE) of js_inf_len, with its mean and median marked.
js_inf_len = df["js_inf_len"]
mean, median = js_inf_len.mean(), js_inf_len.median()

plt.figure(figsize=(14, 5))
sns.histplot(data=df, x="js_inf_len", kde=True)

# Vertical reference lines, drawn in legend order: mean first, then median.
for position, colour, line_style, legend_name in (
    (mean, "r", "--", "Mean"),
    (median, "g", "-", "Median"),
):
    plt.axvline(position, color=colour, linestyle=line_style, label=legend_name)

plt.title("js_inf_len")
plt.xlabel("js_inf_len")
plt.ylabel("count")
plt.legend()
plt.show()

# Preprocess

In [None]:
# Preview the raw values before any encoding is applied.
df.head()

In [None]:
# Turn every boolean column into 0/1 integers (True -> 1, False -> 0).
# df.dtypes is a snapshot, so assigning columns inside the loop is safe.
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == np.bool_:
        df[column_name] = df[column_name].astype("int64")

In [None]:
# Preview the frame after the boolean columns were mapped to 0/1.
df.head()

In [None]:
def label_encoder(column):
    """Encode a column's values as integer codes and print the class order.

    Args:
        column: Series to encode; its ``name`` is printed next to the classes.

    Returns:
        The integer codes produced by a ``LabelEncoder`` fitted on ``column``.
    """
    encoder = LabelEncoder()
    encoder.fit(column)
    # Printing classes_ shows which original value each integer code stands for.
    print(column.name, encoder.classes_)
    encoded = encoder.transform(column)
    return encoded

In [None]:
# Replace the webview_tab values with integer codes; the class order is printed.
df["webview_tab"] = label_encoder(df["webview_tab"])

In [None]:
# Replace the target labels with integer codes; the class order is printed.
df["label"] = label_encoder(df["label"])

In [None]:
# Display names used for the encoded label values in reports and plots.
# NOTE(review): assumes the encoder mapped benign -> 0 and malicious -> 1;
# confirm against the classes printed when "label" was encoded.
class_names = ["benign", "malicious"]

In [None]:
# Separate the target vector from the feature matrix.
y = df["label"]
X = df.drop(columns="label")

In [None]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
ss = StandardScaler()
X_scaled = ss.fit_transform(X_resampled)

In [None]:
pickle.dump(ss, open("ss.pkl", "wb"))

# Training

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Logistic Regression

In [None]:
logreg = LogisticRegression()
start = time.time()
logreg.fit(X_train, y_train)
end = time.time()
logreg_time = end - start
print("Logistic Regression Train Time:", logreg_time)

In [None]:
pickle.dump(logreg, open("logreg.pkl", "wb"))

In [None]:
logreg_scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=3)
print("Logistic Regression Cross-Validation Scores:", logreg_scores)

In [None]:
logreg_pred_train = logreg.predict(X_train)
logreg_pred_test = logreg.predict(X_test)
logreg_test_proba = logreg.predict_proba(X_test)

logreg_train_score = accuracy_score(logreg_pred_train, y_train)
logreg_test_score = accuracy_score(logreg_pred_test, y_test)
print("Logistic Regression Train Score:", logreg_train_score)
print("Logistic Regression Test Score:", logreg_test_score)

In [None]:
logreg_precision_score = precision_score(y_test, logreg_pred_test)
logreg_f1_score = f1_score(y_test, logreg_pred_test)
logreg_recall_score = recall_score(y_test, logreg_pred_test)
logreg_accuracy_score = accuracy_score(y_test, logreg_pred_test)

print("Logistic Regression Precision Score:", logreg_precision_score)
print("Logistic Regression F1 Score:", logreg_f1_score)
print("Logistic Regression Recall Score:", logreg_recall_score)
print("Logistic Regression Accuracy Score:", logreg_accuracy_score)

In [None]:
print(classification_report(y_test, logreg_pred_test, target_names=class_names))

In [None]:
logreg_cm = confusion_matrix(y_test, logreg_pred_test)
fig, ax = plot_confusion_matrix(conf_mat=logreg_cm, show_absolute=True, show_normed=True, colorbar=True, class_names=class_names)
plt.title("Logistic Regression Confusion Matrix")
plt.show()

In [None]:
plot_precision_recall(y_test, logreg_test_proba)
plt.title("Logistic Regression Precision-Recall Curve")
plt.show()

In [None]:
plot_roc_curve(y_test, logreg_test_proba)
plt.title("Logistic Regression ROC Curve")
plt.show()

# Random Forest

In [None]:
rf = RandomForestClassifier()
start = time.time()
rf.fit(X_train, y_train)
end = time.time()
rf_time = end - start
print("Random Forest Train Time:", rf_time)

In [None]:
pickle.dump(rf, open("rf.pkl", "wb"))

In [None]:
rf_scores = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=3)
print("Random Forest Cross-Validation Scores:", rf_scores)

In [None]:
rf_pred_train = rf.predict(X_train)
rf_pred_test = rf.predict(X_test)
rf_test_proba = rf.predict_proba(X_test)

rf_train_score = accuracy_score(rf_pred_train, y_train)
rf_test_score = accuracy_score(rf_pred_test, y_test)
print("Random Forest Train Score:", rf_train_score)
print("Random Forest Test Score:", rf_test_score)

In [None]:
rf_precision_score = precision_score(y_test, rf_pred_test)
rf_f1_score = f1_score(y_test, rf_pred_test)
rf_recall_score = recall_score(y_test, rf_pred_test)
rf_accuracy_score = accuracy_score(y_test, rf_pred_test)

print("Random Forest Precision Score:", rf_precision_score)
print("Random Forest F1 Score:", rf_f1_score)
print("Random Forest Recall Score:", rf_recall_score)
print("Random Forest Accuracy Score:", rf_accuracy_score)

In [None]:
print(classification_report(y_test, rf_pred_test, target_names=class_names))

In [None]:
rf_cm = confusion_matrix(y_test, rf_pred_test)
fig, ax = plot_confusion_matrix(conf_mat=rf_cm, show_absolute=True, show_normed=True, colorbar=True, class_names=class_names)
plt.title("Random Forest Confusion Matrix")
plt.show()

In [None]:
plot_precision_recall(y_test, rf_test_proba)
plt.title("Random Forest Precision-Recall Curve")
plt.show()

In [None]:
plot_roc_curve(y_test, rf_test_proba)
plt.title("Random Forest ROC Curve")
plt.show()

# AdaBoost 

In [None]:
ada = AdaBoostClassifier()
start = time.time()
ada.fit(X_train, y_train)
end = time.time()
ada_time = end - start
print("AdaBoost Train Time:", ada_time)

In [None]:
pickle.dump(ada, open("ada.pkl", "wb"))

In [None]:
ada_scores = cross_val_score(AdaBoostClassifier(), X_train, y_train, cv=3)
print("AdaBoost Cross-Validation Scores:", ada_scores)

In [None]:
ada_pred_train = ada.predict(X_train)
ada_pred_test = ada.predict(X_test)
ada_test_proba = ada.predict_proba(X_test)

ada_train_score = accuracy_score(ada_pred_train, y_train)
ada_test_score = accuracy_score(ada_pred_test, y_test)
print("AdaBoost Train Score:", ada_train_score)
print("AdaBoost Test Score:", ada_test_score)

In [None]:
ada_precision_score = precision_score(y_test, ada_pred_test)
ada_f1_score = f1_score(y_test, ada_pred_test)
ada_recall_score = recall_score(y_test, ada_pred_test)
ada_accuracy_score = accuracy_score(y_test, ada_pred_test)

print("AdaBoost Precision Score:", ada_precision_score)
print("AdaBoost F1 Score:", ada_f1_score)
print("AdaBoost Recall Score:", ada_recall_score)
print("AdaBoost Accuracy Score:", ada_accuracy_score)

In [None]:
print(classification_report(y_test, ada_pred_test, target_names=class_names))

In [None]:
ada_cm = confusion_matrix(y_test, ada_pred_test)
fig, ax = plot_confusion_matrix(conf_mat=ada_cm, show_absolute=True, show_normed=True, colorbar=True, class_names=class_names)
plt.title("AdaBoost Confusion Matrix")
plt.show()

In [None]:
plot_precision_recall(y_test, ada_test_proba)
plt.title("AdaBoost Precision-Recall Curve")
plt.show()

In [None]:
plot_roc_curve(y_test, ada_test_proba)
plt.title("AdaBoost ROC Curve")
plt.show()

# Gradient Boosting

In [None]:
gb = GradientBoostingClassifier()
start = time.time()
gb.fit(X_train, y_train)
end = time.time()
gb_time = end - start
print("Gradient Boosting Train Time:", gb_time)

In [None]:
pickle.dump(gb, open("gb.pkl", "wb"))

In [None]:
gb_scores = cross_val_score(GradientBoostingClassifier(), X_train, y_train, cv=3)
print("Gradient Boosting Cross-Validation Scores:", gb_scores)

In [None]:
gb_pred_train = gb.predict(X_train)
gb_pred_test = gb.predict(X_test)
gb_test_proba = gb.predict_proba(X_test)

gb_train_score = accuracy_score(gb_pred_train, y_train)
gb_test_score = accuracy_score(gb_pred_test, y_test)
print("Gradient Boosting Train Score:", gb_train_score)
print("Gradient Boosting Test Score:", gb_test_score)

In [None]:
gb_precision_score = precision_score(y_test, gb_pred_test)
gb_f1_score = f1_score(y_test, gb_pred_test)
gb_recall_score = recall_score(y_test, gb_pred_test)
gb_accuracy_score = accuracy_score(y_test, gb_pred_test)

print("Gradient Boosting Precision Score:", gb_precision_score)
print("Gradient Boosting F1 Score:", gb_f1_score)
print("Gradient Boosting Recall Score:", gb_recall_score)
print("Gradient Boosting Accuracy Score:", gb_accuracy_score)

In [None]:
print(classification_report(y_test, gb_pred_test, target_names=class_names))

In [None]:
gb_cm = confusion_matrix(y_test, gb_pred_test)
fig, ax = plot_confusion_matrix(conf_mat=gb_cm, show_absolute=True, show_normed=True, colorbar=True, class_names=class_names)
plt.title("Gradient Boosting Confusion Matrix")
plt.show()

In [None]:
plot_precision_recall(y_test, gb_test_proba)
plt.title("Gradient Boosting Precision-Recall Curve")
plt.show()

In [None]:
plot_roc_curve(y_test, gb_test_proba)
plt.title("Gradient Boosting ROC Curve")
plt.show()

In [None]:
# Test accuracy per model, keyed by the short label shown on the chart.
# Dict insertion order fixes the left-to-right order of the bars.
model_test_scores = {
    "LR": logreg_test_score,
    "ADA": ada_test_score,
    "RF": rf_test_score,
    "GB": gb_test_score,
}
labels = list(model_test_scores)
scores = list(model_test_scores.values())

In [None]:
def plot_model_plot(labels, scores):
    """Draw a bar chart of model accuracies on a new figure.

    Args:
        labels: Model names placed on the x-axis.
        scores: Accuracy values, in the same order as ``labels``.

    The figure is created and annotated but not shown; the caller is
    expected to call ``plt.show()``.
    """
    plt.figure()
    axes = sns.barplot(x=labels, y=scores)
    axes.set_title("Trained Models Accuracy")
    # Write each bar's value above it.
    for bar_group in axes.containers:
        axes.bar_label(bar_group)

In [None]:
# Draw the accuracy comparison chart for the trained models and display it.
plot_model_plot(labels, scores)
plt.show()