In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    recall_score,
    f1_score,
    precision_score,
)

In [3]:
# Load the dataset
file_path = "../data/la_morgia_data/features_15S.csv.gz"
data = pd.read_csv(file_path, compression="gzip")


# Select the specified features and the target variable
features = [
    "std_rush_order",
    "avg_rush_order",
    "std_trades",
    "std_volume",
    "avg_volume",
    "std_price",
    "avg_price",
    "avg_price_max",
]
X = data[features]
y = data["gt"]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
value_counts = y_train.value_counts()
print(value_counts)
print(
    f"Fraction of Anomalies: {value_counts[1] / (value_counts.iloc[1]+value_counts.iloc[0])}"
)

gt
0    467032
1       251
Name: count, dtype: int64
Fraction of Anomalies: 0.0005371477241842737


# Gaussian Naive Bayes

In [5]:
# Initialize the Gaussian Naive Bayes classifier and train the model on the training data
model = GaussianNB()
model.fit(X_train, y_train)

# Predict the labels for the testing set
y_pred = model.predict(X_test)


# Print the results
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1_Macro: ", f1_score(y_test, y_pred, average="macro"))
print("F1_Micro: ", f1_score(y_test, y_pred, average="micro"))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    116755
           1       0.06      0.95      0.11        66

    accuracy                           0.99    116821
   macro avg       0.53      0.97      0.55    116821
weighted avg       1.00      0.99      0.99    116821

Confusion Matrix:
 [[115700   1055]
 [     3     63]]
Accuracy:  0.9909434091473279
Precision:  0.05635062611806798
Recall:  0.9545454545454546
F1_Macro:  0.5509337795516912
F1_Micro:  0.990943409147328
