# Real and synthetic data (raw): no class balancing or additional data handling

### Import libraries and read data files

In [1]:
from ctgan import CTGAN

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Input files for the current scenario; edit both paths to run another scenario.
synthetic_csv_path = "../Data/synthetic_data_scenario1.csv"
real_csv_path = "../Data/preprocessed_df_scenario1.csv"

# synthetic_data is what the models below are trained on;
# feature_df is the real dataset they are evaluated on afterwards.
synthetic_data = pd.read_csv(synthetic_csv_path)
feature_df = pd.read_csv(real_csv_path)


# RF

### RF without Optuna

In [3]:
# Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hold out 30% of the synthetic samples; "Label" is the target column.
X = synthetic_data.drop(columns="Label")
y = synthetic_data["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features. The statistics are learned from the training split
# only and then applied unchanged to the hold-out split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with library defaults; only the seed is pinned.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Hard labels feed the threshold-based metrics; the second predict_proba
# column (class 1 here) feeds ROC-AUC.
y_test_pred = rf_model.predict(X_test_scaled)
y_test_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

# Hold-out metrics
accuracy = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
class_report = classification_report(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

print(f"Test ROC-AUC score: {roc_auc:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

# The binary confusion matrix is laid out as [[TN, FP], [FN, TP]].
(tn, fp), (fn, tp) = conf_matrix

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
fpr = fp / (fp + tn)
print(f"False Positive Rate: {fpr}")
print(f"TN, FP, FN, TP: {tn, fp, fn, tp}")


Test ROC-AUC score: 0.5129
Test Accuracy: 0.8257

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       520
           1       0.83      1.00      0.90      2480

    accuracy                           0.83      3000
   macro avg       0.41      0.50      0.45      3000
weighted avg       0.68      0.83      0.75      3000


Confusion Matrix:
 [[   0  520]
 [   3 2477]]
False Positive Rate: 1.0
TN, FP, FN, TP: (0, 520, 3, 2477)


In [4]:
# Evaluate the synthetic-trained random forest on the real dataset.
# To run another scenario, change the CSV paths in the data-loading cell.
# NOTE(review): the original note also said "for each model, change rf_model
# from 1 to 5" - presumably one model variable per scenario; confirm.
#
# Run this cell right after the RF training cell: every model cell re-fits the
# shared `scaler`, so out-of-order execution would scale with the wrong statistics.

# Real features and labels. This rebinds X_test / y_test, which held the
# synthetic hold-out split until now.
X_test, y_test = feature_df.drop("Label", axis=1), feature_df.Label

# Scale with the scaler fitted on the synthetic training split (no refit on real data).
X_real_scaled = scaler.transform(X_test)

# Predict once and reuse the labels for both the report and the confusion matrix.
y_real_pred = rf_model.predict(X_real_scaled)
print(classification_report(y_test, y_real_pred))
cm = confusion_matrix(y_test, y_real_pred)

# Binary confusion matrix flattens to TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
FPR = FP / (FP + TN)
print(f"False Positive Rate (FPR): {FPR:.4f}")

# Print confusion matrix values with descriptions
print("\nConfusion Matrix Scores:")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.80      1.00      0.89        48

    accuracy                           0.80        60
   macro avg       0.40      0.50      0.44        60
weighted avg       0.64      0.80      0.71        60

False Positive Rate (FPR): 1.0000

Confusion Matrix Scores:
True Positives (TP): 48
True Negatives (TN): 0
False Positives (FP): 12
False Negatives (FN): 0


# XGB

### XGB without Optuna

In [5]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Hold out 30% of the synthetic samples; "Label" is the target column.
X, y = synthetic_data.drop("Label", axis=1), synthetic_data.Label
# The raw splits get their own names so that *_scaled always means scaled data
# (the original bound the unscaled splits to X_train_scaled / X_test_scaled first).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit the XGBClassifier model with default parameters.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases; it is
# kept so the cell behaves as before on the installed version - confirm and drop.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Evaluate on the synthetic hold-out split: second predict_proba column
# (class 1 here) for ROC-AUC, hard labels for everything else.
y_test_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = xgb_model.predict(X_test_scaled)

# Compute metrics
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
accuracy = accuracy_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

# Print metrics
print(f"Test ROC-AUC score: {roc_auc:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

# Reuse the matrix computed above; a binary matrix flattens to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
fpr = fp / (fp + tn)
print(f"False Positive Rate: {fpr}")
print(f"TN,FP,FN,TP: {tn,fp,fn,tp}")





Test ROC-AUC score: 0.5006
Test Accuracy: 0.8007

Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.05      0.08       520
           1       0.83      0.96      0.89      2480

    accuracy                           0.80      3000
   macro avg       0.51      0.50      0.48      3000
weighted avg       0.72      0.80      0.75      3000


Confusion Matrix:
 [[  26  494]
 [ 104 2376]]
False Positive Rate: 0.95
TN,FP,FN,TP: (26, 494, 104, 2376)


In [6]:
from sklearn.metrics import classification_report

# Evaluate the synthetic-trained XGBoost model on the real dataset.
# Run this cell right after the XGB training cell: every model cell re-fits the
# shared `scaler`, so out-of-order execution would scale with the wrong statistics.

# Real features and labels. This rebinds y_test, which held the synthetic
# hold-out labels until now.
X_test, y_test = feature_df.drop("Label", axis=1), feature_df.Label

# Scale with the scaler fitted on the synthetic training split (no refit on real data).
X_real_scaled = scaler.transform(X_test)

# Predict once and reuse the labels for both the report and the confusion matrix.
y_real_pred = xgb_model.predict(X_real_scaled)
print(classification_report(y_test, y_real_pred))
cm = confusion_matrix(y_test, y_real_pred)

# Binary confusion matrix flattens to TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
FPR = FP / (FP + TN)
print(f"False Positive Rate (FPR): {FPR:.4f}")

# Print confusion matrix values with descriptions
print("\nConfusion Matrix Scores:")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

              precision    recall  f1-score   support

           0       1.00      0.08      0.15        12
           1       0.81      1.00      0.90        48

    accuracy                           0.82        60
   macro avg       0.91      0.54      0.53        60
weighted avg       0.85      0.82      0.75        60

False Positive Rate (FPR): 0.9167

Confusion Matrix Scores:
True Positives (TP): 48
True Negatives (TN): 1
False Positives (FP): 11
False Negatives (FN): 0


# Logistic Regression

### Logistic regression without Optuna

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hold out 30% of the synthetic samples; "Label" is the target column.
X, y = synthetic_data.drop("Label", axis=1), synthetic_data.Label
# The raw splits get their own names so that *_scaled always means scaled data
# (the original bound the unscaled splits to X_train_scaled / X_test_scaled first).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
# The scaled arrays are wrapped in DataFrames, which get positional integer
# column labels rather than the original feature names.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

# Initialize and fit the Logistic Regression model; max_iter is raised to 1000,
# everything else is left at the library defaults.
Logreg_model = LogisticRegression(max_iter=1000, random_state=42)
Logreg_model.fit(X_train_scaled, y_train)

# Evaluate on the synthetic hold-out split: second predict_proba column
# (class 1 here) for ROC-AUC, hard labels for everything else.
y_test_pred_proba = Logreg_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = Logreg_model.predict(X_test_scaled)

# Compute metrics
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
accuracy = accuracy_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

# Print metrics
print(f"Test ROC-AUC score: {roc_auc:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

# Reuse the matrix computed above; a binary matrix flattens to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
fpr = fp / (fp + tn)
print(f"False Positive Rate: {fpr}")
print(f"TN,FP,FN,TP: {tn,fp,fn,tp}")



Test ROC-AUC score: 0.5475
Test Accuracy: 0.8267

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       520
           1       0.83      1.00      0.91      2480

    accuracy                           0.83      3000
   macro avg       0.41      0.50      0.45      3000
weighted avg       0.68      0.83      0.75      3000


Confusion Matrix:
 [[   0  520]
 [   0 2480]]
False Positive Rate: 1.0
TN,FP,FN,TP: (0, 520, 0, 2480)


In [12]:
# Evaluate the synthetic-trained logistic regression on the real dataset.
# Run this cell right after the LR training cell: every model cell re-fits the
# shared `scaler`, so out-of-order execution would scale with the wrong statistics.

# Real features and labels. This rebinds y_test, which held the synthetic
# hold-out labels until now.
X_test, y_test = feature_df.drop("Label", axis=1), feature_df.Label

# Scale with the scaler fitted on the synthetic training split (no refit on real data).
X_real_scaled = scaler.transform(X_test)

# Predict once and reuse the labels for both the report and the confusion matrix.
y_real_pred = Logreg_model.predict(X_real_scaled)
print(classification_report(y_test, y_real_pred))
cm = confusion_matrix(y_test, y_real_pred)

# Binary confusion matrix flattens to TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
FPR = FP / (FP + TN)
print(f"False Positive Rate (FPR): {FPR:.4f}")

# Print confusion matrix values with descriptions
print("\nConfusion Matrix Scores:")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.80      1.00      0.89        48

    accuracy                           0.80        60
   macro avg       0.40      0.50      0.44        60
weighted avg       0.64      0.80      0.71        60

False Positive Rate (FPR): 1.0000

Confusion Matrix Scores:
True Positives (TP): 48
True Negatives (TN): 0
False Positives (FP): 12
False Negatives (FN): 0


# Decision Tree

### Decision tree with default parameters, without Optuna

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hold out 30% of the synthetic samples; "Label" is the target column.
X, y = synthetic_data.drop("Label", axis=1), synthetic_data.Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
# The scaled arrays are wrapped in DataFrames, which get positional integer
# column labels rather than the original feature names.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

# Initialize and fit the Decision Tree model with default parameters
DT_model = DecisionTreeClassifier(random_state=42)
DT_model.fit(X_train_scaled, y_train)

# Evaluate on the synthetic hold-out split.
# DecisionTreeClassifier always implements predict_proba, so the original
# hasattr() fallbacks were dead code; their None branch would also have crashed
# the formatted ROC-AUC print below.
y_test_pred_proba = DT_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = DT_model.predict(X_test_scaled)

# Compute metrics (the unused standalone f1_score call was removed; F1 is
# already part of the classification report).
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
accuracy = accuracy_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

# Print metrics
print(f"Test ROC-AUC score: {roc_auc:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

# Reuse the matrix computed above; a binary matrix flattens to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
fpr = fp / (fp + tn)
print(f"False Positive Rate: {fpr}")
print(f"TN,FP,FN,TP: {tn,fp,fn,tp}")


Test ROC-AUC score: 0.4873
Test Accuracy: 0.6800

Classification Report:
               precision    recall  f1-score   support

           0       0.16      0.19      0.17       520
           1       0.82      0.78      0.80      2480

    accuracy                           0.68      3000
   macro avg       0.49      0.49      0.49      3000
weighted avg       0.71      0.68      0.69      3000


Confusion Matrix:
 [[ 100  420]
 [ 540 1940]]
False Positive Rate: 0.8076923076923077
TN,FP,FN,TP: (100, 420, 540, 1940)


In [15]:
from sklearn.metrics import classification_report

# Evaluate the synthetic-trained decision tree on the real dataset.
# Run this cell right after the decision-tree training cell: every model cell
# re-fits the shared `scaler`, so out-of-order execution would scale with the
# wrong statistics.

# Real features and labels. This rebinds X_test / y_test, which held the
# synthetic hold-out split until now.
X_test, y_test = feature_df.drop("Label", axis=1), feature_df.Label

# Scale with the scaler fitted on the synthetic training split (no refit on real data).
X_real_scaled = scaler.transform(X_test)

# Predict once and reuse the labels for both the report and the confusion matrix.
y_real_pred = DT_model.predict(X_real_scaled)
print(classification_report(y_test, y_real_pred))
cm = confusion_matrix(y_test, y_real_pred)

# Binary confusion matrix flattens to TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
FPR = FP / (FP + TN)
print(f"False Positive Rate (FPR): {FPR:.4f}")

# Print confusion matrix values with descriptions
print("\nConfusion Matrix Scores:")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

              precision    recall  f1-score   support

           0       0.38      0.25      0.30        12
           1       0.83      0.90      0.86        48

    accuracy                           0.77        60
   macro avg       0.60      0.57      0.58        60
weighted avg       0.74      0.77      0.75        60

False Positive Rate (FPR): 0.7500

Confusion Matrix Scores:
True Positives (TP): 43
True Negatives (TN): 3
False Positives (FP): 9
False Negatives (FN): 5


# LightGBM

In [17]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Hold out 30% of the synthetic samples; "Label" is the target column.
# NOTE(review): this is the only model cell that passes stratify=y, so its
# hold-out split differs from the one used by the other models (512/2488 vs
# 520/2480 class counts in the stored outputs). Left unchanged so the stored
# results stay valid; align the splits if the models are to be compared directly.
X, y = synthetic_data.drop("Label", axis=1), synthetic_data["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit the LGBMClassifier model with default parameters
LGBM_model = LGBMClassifier(random_state=42, n_jobs=-1)
LGBM_model.fit(X_train_scaled, y_train)

# Evaluate on the synthetic hold-out split: second predict_proba column
# (class 1 here) for ROC-AUC, hard labels for everything else.
y_test_pred_proba = LGBM_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = LGBM_model.predict(X_test_scaled)

# Compute metrics
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
accuracy = accuracy_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

# Print metrics
print(f"Test ROC-AUC score: {roc_auc:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

# Reuse the matrix computed above; a binary matrix flattens to TN, FP, FN, TP.
tn, fp, fn, tp = conf_matrix.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
fpr = fp / (fp + tn)
print(f"False Positive Rate: {fpr}")
# Same TN,FP,FN,TP order as the other model cells, so the summary lines can be
# compared at a glance (this cell previously printed TP,TN,FP,FN).
print(f"TN,FP,FN,TP: {tn,fp,fn,tp}")

[LightGBM] [Info] Number of positive: 5806, number of negative: 1194
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 166
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.829429 -> initscore=1.581583
[LightGBM] [Info] Start training from score 1.581583
Test ROC-AUC score: 0.5280
Test Accuracy: 0.8257

Classification Report:
               precision    recall  f1-score   support

           0       0.08      0.00      0.00       512
           1       0.83      1.00      0.90      2488

    accuracy                           0.83      3000
   macro avg       0.45      0.50      0.45      3000
weighted avg       0.70      0.83      0.75      3000


Confusion Matrix:
 [[   1  511]
 [  12 2476]]


In [18]:
from sklearn.metrics import classification_report
# Evaluate the synthetic-trained LightGBM model on the real dataset.
# Run this cell right after the LightGBM training cell: every model cell re-fits
# the shared `scaler`, so out-of-order execution would scale with the wrong
# statistics.

# Real features and labels. This rebinds X_test / y_test, which held the
# synthetic hold-out split until now.
X_test, y_test = feature_df.drop("Label", axis=1), feature_df.Label

# Scale with the scaler fitted on the synthetic training split (no refit on real data).
X_real_scaled = scaler.transform(X_test)

# Predict once and reuse the labels for both the report and the confusion matrix.
y_real_pred = LGBM_model.predict(X_real_scaled)
print(classification_report(y_test, y_real_pred))
cm = confusion_matrix(y_test, y_real_pred)

# Binary confusion matrix flattens to TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# FPR = FP / (FP + TN): share of actual class-0 rows predicted as class 1.
FPR = FP / (FP + TN)
print(f"False Positive Rate (FPR): {FPR:.4f}")

# Print confusion matrix values with descriptions
print("\nConfusion Matrix Scores:")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.80      1.00      0.89        48

    accuracy                           0.80        60
   macro avg       0.40      0.50      0.44        60
weighted avg       0.64      0.80      0.71        60

False Positive Rate (FPR): 1.0000

Confusion Matrix Scores:
True Positives (TP): 48
True Negatives (TN): 0
False Positives (FP): 12
False Negatives (FN): 0
