In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import os


In [None]:
# 2. Load raw data
# Feature columns kept for modelling (5 features).
selected_features = ["spyware", "encrypter", "downloader", "backdoor", "ransomware"]

# Each class lives in its own CSV; tag the rows by source file:
# 0 for rows from benign.csv, 1 for rows from malware.csv.
benign_df = pd.read_csv("../data/raw/benign.csv")
benign_df["malice_label"] = 0

malware_df = pd.read_csv("../data/raw/malware.csv")
malware_df["malice_label"] = 1

# Stack both classes, shuffle all rows reproducibly (seed 42), renumber the
# index, then keep only the selected features plus the "malice" and
# "malice_label" columns.
df = (
    pd.concat([benign_df, malware_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    [selected_features + ["malice", "malice_label"]]
)


In [None]:
# 3. Hold out 10% of the data as an inference set
X = df[selected_features]
y = df["malice_label"]

# Stratify on the label so the hold-out keeps the same class ratio as the rest.
X_main, X_infer, y_main, y_infer = train_test_split(
    X, y, test_size=0.10, stratify=y, random_state=42
)

# Save inference set (written as-is; no scaling has been applied at this point).
# to_csv does not create missing directories, so make sure the target folder
# exists first -- otherwise a fresh checkout fails here with an OSError.
inference_dir = "../data/processed/inference"
os.makedirs(inference_dir, exist_ok=True)
X_infer.to_csv(f"{inference_dir}/X_inference.csv", index=False)
y_infer.to_csv(f"{inference_dir}/y_inference.csv", index=False)


In [None]:
# 4. Split X_main/y_main into train/val/test (60/20/20)
# Both splits are stratified on the label and seeded for reproducibility.
# Step 1: carve off the test set -- 20% of X_main.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_main, y_main, test_size=0.20, stratify=y_main, random_state=42
)  # test = 20% of X_main, X_temp = remaining 80%
# Step 2: take validation from the remaining 80%: 0.25 * 0.80 = 0.20 of X_main,
# which leaves 0.75 * 0.80 = 0.60 of X_main for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)  # val = 20% of X_main, train = 60% of X_main


In [None]:
# 5. Apply SMOTE to training set
# Only the training split is passed to the sampler; X_val and X_test are not
# touched in this cell. The sampler is seeded (42) for reproducibility.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair


In [None]:
# 6. Normalize using StandardScaler (fit only on train)
scaler = StandardScaler()
# Fit on the SMOTE-resampled training data, then reuse the fitted scaler
# unchanged for the validation and test splits.
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Save scaler
# joblib.dump does not create missing directories, so make sure the target
# folder exists first -- otherwise a fresh checkout fails here.
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)
joblib.dump(scaler, f"{models_dir}/scaler.pkl")


In [None]:
# 7. Save .npy arrays for model training
# np.save does not create missing directories, so make sure the output folder
# exists first -- otherwise a fresh checkout fails here.
train_test_dir = "../data/processed/train_test"
os.makedirs(train_test_dir, exist_ok=True)

# File stem -> object to persist. Train features/labels are the scaled,
# SMOTE-resampled versions; val/test features are scaled only.
arrays_to_save = {
    "X_train": X_train_scaled,
    "y_train": y_train_resampled,
    "X_val": X_val_scaled,
    "y_val": y_val,
    "X_test": X_test_scaled,
    "y_test": y_test,
}
for array_name, array_values in arrays_to_save.items():
    np.save(f"{train_test_dir}/{array_name}.npy", array_values)


In [None]:
# 8. Confirm shapes
# Labels keep their original padding so the colons line up in the output.
shape_report = [
    ("X_train:", X_train_scaled),
    ("X_val  :", X_val_scaled),
    ("X_test :", X_test_scaled),
    ("X_infer:", X_infer),
]
for split_label, split_data in shape_report:
    print(split_label, split_data.shape)
