In [20]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score


In [23]:
# Step 1: Load Dataset
# Update the path with the correct dataset location.
data = pd.read_csv('Obfuscated-MalMem2022.csv')  # Replace with actual dataset path

# Inspect the data
print("Dataset Preview:")
print(data.head())
print("\nDataset Information:")
print(data.info())
print("\nDataset Summary:")
print(data.describe())


Dataset Preview:
  Category  pslist.nproc  pslist.nppid  pslist.avg_threads  \
0   Benign            45            17           10.555556   
1   Benign            47            19           11.531915   
2   Benign            40            14           14.725000   
3   Benign            32            13           13.500000   
4   Benign            42            16           11.452381   

   pslist.nprocs64bit  pslist.avg_handlers  dlllist.ndlls  \
0                   0           202.844444           1694   
1                   0           242.234043           2074   
2                   0           288.225000           1932   
3                   0           264.281250           1445   
4                   0           281.333333           2067   

   dlllist.avg_dlls_per_proc  handles.nhandles  handles.avg_handles_per_proc  \
0                  38.500000              9129                    212.302326   
1                  44.127660             11385                    242.234043   
2  

In [25]:
# Map 'Category' to numerical labels
data['Category'] = data['Category'].map({'benign': 0, 'malicious': 1})

# Separate features and target variable
X = data.drop(['Category'], axis=1)
y = data['Category']

# Check for non-numeric columns in X
non_numeric_columns = X.select_dtypes(include=['object', 'category']).columns
print("Non-numeric columns in features:", non_numeric_columns)

# Drop or encode non-numeric columns
if len(non_numeric_columns) > 0:
    # Option 1: Drop non-numeric columns
    X = X.drop(non_numeric_columns, axis=1)
    # Option 2: Encode non-numeric columns
    # from sklearn.preprocessing import LabelEncoder
    # for col in non_numeric_columns:
    #     X[col] = LabelEncoder().fit_transform(X[col])

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}")


Non-numeric columns in features: Index(['Class'], dtype='object')
Training set size: (46876, 55), Testing set size: (11720, 55)


In [27]:
# Step 3: Random Forest Classifier
print("\nTraining Random Forest Classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluation
print("\nRandom Forest Results:")
print(classification_report(y_test, rf_predictions))
rf_roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
print(f"Random Forest ROC-AUC: {rf_roc_auc:.3f}")

# Feature Importance Visualization
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
plt.title("Random Forest Feature Importance")
plt.show()


Training Random Forest Classifier...


ValueError: Input y contains NaN.

In [None]:
# Step 4: Support Vector Machine (SVM) Classifier
print("\nTraining Support Vector Machine Classifier...")
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

# Evaluation
print("\nSVM Results:")
print(classification_report(y_test, svm_predictions))
svm_roc_auc = roc_auc_score(y_test, svm_model.decision_function(X_test))
print(f"SVM ROC-AUC: {svm_roc_auc:.3f}")


In [None]:
# Step 5: Results Comparison
print("\nModel Comparison:")
models = ['Random Forest', 'SVM']
roc_auc_scores = [rf_roc_auc, svm_roc_auc]
accuracy_scores = [accuracy_score(y_test, rf_predictions), accuracy_score(y_test, svm_predictions)]

comparison_df = pd.DataFrame({
    'Model': models,
    'ROC-AUC': roc_auc_scores,
    'Accuracy': accuracy_scores
})
print(comparison_df)

# Visualization
plt.figure(figsize=(8, 5))
sns.barplot(x='Model', y='ROC-AUC', data=comparison_df)
plt.title("Model ROC-AUC Comparison")
plt.show()

plt.figure(figsize=(8, 5))
sns.barplot(x='Model', y='Accuracy', data=comparison_df)
plt.title("Model Accuracy Comparison")
plt.show()

In [None]:
# Step 6: Conclusion
print("\nConclusion:")
print("Both models performed well, with the following metrics:")
print(comparison_df.to_string(index=False))