In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

Prepare data

In [None]:
# NOTE: this whole cell is commented out, so nothing in it executes.
# It sketches a plain shuffled split of merged.csv with test_size=0.2 and is
# kept for reference only.
# df = pd.read_csv('merged.csv')

# X = df.drop(['label'], axis=1)  # Drop 'label' column
# y = df['label'] # Keep only labels

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

Prepare data with a specified malicious/benign ratio in the test set

In [None]:
# Build a train/test split in which the test set has a fixed size and a fixed
# share of malicious samples; every remaining labelled row goes to training.

# All features: 'name', 'packages', 'version', 'average_entropy', 'python_file_count', 'package_size', 'contains_ip', 'contains_domain', 'contains_bytestrings', 'contains_base64', 'contains_import_subprocess', 'contains_import_os', 'contains_import_network_modules'

# Columns that are never loaded, so they cannot be used as model features
columns_to_exclude = ['name', 'packages', 'version', 'contains_domain', 'contains_base64']

# Read the CSV file, skipping the excluded columns
df = pd.read_csv('merged.csv', usecols=lambda x: x not in columns_to_exclude)

# Rows whose label is neither 0 nor 1 end up in neither subset below; report
# them instead of letting them vanish silently.
unexpected_label_count = int((~df['label'].isin([0, 1])).sum())
if unexpected_label_count:
    print(f'Warning: {unexpected_label_count} rows have a label other than 0/1 and are ignored')

# Split the dataset into malicious and benign subsets
malicious_df = df[df['label'] == 1]
benign_df = df[df['label'] == 0]

# Desired test size (fraction of all rows) and malicious ratio in the test set
test_size = 0.2
malicious_ratio_in_test = 0.1

# Convert the fractions into absolute per-class sample counts for the test set
total_test_samples = int(len(df) * test_size)
malicious_test_samples = int(total_test_samples * malicious_ratio_in_test)
benign_test_samples = total_test_samples - malicious_test_samples

# With an integer test_size, train_test_split requires 0 < test_size < n_samples.
# Check that up front so the error says which class is the problem.
for class_name, requested, available in (
    ('malicious', malicious_test_samples, len(malicious_df)),
    ('benign', benign_test_samples, len(benign_df)),
):
    if not 0 < requested < available:
        raise ValueError(
            f'Cannot draw {requested} {class_name} test samples from {available} available; '
            'adjust test_size or malicious_ratio_in_test'
        )

# Split the malicious data
malicious_train, malicious_test = train_test_split(malicious_df, test_size=malicious_test_samples, random_state=42)

# Split the benign data
benign_train, benign_test = train_test_split(benign_df, test_size=benign_test_samples, random_state=42)

# Combine the training and testing sets
train_df = pd.concat([malicious_train, benign_train])
test_df = pd.concat([malicious_test, benign_test])

# Sanity checks: the test set has the requested size and no labelled row was lost
assert len(test_df) == total_test_samples
assert len(train_df) + len(test_df) == len(malicious_df) + len(benign_df)

# Shuffle the training and testing sets so the classes are interleaved
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and labels for training and testing sets
X_train = train_df.drop(['label'], axis=1)
y_train = train_df['label']
X_test = test_df.drop(['label'], axis=1)
y_test = test_df['label']

# Print the sizes of the training and test data
print(f'Training set size: {len(X_train)} samples')
print(f'Test set size: {len(X_test)} samples')

# Output the shapes to verify the splits
print(f'Training set shape: {X_train.shape}, {y_train.shape}')
print(f'Test set shape: {X_test.shape}, {y_test.shape}')
print(f'Class distribution in test set:\n{y_test.value_counts()}')

Train RF Classifier

In [None]:
# Hyperparameters for the random forest; random_state is fixed so repeated
# runs build the same forest.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training split (left as the last expression so the fitted
# estimator is shown as the cell output)
rf_model.fit(X_train, y_train)

Make predictions

In [None]:
# Hard class predictions from the fitted forest for every row of the test split
y_pred = rf_model.predict(X=X_test)

Evaluate results

In [None]:
# Compute all metrics first, then report them in one place
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Overall accuracy, rounded to two decimals
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 table
print(report_text)

# Confusion matrix as returned by confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

Examine feature importance

In [None]:
# Report the forest's per-feature importances, labelled with column names
importances = rf_model.feature_importances_

if not columns_to_exclude:
    # Nothing was excluded: label the importances with the training columns as-is
    feature_importances = pd.Series(importances, index=X_train.columns)
    print('Feature Importances:')
    print(feature_importances)
else:
    # Drop any excluded column name before labelling the importances
    excluded = set(columns_to_exclude)
    remaining_columns = [col for col in X_train.columns if col not in excluded]
    if len(remaining_columns) != len(importances):
        # The names and the importances do not line up one-to-one
        print('Number of remaining columns does not match the length of feature importances.')
    else:
        feature_importances_adjusted = pd.Series(importances, index=remaining_columns)
        print('Adjusted Feature Importances:')
        print(feature_importances_adjusted)

Examine results

In [None]:
# Side-by-side table of true labels and model predictions for the test set
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Display the results
print(results_df)

# Boolean masks over the test rows
is_malicious = results_df['Actual'] == 1
is_flagged = results_df['Predicted'] == 1

# Malicious samples that were actually caught. Counting every predicted
# positive here would also count false positives as "flagged malicious".
caught_malicious = int((is_malicious & is_flagged).sum())
total_malicious = int(is_malicious.sum())
total_flagged = int(is_flagged.sum())

print(f"Flagged {caught_malicious} out of {total_malicious} malicious samples")
# Predictions are made on the test set, so the flag rate is relative to the
# number of test rows, not the training-set size.
print(f"Flagged {(total_flagged / len(results_df)) * 100} percent of samples")
print(f"{malicious_ratio_in_test * 100}% of test was malicious")

Plot ROC Curve

In [None]:
# Score every test sample with the second column of predict_proba
y_prob = rf_model.predict_proba(X_test)[:, 1]

# ROC points (thresholds are not needed) and the area under the resulting curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the curve on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()