<a href="https://colab.research.google.com/github/sunnyjahangir/Documents/blob/main/Milestone3_CyberSecurity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

# Fix for XGBoost on Mac OS
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# XGBoost is optional: `has_xgboost` tells the model-definition section
# whether to use it or fall back to GradientBoostingClassifier.
# `except Exception` (instead of a bare `except:`) keeps the fallback for any
# import-time failure while letting KeyboardInterrupt / SystemExit propagate,
# and the reason is printed so a broken install is not silently hidden.
try:
    import xgboost as xgb
    has_xgboost = True
except Exception as exc:
    print("XGBoost not available. Using GradientBoostingClassifier as a replacement.")
    print(f"  Reason: {type(exc).__name__}: {exc}")
    has_xgboost = False

# --- 1. Load Primary Dataset (UGRansome) ---
# Read the UGRansome CSV from its fixed local path and report its size.
ugransome_csv_path = '/Users/sunnyjahangir/Downloads/UGRansome_Dataset_2024.csv'
primary_df = pd.read_csv(ugransome_csv_path)
print(f"Primary dataset shape: {primary_df.shape}")

# Drop exact duplicate rows and report how many were removed.
initial_rows = primary_df.shape[0]
primary_df = primary_df.drop_duplicates()
print(f"Removed {initial_rows - len(primary_df)} duplicate rows")

# --- 2. Load Windows Malware API Functions Dataset ---
# Only the API-functions sample file is loaded here.
api_functions_csv_path = '/Users/sunnyjahangir/PycharmProjects/MachineLearning/PythonProject4/analysis_results/API_Functions_samples.csv'
api_functions = pd.read_csv(api_functions_csv_path)
print(f"API functions shape: {api_functions.shape}")

# --- 3. Feature Engineering ---
print("\n--- Feature Engineering ---")

# Feature Engineering for Primary Dataset
# Add network-based features (each one only when its source columns exist).
if 'Netflow_Bytes' in primary_df.columns:
    # Log transform for skewed numerical data
    primary_df['Log_Netflow_Bytes'] = np.log1p(primary_df['Netflow_Bytes'])

# Create Protocol-Port combinations if both exist
if 'Protocol' in primary_df.columns and 'Port' in primary_df.columns:
    primary_df['Protocol_Port'] = primary_df['Protocol'] + '_' + primary_df['Port'].astype(str)

# --- API Functions Feature Engineering ---
# Everything from the third column onwards is treated as an API column; the
# first two are skipped (the script later drops 'SHA256' and 'Label').
# NOTE(review): "present" means non-null here. If the CSV stores explicit 0/1
# indicators instead of blanks, notna() is True for every cell - confirm
# against the data file.
api_presence = api_functions.iloc[:, 2:].notna()

# Count non-null APIs per sample
api_functions['API_Count'] = api_presence.sum(axis=1)

# Get most common API functions (top 20).
# Keep APIs present in more than 10% of the samples, then rank them by how
# often they occur so that the 20 retained are really the most frequent ones
# (not merely the first 20 qualifying columns in file order). The stable sort
# keeps the original column order among ties.
min_occurrences = api_functions.shape[0] * 0.1
api_occurrences = api_presence.sum(axis=0)
frequent_apis = api_occurrences[api_occurrences > min_occurrences]
common_apis = frequent_apis.sort_values(ascending=False, kind='stable').head(20).index.tolist()

# Create binary features for common APIs (1 = the API cell is non-null)
for api in common_apis:
    api_functions[f'Uses_{api}'] = api_presence[api].astype(int)

# --- 4. Prepare Primary Dataset ---
print("\n--- Processing Primary Dataset ---")

# Label column of the primary dataset
primary_target = 'Threats'

# Candidate inputs: raw columns followed by the engineered ones. Only the
# candidates that actually exist in the frame are kept.
candidate_features = (
    'Time', 'Protocol', 'Flag', 'Clusters', 'Netflow_Bytes', 'Port',
    'Log_Netflow_Bytes', 'Protocol_Port',
)
primary_features = [name for name in candidate_features if name in primary_df.columns]

# Feature matrix and label vector (copies, so primary_df stays untouched)
X_primary = primary_df.loc[:, primary_features].copy()
y_primary = primary_df[primary_target].copy()

# Column groups by dtype, captured before any encoding takes place
primary_cat_cols = list(X_primary.select_dtypes(include=['object', 'category']).columns)
primary_num_cols = list(X_primary.select_dtypes(include=np.number).columns)

# Numeric columns: fill gaps with the column median
for col in primary_num_cols:
    values = X_primary[col]
    if values.isna().any():
        X_primary[col] = values.fillna(values.median())

# Categorical columns: fill gaps with the most frequent value, then replace
# the categories with integer codes
for col in primary_cat_cols:
    values = X_primary[col]
    if values.isna().any():
        values = values.fillna(values.mode()[0])
    le = LabelEncoder()
    X_primary[col] = le.fit_transform(values)

# Integer-encode the target; the encoder is kept so class names can be
# recovered later
primary_target_encoder = LabelEncoder()
y_primary_encoded = primary_target_encoder.fit_transform(y_primary)
print(f"Primary target classes: {primary_target_encoder.classes_}")

# --- 5. Process API Functions Dataset ---
print("\n--- Processing API Functions Dataset ---")

# Define features and target: everything except the sample hash and the label
X_api = api_functions.drop(['SHA256', 'Label'], axis=1)
y_api = api_functions['Label']

# Process categorical and numerical features (groups captured before encoding)
api_cat_cols = X_api.select_dtypes(include=['object', 'category']).columns.tolist()
api_num_cols = X_api.select_dtypes(include=np.number).columns.tolist()

# Handle missing values: median for numeric columns, mode for categorical ones
for col in api_num_cols:
    if X_api[col].isnull().any():
        X_api[col] = X_api[col].fillna(X_api[col].median())

for col in api_cat_cols:
    if X_api[col].isnull().any():
        X_api[col] = X_api[col].fillna(X_api[col].mode()[0])

# Encode categorical features as integer codes
for col in api_cat_cols:
    le = LabelEncoder()
    X_api[col] = le.fit_transform(X_api[col])

# Encode target variable.
# The encoder is fitted for every label dtype (not only for string labels):
#   * `api_target_encoder` is read later when the best model is evaluated, so
#     it must exist even when the labels are already numeric;
#   * the encoded labels are guaranteed to be the integers 0..k-1, which is
#     what np.bincount and the class-imbalance report further down require.
# For labels that are already 0..k-1 integers the encoding is the identity.
api_target_encoder = LabelEncoder()
y_api_encoded = api_target_encoder.fit_transform(y_api)
print(f"API target classes: {api_target_encoder.classes_}")

print(f"API features shape: {X_api.shape}")

# --- 6. Data Splitting ---
print("\n--- Data Splitting ---")

# Both datasets use the same 80/20 split with a fixed seed; each split is
# stratified on its own encoded labels.
split_options = {'test_size': 0.2, 'random_state': 42}

# Primary (UGRansome) dataset
X_train_primary, X_test_primary, y_train_primary, y_test_primary = train_test_split(
    X_primary, y_primary_encoded, stratify=y_primary_encoded, **split_options
)

# API functions dataset
X_train_api, X_test_api, y_train_api, y_test_api = train_test_split(
    X_api, y_api_encoded, stratify=y_api_encoded, **split_options
)

# --- 7. Feature Scaling ---
print("\n--- Feature Scaling ---")

# Primary dataset: the scaler is fitted on the training split only and is
# applied to the columns listed in primary_num_cols; all other columns are
# carried over unchanged in the copies.
primary_scaler = StandardScaler().fit(X_train_primary[primary_num_cols])
X_train_primary_scaled, X_test_primary_scaled = X_train_primary.copy(), X_test_primary.copy()
for frame in (X_train_primary_scaled, X_test_primary_scaled):
    frame[primary_num_cols] = primary_scaler.transform(frame[primary_num_cols])

# API dataset: every column is scaled, again with statistics taken from the
# training split only.
api_scaler = StandardScaler().fit(X_train_api)
X_train_api_scaled = api_scaler.transform(X_train_api)
X_test_api_scaled = api_scaler.transform(X_test_api)

# --- 8. Handle Class Imbalance ---
print("\n--- Handling Class Imbalance ---")

# Oversample the primary training split with SMOTE (the test split is left
# untouched) and report the resulting size and per-class counts.
smote_primary = SMOTE(random_state=42)
primary_resampled = smote_primary.fit_resample(X_train_primary_scaled, y_train_primary)
X_train_primary_smote, y_train_primary_smote = primary_resampled
primary_class_counts = np.bincount(y_train_primary_smote)
print(f"Primary training data shape after SMOTE: {X_train_primary_smote.shape}")
print(f"Primary class distribution after SMOTE: {primary_class_counts}")

# Same treatment for the API training split.
smote_api = SMOTE(random_state=42)
api_resampled = smote_api.fit_resample(X_train_api_scaled, y_train_api)
X_train_api_smote, y_train_api_smote = api_resampled
api_class_counts = np.bincount(y_train_api_smote)
print(f"API training data shape after SMOTE: {X_train_api_smote.shape}")
print(f"API class distribution after SMOTE: {api_class_counts}")

# --- 9. Train Models ---
print("\n--- Training Models ---")

# Define models. These instances are used as unfitted templates: every
# dataset trains its own clone (see _train_and_evaluate), so `models` itself
# is never fitted.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    'SVC': SVC(probability=True, random_state=42)
}

# Add XGBoost if available, otherwise use Gradient Boosting
if has_xgboost:
    models['XGBoost'] = xgb.XGBClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=1,  # Use single thread to avoid OpenMP issues
        use_label_encoder=False,
        eval_metric='mlogloss',
        tree_method='hist'  # Use histogram-based algorithm which is faster
    )
else:
    models['Gradient Boosting'] = GradientBoostingClassifier(
        n_estimators=200,
        random_state=42
    )


def _train_and_evaluate(model_templates, X_train, y_train, X_test, y_test):
    """Fit a fresh copy of every template and score it on the test split.

    Args:
        model_templates: mapping of display name -> unfitted estimator.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for scoring.

    Returns:
        A ``(results, fitted_models)`` tuple. ``results`` maps each name to
        ``{'Accuracy': ..., 'F1 Score': ...}`` (F1 is the weighted average);
        ``fitted_models`` maps each name to the estimator fitted on
        ``X_train``.
    """
    results = {}
    fitted_models = {}
    for name, template in model_templates.items():
        print(f"  Training {name}...")

        # clone() returns an unfitted estimator with the same parameters.
        # Fitting the template itself would make every dataset share one
        # object, and the last fit would overwrite the earlier ones.
        model = clone(template)
        model.fit(X_train, y_train)

        # Save the trained model
        fitted_models[name] = model

        # Make predictions and calculate metrics
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Store results
        results[name] = {
            'Accuracy': accuracy,
            'F1 Score': f1
        }

        print(f"  {name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
    return results, fitted_models


# Train and evaluate models on primary dataset
print("\nTraining on Primary Dataset:")
primary_results, primary_models = _train_and_evaluate(
    models, X_train_primary_smote, y_train_primary_smote, X_test_primary_scaled, y_test_primary
)

# Train and evaluate models on API dataset
print("\nTraining on API Function Dataset:")
api_results, api_models = _train_and_evaluate(
    models, X_train_api_smote, y_train_api_smote, X_test_api_scaled, y_test_api
)

# --- 10. Model Ranking and Visualization ---
print("\n--- Model Ranking and Visualization ---")

# One results table per dataset (rows = model names), each tagged with the
# dataset it was measured on.
primary_df_results = pd.DataFrame.from_dict(primary_results, orient='index').assign(Dataset='UGRansome')
api_df_results = pd.DataFrame.from_dict(api_results, orient='index').assign(Dataset='API Functions')

# Stack both tables and move the model name from the index into a column,
# leaving a plain 0..n-1 index.
stacked_results = pd.concat([primary_df_results, api_df_results])
all_results = stacked_results.assign(Model=stacked_results.index).reset_index(drop=True)

# Columns shown in both ranking printouts
report_columns = ['Model', 'Dataset', 'Accuracy', 'F1 Score']

# Ranking by accuracy, best first
ranked_by_accuracy = all_results.sort_values(by='Accuracy', ascending=False)
print("\nModels Ranked by Accuracy:")
print(ranked_by_accuracy[report_columns])

# Ranking by F1 score, best first
ranked_by_f1 = all_results.sort_values(by='F1 Score', ascending=False)
print("\nModels Ranked by F1 Score:")
print(ranked_by_f1[report_columns])

# The overall winner is the top row of the F1 ranking
best_model_idx = ranked_by_f1.index[0]
best_model_name, best_model_dataset, best_model_accuracy, best_model_f1 = (
    ranked_by_f1.at[best_model_idx, column] for column in report_columns
)

print(f"\nBest Overall Model: {best_model_name} on {best_model_dataset} Dataset")
print(f"  Accuracy: {best_model_accuracy:.4f}")
print(f"  F1 Score: {best_model_f1:.4f}")

# --- 11. Detailed Evaluation of the Best Model ---
print("\n--- Detailed Evaluation of the Best Model ---")

# Get the actual best model together with the test split and label encoder
# of the dataset it was ranked on.
if best_model_dataset == 'UGRansome':
    best_model = primary_models[best_model_name]
    X_test = X_test_primary_scaled
    y_test = y_test_primary
    target_encoder = primary_target_encoder
else:
    best_model = api_models[best_model_name]
    X_test = X_test_api_scaled
    y_test = y_test_api
    # The API label encoder may only have been created for string labels
    # (object dtype). For any other label dtype, fit one here so the class
    # names are available instead of failing on an undefined name.
    if y_api.dtype == 'object':
        target_encoder = api_target_encoder
    else:
        target_encoder = LabelEncoder().fit(y_api)

# classification_report needs string display names; the encoder classes may
# be numeric, so convert them explicitly.
class_names = [str(label) for label in target_encoder.classes_]

# Make predictions
y_pred = best_model.predict(X_test)
# Class probabilities; not used within this section.
y_prob = best_model.predict_proba(X_test)

# Classification report
print("\nClassification Report for the Best Model:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix, saved as a PNG in the current working directory
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title(f'Confusion Matrix - {best_model_name} on {best_model_dataset}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('best_model_confusion_matrix.png')
plt.close()

# --- 12. Visualization of Model Performance ---
print("\n--- Creating Performance Visualizations ---")

# Global plot style and a 10-colour palette sampled from tab10
plt.style.use('ggplot')
colors = plt.cm.tab10(np.linspace(0, 1, 10))

# 1. Bar chart comparison of all models: one figure, two equal-width panels
plt.figure(figsize=(14, 8))
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
bar_width = 0.35
index = np.arange(len(models))


def _draw_metric_panel(ax, primary_values, api_values, metric):
    """Draw side-by-side bars (UGRansome vs. API Functions) for one metric."""
    ax.bar(index, primary_values, bar_width, label='UGRansome', color=colors[0], alpha=0.8)
    ax.bar(index + bar_width, api_values, bar_width, label='API Functions', color=colors[1], alpha=0.8)

    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} Comparison')
    # Centre each tick between the two bars of its model
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(models.keys())
    ax.legend()


# Left panel: accuracy per model on each dataset
ax1 = plt.subplot(gs[0])
primary_acc = [primary_results[name]['Accuracy'] for name in models]
api_acc = [api_results[name]['Accuracy'] for name in models]
_draw_metric_panel(ax1, primary_acc, api_acc, 'Accuracy')

# Right panel: F1 score per model on each dataset
ax2 = plt.subplot(gs[1])
primary_f1 = [primary_results[name]['F1 Score'] for name in models]
api_f1 = [api_results[name]['F1 Score'] for name in models]
_draw_metric_panel(ax2, primary_f1, api_f1, 'F1 Score')

# Save the figure as a PNG in the current working directory
plt.tight_layout()
plt.savefig('model_performance_comparison.png')
plt.close()

