<a href="https://colab.research.google.com/github/yms07/My-Project1/blob/main/TestingModel_UsingAES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =====================================
# STEP 0: Mount Google Drive
# =====================================
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ✅ Ransomware Detection Full Colab Notebook (Steps 4-6 + K-Fold CV)

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Paths
base_path = '/content/drive/MyDrive/ransomware_data'
plain_csv = f'{base_path}/features_plaintext.csv'
infect_csv = f'{base_path}/features_infected.csv'

# Load data
plain_df = pd.read_csv(plain_csv)
infect_df = pd.read_csv(infect_csv)

# Combine datasets
df = pd.concat([plain_df, infect_df], ignore_index=True)

# Convert timestamps to numeric
for col in ['modified_time', 'access_time', 'created_time']:
    df[col] = pd.to_datetime(df[col], errors='coerce')
    df[col] = df[col].astype('int64') / 1e9

df.dropna(inplace=True)
print("✅ Dataset loaded and cleaned")

# All models
models = {
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVM': SVC(probability=True),
    'MLP': MLPClassifier(max_iter=1000)
}

def kfold_train_and_evaluate(X, y, label):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    print(f"\n🧪 K-Fold Evaluation for {label}:")

    for name, model in models.items():
        acc_list, prec_list, rec_list, f1_list, auc_list = [], [], [], [], []

        for train_idx, test_idx in skf.split(X_scaled, y):
            X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            acc_list.append(accuracy_score(y_test, y_pred))
            prec_list.append(precision_score(y_test, y_pred))
            rec_list.append(recall_score(y_test, y_pred))
            f1_list.append(f1_score(y_test, y_pred))
            if y_prob is not None:
                auc_list.append(roc_auc_score(y_test, y_prob))

        avg_acc = sum(acc_list) / len(acc_list)
        avg_prec = sum(prec_list) / len(prec_list)
        avg_rec = sum(rec_list) / len(rec_list)
        avg_f1 = sum(f1_list) / len(f1_list)
        avg_auc = sum(auc_list) / len(auc_list) if auc_list else None

        print(f"\n📊 Model: {name}")
        print(f"Accuracy : {avg_acc:.4f}")
        print(f"Precision: {avg_prec:.4f}")
        print(f"Recall   : {avg_rec:.4f}")
        print(f"F1 Score : {avg_f1:.4f}")
        print(f"AUC      : {avg_auc:.4f}" if avg_auc else "AUC      : N/A")

# ===============================
# Run for all 3 feature combinations
# ===============================

y = df['label']

# Dataset 1: entropy + file_type
X1 = df[['entropy', 'file_type']]
kfold_train_and_evaluate(X1, y, "Dataset 1 (entropy + file_type)")

# Dataset 2: entropy + file_type + size
X2 = df[['entropy', 'file_type', 'size']]
kfold_train_and_evaluate(X2, y, "Dataset 2 (entropy + file_type + size)")

# Dataset 3: entropy + file_type + size + MAC times
X3 = df[['entropy', 'file_type', 'size', 'modified_time', 'access_time', 'created_time']]
kfold_train_and_evaluate(X3, y, "Dataset 3 (entropy + file_type + size + MAC times)")

✅ Dataset loaded and cleaned

🧪 K-Fold Evaluation for Dataset 1 (entropy + file_type):

📊 Model: KNN
Accuracy : 0.9950
Precision: 0.9975
Recall   : 0.9972
F1 Score : 0.9974
AUC      : 0.9984

📊 Model: LogisticRegression
Accuracy : 0.9422
Precision: 0.9422
Recall   : 1.0000
F1 Score : 0.9703
AUC      : 0.9146

📊 Model: DecisionTree
Accuracy : 0.9944
Precision: 0.9973
Recall   : 0.9968
F1 Score : 0.9970
AUC      : 0.9764

📊 Model: RandomForest
Accuracy : 0.9943
Precision: 0.9972
Recall   : 0.9968
F1 Score : 0.9970
AUC      : 0.9981

📊 Model: GradientBoosting
Accuracy : 0.9952
Precision: 0.9977
Recall   : 0.9971
F1 Score : 0.9974
AUC      : 0.9995

📊 Model: SVM
Accuracy : 0.9926
Precision: 0.9984
Recall   : 0.9938
F1 Score : 0.9961
AUC      : 0.9966

📊 Model: MLP
Accuracy : 0.9937
Precision: 0.9984
Recall   : 0.9950
F1 Score : 0.9967
AUC      : 0.9994

🧪 K-Fold Evaluation for Dataset 2 (entropy + file_type + size):

📊 Model: KNN
Accuracy : 0.9961
Precision: 0.9980
Recall   : 0.9978
F1 Sco

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Example: Final training for RandomForest with Dataset 2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import joblib

# Features same as best Dataset 2
X = df[['entropy', 'file_type', 'size']]
y = df['label']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = RandomForestClassifier()
model.fit(X_scaled, y)

# Save model & scaler
joblib.dump(model, '/content/drive/MyDrive/ransomware_model.joblib')
joblib.dump(scaler, '/content/drive/MyDrive/ransomware_scaler.joblib')

print("✅ Final model and scaler saved!")

✅ Final model and scaler saved!


In [None]:
import os
import math
from datetime import datetime
import joblib
import math

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}


# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/olecli32.dll'

# Use your same calculate_entropy & FILE_TYPE_MAP
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)
stat = os.stat(test_file_path)

from datetime import datetime
m_time = datetime.fromtimestamp(stat.st_mtime).timestamp()
a_time = datetime.fromtimestamp(stat.st_atime).timestamp()
c_time = datetime.fromtimestamp(stat.st_ctime).timestamp()

X_new = [[entropy, file_type, size]]

# Load final model & scaler
import joblib
model = joblib.load('/content/drive/MyDrive/ransomware_model.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: Safe (Plaintext)




In [None]:
import os
import math
from datetime import datetime
import joblib
import math

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}


# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/aclui.dll'

# Use your same calculate_entropy & FILE_TYPE_MAP
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)
stat = os.stat(test_file_path)

from datetime import datetime
m_time = datetime.fromtimestamp(stat.st_mtime).timestamp()
a_time = datetime.fromtimestamp(stat.st_atime).timestamp()
c_time = datetime.fromtimestamp(stat.st_ctime).timestamp()

X_new = [[entropy, file_type, size]]

# Load final model & scaler
import joblib
model = joblib.load('/content/drive/MyDrive/ransomware_model.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: ⚠️ Infected (Ransomware)!




In [None]:
# ✅ Final training for Decision Tree with Dataset 2
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import joblib

# ✅ Features for Dataset 2: entropy, file_type, size
X = df[['entropy', 'file_type', 'size']]
X['size'] = X['size'] / 10_000_000  # scale size (if not done before)
y = df['label']

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Decision Tree Model
model = DecisionTreeClassifier()
model.fit(X_scaled, y)

# Save model & scaler
joblib.dump(model, '/content/drive/MyDrive/ransomware_model_dataset2_DT.joblib')
joblib.dump(scaler, '/content/drive/MyDrive/ransomware_scaler_dataset2_DT.joblib')

print("✅ Final Decision Tree model and scaler saved for Dataset 2!")

✅ Final Decision Tree model and scaler saved for Dataset 2!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['size'] = X['size'] / 10_000_000  # scale size (if not done before)


In [None]:
import os
import math
from datetime import datetime
import joblib

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}

# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/aclui.dll'

# Extract features
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000  # scale like training
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)

# ✅ Dataset 2: Only [entropy, file_type, size]
X_new = [[entropy, file_type, size]]

# Load Decision Tree model & scaler
model = joblib.load('/content/drive/MyDrive/ransomware_model_dataset2_DT.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler_dataset2_DT.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: ⚠️ Infected (Ransomware)!




In [None]:
import os
import math
from datetime import datetime
import joblib

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}

# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/50.c'

# Extract features
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000  # scale like training
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)

# ✅ Dataset 2: Only [entropy, file_type, size]
X_new = [[entropy, file_type, size]]

# Load Decision Tree model & scaler
model = joblib.load('/content/drive/MyDrive/ransomware_model_dataset2_DT.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler_dataset2_DT.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: Safe (Plaintext)




In [None]:
# ✅ Final training for SVM with Dataset 2
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import joblib

# ✅ Features for Dataset 2: entropy, file_type, size
X = df[['entropy', 'file_type', 'size']]
X['size'] = X['size'] / 10_000_000  # scale size (if not done before)
y = df['label']

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# SVM Model (enable probability for predict_proba)
model = SVC(probability=True)
model.fit(X_scaled, y)

# Save model & scaler
joblib.dump(model, '/content/drive/MyDrive/ransomware_model_dataset2_SVM.joblib')
joblib.dump(scaler, '/content/drive/MyDrive/ransomware_scaler_dataset2_SVM.joblib')

print("✅ Final SVM model and scaler saved for Dataset 2!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['size'] = X['size'] / 10_000_000  # scale size (if not done before)


✅ Final SVM model and scaler saved for Dataset 2!


In [None]:
import os
import math
from datetime import datetime
import joblib

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}

# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/aclui.dll'

# Extract features
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000  # scale like training
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)

# ✅ Dataset 2: Only [entropy, file_type, size]
X_new = [[entropy, file_type, size]]

# Load SVM model & scaler
model = joblib.load('/content/drive/MyDrive/ransomware_model_dataset2_SVM.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler_dataset2_SVM.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: ⚠️ Infected (Ransomware)!




In [None]:
import os
import math
from datetime import datetime
import joblib

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}

# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/50.c'

# Extract features
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000  # scale like training
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)

# ✅ Dataset 2: Only [entropy, file_type, size]
X_new = [[entropy, file_type, size]]

# Load SVM model & scaler
model = joblib.load('/content/drive/MyDrive/ransomware_model_dataset2_SVM.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler_dataset2_SVM.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: Safe (Plaintext)




In [None]:
# ✅ Final training for MLP with Dataset 2
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import joblib

# ✅ Features for Dataset 2: entropy, file_type, size
X = df[['entropy', 'file_type', 'size']]
X['size'] = X['size'] / 10_000_000  # scale size (if not done before)
y = df['label']

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# MLP Model (you can tune hidden_layer_sizes if needed)
model = MLPClassifier(max_iter=1000, random_state=42)
model.fit(X_scaled, y)

# Save model & scaler
joblib.dump(model, '/content/drive/MyDrive/ransomware_model_dataset2_MLP.joblib')
joblib.dump(scaler, '/content/drive/MyDrive/ransomware_scaler_dataset2_MLP.joblib')

print("✅ Final MLP model and scaler saved for Dataset 2!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['size'] = X['size'] / 10_000_000  # scale size (if not done before)


✅ Final MLP model and scaler saved for Dataset 2!


In [None]:
import os
import math
from datetime import datetime
import joblib

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}

# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/aclui.dll'

# Extract features
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000  # scale like training
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)

# ✅ Dataset 2: Only [entropy, file_type, size]
X_new = [[entropy, file_type, size]]

# Load MLP model & scaler
model = joblib.load('/content/drive/MyDrive/ransomware_model_dataset2_MLP.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler_dataset2_MLP.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: ⚠️ Infected (Ransomware)!




In [None]:
import os
import math
from datetime import datetime
import joblib

def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# Same file type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}

# Example: New file path
test_file_path = '/content/drive/MyDrive/test_files/50.c'

# Extract features
entropy = calculate_entropy(test_file_path)
size = os.path.getsize(test_file_path) / 10_000_000  # scale like training
ext = test_file_path.split('.')[-1].lower()
file_type = FILE_TYPE_MAP.get(ext, 0)

# ✅ Dataset 2: Only [entropy, file_type, size]
X_new = [[entropy, file_type, size]]

# Load MLP model & scaler
model = joblib.load('/content/drive/MyDrive/ransomware_model_dataset2_MLP.joblib')
scaler = joblib.load('/content/drive/MyDrive/ransomware_scaler_dataset2_MLP.joblib')

X_scaled_new = scaler.transform(X_new)

pred = model.predict(X_scaled_new)

print("✅ Verdict:", "Safe (Plaintext)" if pred[0] == 1 else "⚠️ Infected (Ransomware)!")

✅ Verdict: Safe (Plaintext)




In [7]:
# ==========================
# 📂 1) Import libraries
# ==========================
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone

# ==========================
# 📂 2) Load data
# ==========================
# Paths
base_path = '/content/drive/MyDrive/ransomware_data'
plain_csv = f'{base_path}/features_plaintext.csv'
infect_csv = f'{base_path}/features_infected.csv'

# Load data
plain_df = pd.read_csv(plain_csv)
infect_df = pd.read_csv(infect_csv)

# Combine datasets
df = pd.concat([plain_df, infect_df], ignore_index=True)

# Convert timestamps to numeric
for col in ['modified_time', 'access_time', 'created_time']:
    df[col] = pd.to_datetime(df[col], errors='coerce')
    df[col] = df[col].astype('int64') / 1e9

df.dropna(inplace=True)
print("✅ Dataset loaded and cleaned")

# ==========================
# 📂 3) Models
# ==========================
models = {
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVM': SVC(probability=True),
    'MLP': MLPClassifier(max_iter=1000)
}

# ==========================
# 📂 4) K-Fold CV Function
# ==========================
def kfold_train_and_evaluate(X, y, label):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    print(f"\n🧪 K-Fold Evaluation for {label}:")

    for name, model in models.items():
        acc_list, prec_list, rec_list, f1_list, auc_list = [], [], [], [], []
        acc_list_train, prec_list_train, rec_list_train, f1_list_train, auc_list_train = [], [], [], [], []

        for train_idx, test_idx in skf.split(X_scaled, y):
            X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            m = clone(model)
            m.fit(X_train, y_train)

            y_pred_test = m.predict(X_test)
            y_pred_train = m.predict(X_train)

            y_prob_test = m.predict_proba(X_test)[:, 1] if hasattr(m, "predict_proba") else None
            y_prob_train = m.predict_proba(X_train)[:, 1] if hasattr(m, "predict_proba") else None

            # Test scores
            acc_list.append(accuracy_score(y_test, y_pred_test))
            prec_list.append(precision_score(y_test, y_pred_test))
            rec_list.append(recall_score(y_test, y_pred_test))
            f1_list.append(f1_score(y_test, y_pred_test))
            if y_prob_test is not None:
                auc_list.append(roc_auc_score(y_test, y_prob_test))

            # Train scores
            acc_list_train.append(accuracy_score(y_train, y_pred_train))
            prec_list_train.append(precision_score(y_train, y_pred_train))
            rec_list_train.append(recall_score(y_train, y_pred_train))
            f1_list_train.append(f1_score(y_train, y_pred_train))
            if y_prob_train is not None:
                auc_list_train.append(roc_auc_score(y_train, y_prob_train))

        # Averages
        avg_acc = sum(acc_list) / len(acc_list)
        avg_prec = sum(prec_list) / len(prec_list)
        avg_rec = sum(rec_list) / len(rec_list)
        avg_f1 = sum(f1_list) / len(f1_list)
        avg_auc = sum(auc_list) / len(auc_list) if auc_list else None

        avg_acc_train = sum(acc_list_train) / len(acc_list_train)
        avg_prec_train = sum(prec_list_train) / len(prec_list_train)
        avg_rec_train = sum(rec_list_train) / len(rec_list_train)
        avg_f1_train = sum(f1_list_train) / len(f1_list_train)
        avg_auc_train = sum(auc_list_train) / len(auc_list_train) if auc_list_train else None

        print(f"\n📊 Model: {name}")
        print(f"Train Accuracy : {avg_acc_train:.4f} | Test Accuracy : {avg_acc:.4f} | Diff: {avg_acc_train - avg_acc:.4f}")
        print(f"Train Precision: {avg_prec_train:.4f} | Test Precision: {avg_prec:.4f} | Diff: {avg_prec_train - avg_prec:.4f}")
        print(f"Train Recall   : {avg_rec_train:.4f} | Test Recall   : {avg_rec:.4f} | Diff: {avg_rec_train - avg_rec:.4f}")
        print(f"Train F1 Score : {avg_f1_train:.4f} | Test F1 Score : {avg_f1:.4f} | Diff: {avg_f1_train - avg_f1:.4f}")
        if avg_auc_train is not None and avg_auc is not None:
            print(f"Train AUC      : {avg_auc_train:.4f} | Test AUC      : {avg_auc:.4f} | Diff: {avg_auc_train - avg_auc:.4f}")
        else:
            print(f"AUC: N/A")

# ==========================
# 📂 5) Run all 3 datasets
# ==========================
y = df['label']

# Dataset 1: entropy + file_type
X1 = df[['entropy', 'file_type']]
kfold_train_and_evaluate(X1, y, "Dataset 1 (entropy + file_type)")

# Dataset 2: entropy + file_type + size
X2 = df[['entropy', 'file_type', 'size']]
kfold_train_and_evaluate(X2, y, "Dataset 2 (entropy + file_type + size)")

# Dataset 3: entropy + file_type + size + MAC times
X3 = df[['entropy', 'file_type', 'size', 'modified_time', 'access_time', 'created_time']]
kfold_train_and_evaluate(X3, y, "Dataset 3 (entropy + file_type + size + MAC times)")

✅ Dataset loaded and cleaned

🧪 K-Fold Evaluation for Dataset 1 (entropy + file_type):

📊 Model: KNN
Train Accuracy : 0.9964 | Test Accuracy : 0.9950 | Diff: 0.0013
Train Precision: 0.9983 | Test Precision: 0.9975 | Diff: 0.0007
Train Recall   : 0.9979 | Test Recall   : 0.9972 | Diff: 0.0007
Train F1 Score : 0.9981 | Test F1 Score : 0.9974 | Diff: 0.0007
Train AUC      : 0.9998 | Test AUC      : 0.9984 | Diff: 0.0014

📊 Model: LogisticRegression
Train Accuracy : 0.9422 | Test Accuracy : 0.9422 | Diff: -0.0000
Train Precision: 0.9422 | Test Precision: 0.9422 | Diff: -0.0000
Train Recall   : 1.0000 | Test Recall   : 1.0000 | Diff: 0.0000
Train F1 Score : 0.9703 | Test F1 Score : 0.9703 | Diff: 0.0000
Train AUC      : 0.9156 | Test AUC      : 0.9146 | Diff: 0.0010

📊 Model: DecisionTree
Train Accuracy : 0.9991 | Test Accuracy : 0.9944 | Diff: 0.0048
Train Precision: 1.0000 | Test Precision: 0.9973 | Diff: 0.0027
Train Recall   : 0.9991 | Test Recall   : 0.9968 | Diff: 0.0024
Train F1 Scor