<a href="https://colab.research.google.com/github/yms07/My-Project1/blob/main/RansomwareDetection_UsingAES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# =====================================
# STEP 0: Mount Google Drive
# =====================================
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# =====================================
# STEP 1: Setup (clone repos, unzip datasets)
# =====================================
import os
import shutil
import zipfile
import glob

# (1) DLL_SYS dataset unzip
!unzip "/content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset.zip" -d "/content/drive/MyDrive/ransomware_data/plaintext/"

# DLL move
os.makedirs('/content/drive/MyDrive/ransomware_data/plaintext/dll', exist_ok=True)
for f in os.listdir('/content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll'):
  shutil.move(f'/content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/{f}',
              '/content/drive/MyDrive/ransomware_data/plaintext/dll')

# SYS move
os.makedirs('/content/drive/MyDrive/ransomware_data/plaintext/sys', exist_ok=True)
for f in os.listdir('/content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/sys'):
  shutil.move(f'/content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/sys/{f}',
              '/content/drive/MyDrive/ransomware_data/plaintext/sys')

# (2) Clone C/C++ source repos
!rm -rf /content/C-Plus-Plus
!rm -rf /content/C
!git clone https://github.com/TheAlgorithms/C-Plus-Plus.git
!git clone https://github.com/TheAlgorithms/C.git

# (3) Copy C files
c_files = glob.glob('/content/C/**/*.c', recursive=True)
c_target = '/content/drive/MyDrive/ransomware_data/plaintext/c'
os.makedirs(c_target, exist_ok=True)
for f in c_files:
  shutil.copy(f, c_target)

# (4) Copy C++ files
cpp_files = glob.glob('/content/C-Plus-Plus/**/*.cpp', recursive=True)
cpp_target = '/content/drive/MyDrive/ransomware_data/plaintext/cpp'
os.makedirs(cpp_target, exist_ok=True)
for f in cpp_files:
  shutil.copy(f, cpp_target)

# (5) Extract GovDocs1
zip_folder = '/content/drive/MyDrive/ransomware_data/govdocs_zips'
extract_to = '/content/drive/MyDrive/ransomware_data/plaintext/govdocs_extracted'
os.makedirs(extract_to, exist_ok=True)
for zip_file in os.listdir(zip_folder):
  if zip_file.endswith(".zip"):
    with zipfile.ZipFile(os.path.join(zip_folder, zip_file), 'r') as zip_ref:
      zip_ref.extractall(extract_to)

print("✅ Setup done!")


Archive:  /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset.zip
   creating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/
   creating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/AarSvc.dll  
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/accessibilitycpl.dll  
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/AcGenral.dll  
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/AcLayers.dll  
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/acledit.dll  
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/aclui.dll  
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/dll/acppage.dll  
  inflating: /content/drive/MyDrive/ransomware_data/plaintext/dll_sys_dataset/d

In [17]:
# =====================================
# STEP 2: Feature Extraction (Plaintext)
# =====================================
import math
import pandas as pd
from datetime import datetime

# Entropy function
def calculate_entropy(file_path):
  try:
    with open(file_path, 'rb') as f:
      data = f.read()
    if not data:
      return 0.0
    freq_list = [0] * 256
    for b in data:
      freq_list[b] += 1
    entropy = 0.0
    for freq in freq_list:
      if freq > 0:
        p = freq / len(data)
        entropy -= p * math.log2(p)
    return round(entropy, 4)
  except:
    return 0.0

# File type map
FILE_TYPE_MAP = {
  'dll': 3, 'sys': 4, 'c': 13, 'cpp': 14, 'csv': 1, 'txt': 2,
  'doc': 5, 'docx': 6, 'pdf': 7, 'ppt': 8, 'pptx': 9, 'xls': 10,
  'xlsx': 11, 'html': 12, 'jpg': 15, 'zip': 16
}

# Extract features
def extract_features_from_folder(folder_path, label):
  data = []
  for root, dirs, files in os.walk(folder_path):
    for file in files:
      file_path = os.path.join(root, file)
      ext = file.split('.')[-1].lower()
      file_type = FILE_TYPE_MAP.get(ext, 0)
      try:
        entropy = calculate_entropy(file_path)
        size = os.path.getsize(file_path)
        stat = os.stat(file_path)
        m_time = datetime.fromtimestamp(stat.st_mtime).isoformat()
        a_time = datetime.fromtimestamp(stat.st_atime).isoformat()
        c_time = datetime.fromtimestamp(stat.st_ctime).isoformat()
        data.append({
          'file_name': file, 'entropy': entropy, 'size': size, 'file_type': file_type,
          'modified_time': m_time, 'access_time': a_time, 'created_time': c_time,
          'label': label
        })
      except Exception as e:
        continue
  return data

# Run extraction
dll_data = extract_features_from_folder('/content/drive/MyDrive/ransomware_data/plaintext/dll', 1)
sys_data = extract_features_from_folder('/content/drive/MyDrive/ransomware_data/plaintext/sys', 1)
c_data = extract_features_from_folder('/content/drive/MyDrive/ransomware_data/plaintext/c', 1)
cpp_data = extract_features_from_folder('/content/drive/MyDrive/ransomware_data/plaintext/cpp', 1)
govdocs_data = extract_features_from_folder('/content/drive/MyDrive/ransomware_data/plaintext/govdocs_extracted', 1)

all_plaintext = dll_data + sys_data + c_data + cpp_data + govdocs_data
df_plain = pd.DataFrame(all_plaintext)
df_plain.to_csv('/content/drive/MyDrive/ransomware_data/features_plaintext.csv', index=False)
print("✅ Plaintext features extracted!")


✅ Plaintext features extracted!


In [18]:
# PyCryptodome AES লাইব্রেরি
!pip install pycryptodome



In [19]:
# =====================================
# STEP 3: AES Encrypt (simulate ransomware infection)
# =====================================
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad
from tqdm import tqdm

# AES encrypt file
def aes_encrypt_file(input_path, output_path, key, iv):
  cipher = AES.new(key, AES.MODE_CBC, iv)
  with open(input_path, 'rb') as fin:
    data = fin.read()
  padded_data = pad(data, AES.block_size)
  enc = cipher.encrypt(padded_data)
  with open(output_path, 'wb') as fout:
    fout.write(enc)

# Encrypt folders
key = b'ThisIsASecretKey'  # 16 bytes
iv = b'ThisIsAnInitVect'   # 16 bytes

folders = ['dll', 'sys', 'c', 'cpp']
base_in = '/content/drive/MyDrive/ransomware_data/plaintext'
base_out = '/content/drive/MyDrive/ransomware_data/infected'
os.makedirs(base_out, exist_ok=True)

infected_meta = []

for folder in folders:
  in_dir = f"{base_in}/{folder}"
  out_dir = f"{base_out}/{folder}"
  os.makedirs(out_dir, exist_ok=True)
  files = os.listdir(in_dir)
  for f in tqdm(files, desc=f"Encrypting {folder}"):
    src = f"{in_dir}/{f}"
    dst = f"{out_dir}/{f}"
    aes_encrypt_file(src, dst, key, iv)
    # Feature extraction
    ext = f.split('.')[-1].lower()
    file_type = FILE_TYPE_MAP.get(ext, 0)
    stat = os.stat(dst)
    infected_meta.append({
      'file_name': f,
      'entropy': calculate_entropy(dst),
      'size': stat.st_size,
      'file_type': file_type,
      'modified_time': stat.st_mtime,
      'access_time': stat.st_atime,
      'created_time': stat.st_ctime,
      'label': 0
    })

df_infected = pd.DataFrame(infected_meta)
df_infected.to_csv('/content/drive/MyDrive/ransomware_data/features_infected.csv', index=False)
print("✅ Infected features extracted!")

Encrypting dll: 100%|██████████| 441/441 [00:35<00:00, 12.28it/s]
Encrypting sys: 100%|██████████| 441/441 [00:30<00:00, 14.40it/s]
Encrypting c: 100%|██████████| 371/371 [00:08<00:00, 43.96it/s]
Encrypting cpp: 100%|██████████| 356/356 [00:08<00:00, 42.88it/s]

✅ Infected features extracted!





In [20]:
# =====================================
# STEP 4: Merge & Preprocess
# =====================================
plain_df = pd.read_csv('/content/drive/MyDrive/ransomware_data/features_plaintext.csv')
infect_df = pd.read_csv('/content/drive/MyDrive/ransomware_data/features_infected.csv')
df = pd.concat([plain_df, infect_df])

# Convert timestamps
for col in ['modified_time', 'access_time', 'created_time']:
  df[col] = pd.to_datetime(df[col], errors='coerce').astype('int64') / 1e9

df.dropna(inplace=True)
df['size'] = df['size'] / 10_000_000
df.drop(columns=['file_name'], inplace=True)

print(df.head())


   entropy      size  file_type  modified_time   access_time  created_time  \
0   7.9143  0.000459          3   1.742418e+09  1.751096e+09  1.751097e+09   
1   7.8476  0.000217          3   1.742418e+09  1.751096e+09  1.751097e+09   
2   7.9913  0.005202          3   1.742418e+09  1.751096e+09  1.751097e+09   
3   7.9879  0.002797          3   1.742418e+09  1.751096e+09  1.751097e+09   
4   4.7114  0.000922          3   1.651901e+09  1.751096e+09  1.751097e+09   

   label  
0      1  
1      1  
2      1  
3      1  
4      1  


In [24]:
# ✅ Ransomware Detection Full Colab Notebook (Steps 4-6 + K-Fold CV)

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Paths
base_path = '/content/drive/MyDrive/ransomware_data'
plain_csv = f'{base_path}/features_plaintext.csv'
infect_csv = f'{base_path}/features_infected.csv'

# Load data
plain_df = pd.read_csv(plain_csv)
infect_df = pd.read_csv(infect_csv)

# Combine datasets
df = pd.concat([plain_df, infect_df], ignore_index=True)

# Convert timestamps to numeric
for col in ['modified_time', 'access_time', 'created_time']:
    df[col] = pd.to_datetime(df[col], errors='coerce')
    df[col] = df[col].astype('int64') / 1e9

df.dropna(inplace=True)
print("✅ Dataset loaded and cleaned")

# All models
models = {
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'SVM': SVC(probability=True),
    'MLP': MLPClassifier(max_iter=1000)
}

def kfold_train_and_evaluate(X, y, label):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    print(f"\n🧪 K-Fold Evaluation for {label}:")

    for name, model in models.items():
        acc_list, prec_list, rec_list, f1_list, auc_list = [], [], [], [], []

        for train_idx, test_idx in skf.split(X_scaled, y):
            X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            acc_list.append(accuracy_score(y_test, y_pred))
            prec_list.append(precision_score(y_test, y_pred))
            rec_list.append(recall_score(y_test, y_pred))
            f1_list.append(f1_score(y_test, y_pred))
            if y_prob is not None:
                auc_list.append(roc_auc_score(y_test, y_prob))

        avg_acc = sum(acc_list) / len(acc_list)
        avg_prec = sum(prec_list) / len(prec_list)
        avg_rec = sum(rec_list) / len(rec_list)
        avg_f1 = sum(f1_list) / len(f1_list)
        avg_auc = sum(auc_list) / len(auc_list) if auc_list else None

        print(f"\n📊 Model: {name}")
        print(f"Accuracy : {avg_acc:.4f}")
        print(f"Precision: {avg_prec:.4f}")
        print(f"Recall   : {avg_rec:.4f}")
        print(f"F1 Score : {avg_f1:.4f}")
        print(f"AUC      : {avg_auc:.4f}" if avg_auc else "AUC      : N/A")

# ===============================
# Run for all 3 feature combinations
# ===============================

y = df['label']

# Dataset 1: entropy + file_type
X1 = df[['entropy', 'file_type']]
kfold_train_and_evaluate(X1, y, "Dataset 1 (entropy + file_type)")

# Dataset 2: entropy + file_type + size
X2 = df[['entropy', 'file_type', 'size']]
kfold_train_and_evaluate(X2, y, "Dataset 2 (entropy + file_type + size)")

# Dataset 3: entropy + file_type + size + MAC times
X3 = df[['entropy', 'file_type', 'size', 'modified_time', 'access_time', 'created_time']]
kfold_train_and_evaluate(X3, y, "Dataset 3 (entropy + file_type + size + MAC times)")

✅ Dataset loaded and cleaned

🧪 K-Fold Evaluation for Dataset 1 (entropy + file_type):

📊 Model: KNN
Accuracy : 0.9950
Precision: 0.9975
Recall   : 0.9972
F1 Score : 0.9974
AUC      : 0.9984

📊 Model: LogisticRegression
Accuracy : 0.9422
Precision: 0.9422
Recall   : 1.0000
F1 Score : 0.9703
AUC      : 0.9146

📊 Model: DecisionTree
Accuracy : 0.9944
Precision: 0.9973
Recall   : 0.9968
F1 Score : 0.9970
AUC      : 0.9763

📊 Model: RandomForest
Accuracy : 0.9945
Precision: 0.9973
Recall   : 0.9969
F1 Score : 0.9971
AUC      : 0.9987

📊 Model: GradientBoosting
Accuracy : 0.9952
Precision: 0.9977
Recall   : 0.9971
F1 Score : 0.9974
AUC      : 0.9995

📊 Model: SVM
Accuracy : 0.9926
Precision: 0.9984
Recall   : 0.9938
F1 Score : 0.9961
AUC      : 0.9966

📊 Model: MLP
Accuracy : 0.9937
Precision: 0.9982
Recall   : 0.9951
F1 Score : 0.9967
AUC      : 0.9994

🧪 K-Fold Evaluation for Dataset 2 (entropy + file_type + size):

📊 Model: KNN
Accuracy : 0.9961
Precision: 0.9980
Recall   : 0.9978
F1 Sco