In [None]:
'''Bernoulli Naive Bayes Classifier for Benign Class Only'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB # Importing Bernoulli Naive Bayes
import pandas as pd
import numpy as np

# Helpers live in this cell so that it can be run on its own.
def format_split_ratio(test_ratio):
    """Return the ``"test:train"`` percentage label for a fractional test ratio.

    The percentages are rounded, not truncated: in binary floating point
    ``(1 - 0.8) * 100`` is slightly below 20, so ``int()`` would produce
    ``80:19`` (and ``90:9`` for 0.9) instead of ``80:20`` / ``90:10``.

    Args:
        test_ratio: Fraction in ``[0, 1]`` reported as the test share.

    Returns:
        A string such as ``"10:90"``; the two parts always sum to 100.
    """
    test_pct = round(test_ratio * 100)
    return f"{test_pct}:{100 - test_pct}"


def confusion_counts(y_true, y_pred, label):
    """Return one-vs-rest confusion counts for ``label``.

    Args:
        y_true: 1-D sequence of actual class labels.
        y_pred: 1-D sequence of predicted class labels, aligned with ``y_true``.
        label: The class treated as the positive class.

    Returns:
        A dict with integer entries ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.
    """
    actual = np.asarray(y_true) == label
    predicted = np.asarray(y_pred) == label
    return {
        "TP": int(np.count_nonzero(actual & predicted)),
        "TN": int(np.count_nonzero(~actual & ~predicted)),
        "FP": int(np.count_nonzero(~actual & predicted)),
        "FN": int(np.count_nonzero(actual & ~predicted)),
    }


# The guard is true both in a notebook cell and when run as a script; it only
# keeps the experiment from running when this code is imported as a module.
if __name__ == "__main__":
    # Load Dataset: every column but the last is a feature, the last is the label.
    df = pd.read_csv("/content/Dataset.txt", header=None)
    data = df.values
    x = data[:, :-1].astype(float)  # Convert features to numeric values
    y = data[:, -1].astype(str)

    # Class labels and their corresponding original sample counts.
    # Only the keys are used below; the counts are kept for reference.
    class_labels = {
        "Benign": 438,
        "Reveton": 948,
        "Cerber": 897,
        "teslacrypt": 914,
        "Locky": 944,
        "Yakes": 925,
    }

    # The single family this cell trains on.
    target_class = "Benign"

    # Number of records for testing (fixed for every ratio)
    num_testing_records = 200

    # Iterate over different test ratios
    for test_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # Training size shrinks as the ratio grows; the test size stays fixed.
        num_training_records = int((1 - test_ratio) * len(y)) - num_testing_records

        # NOTE(review): with a fixed random_state and test_size the test split
        # looks identical for every ratio; only the training size varies.
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            train_size=num_training_records,
            test_size=num_testing_records,
            random_state=42,
        )

        # Keep only the target family for training
        train_mask = y_train == target_class
        x_train_benign = x_train[train_mask]
        y_train_benign = y_train[train_mask]

        # Fitting on an empty array raises, so skip ratios without target records.
        if len(y_train_benign) == 0:
            print(f"Test Ratio: {format_split_ratio(test_ratio)}")
            print(f"No '{target_class}' records in the training split; skipping.")
            continue

        # NOTE(review): the model is fitted on a single class, so predict() can
        # only return that class and overall accuracy equals the class's share
        # of the test set. If one-vs-rest detection was intended, train on all
        # records with binarized labels instead - TODO confirm.
        bnb_classifier = BernoulliNB()
        bnb_classifier.fit(x_train_benign, y_train_benign)
        y_pred = bnb_classifier.predict(x_test)

        # TP, TN, FP, FN for each class label
        results = {label: confusion_counts(y_test, y_pred, label) for label in class_labels}

        # Number of records of each family used for testing
        family_records_testing = {
            label: int(np.count_nonzero(y_test == label)) for label in class_labels
        }

        # Overall accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Accuracy restricted to test records whose true label is the target class
        benign_indices = np.where(y_test == target_class)[0]
        if len(benign_indices) > 0:
            accuracy1 = np.sum(y_pred[benign_indices] == target_class) / len(benign_indices)
        else:
            accuracy1 = 0

        # Print test ratio and number of records of each family used for testing
        print(f"Test Ratio: {format_split_ratio(test_ratio)}")
        print("Number of Records of Each Family Used for Testing:")
        for label, count in family_records_testing.items():
            print(label + ":", count)

        # Print accuracies and TN, TP, FP, FN for each class label
        print("Overall Accuracy:", accuracy)
        print(f"Accuracy for '{target_class}' class only:", accuracy1)
        print("Number of Testing Records:", num_testing_records)
        for label, metrics in results.items():
            print(label + ":")
            print("\tTrue Positives:", metrics["TP"])
            print("\tTrue Negatives:", metrics["TN"])
            print("\tFalse Positives:", metrics["FP"])
            print("\tFalse Negatives:", metrics["FN"])


Test Ratio: 10:90
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.075
Accuracy for 'Benign' class only: 1.0
Number of Testing Records: 200
Benign:
	True Positives: 15
	True Negatives: 0
	False Positives: 185
	False Negatives: 0
Reveton:
	True Positives: 0
	True Negatives: 156
	False Positives: 0
	False Negatives: 44
Cerber:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
teslacrypt:
	True Positives: 0
	True Negatives: 169
	False Positives: 0
	False Negatives: 31
Locky:
	True Positives: 0
	True Negatives: 162
	False Positives: 0
	False Negatives: 38
Yakes:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
Test Ratio: 20:80
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.075
Accuracy for 'Benign' class only: 1.0
Number of Testing Records: 200
B

In [None]:
'''Bernoulli Naive Bayes Classifier for Cerber Class Only'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB # Importing Bernoulli Naive Bayes
import pandas as pd
import numpy as np

# Helpers live in this cell so that it can be run on its own.
def format_split_ratio(test_ratio):
    """Return the ``"test:train"`` percentage label for a fractional test ratio.

    The percentages are rounded, not truncated: in binary floating point
    ``(1 - 0.8) * 100`` is slightly below 20, so ``int()`` would produce
    ``80:19`` (and ``90:9`` for 0.9) instead of ``80:20`` / ``90:10``.

    Args:
        test_ratio: Fraction in ``[0, 1]`` reported as the test share.

    Returns:
        A string such as ``"10:90"``; the two parts always sum to 100.
    """
    test_pct = round(test_ratio * 100)
    return f"{test_pct}:{100 - test_pct}"


def confusion_counts(y_true, y_pred, label):
    """Return one-vs-rest confusion counts for ``label``.

    Args:
        y_true: 1-D sequence of actual class labels.
        y_pred: 1-D sequence of predicted class labels, aligned with ``y_true``.
        label: The class treated as the positive class.

    Returns:
        A dict with integer entries ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.
    """
    actual = np.asarray(y_true) == label
    predicted = np.asarray(y_pred) == label
    return {
        "TP": int(np.count_nonzero(actual & predicted)),
        "TN": int(np.count_nonzero(~actual & ~predicted)),
        "FP": int(np.count_nonzero(~actual & predicted)),
        "FN": int(np.count_nonzero(actual & ~predicted)),
    }


# The guard is true both in a notebook cell and when run as a script; it only
# keeps the experiment from running when this code is imported as a module.
if __name__ == "__main__":
    # Load Dataset: every column but the last is a feature, the last is the label.
    df = pd.read_csv("/content/Dataset.txt", header=None)
    data = df.values
    x = data[:, :-1].astype(float)  # Convert features to numeric values
    y = data[:, -1].astype(str)

    # Class labels and their corresponding original sample counts.
    # Only the keys are used below; the counts are kept for reference.
    class_labels = {
        "Benign": 438,
        "Reveton": 948,
        "Cerber": 897,
        "teslacrypt": 914,
        "Locky": 944,
        "Yakes": 925,
    }

    # The single family this cell trains on.
    target_class = "Cerber"

    # Number of records for testing (fixed for every ratio)
    num_testing_records = 200

    # Iterate over different test ratios
    for test_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # Training size shrinks as the ratio grows; the test size stays fixed.
        num_training_records = int((1 - test_ratio) * len(y)) - num_testing_records

        # NOTE(review): with a fixed random_state and test_size the test split
        # looks identical for every ratio; only the training size varies.
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            train_size=num_training_records,
            test_size=num_testing_records,
            random_state=42,
        )

        # Keep only the target family for training
        train_mask = y_train == target_class
        x_train_cerber = x_train[train_mask]
        y_train_cerber = y_train[train_mask]

        # Fitting on an empty array raises, so skip ratios without target records.
        if len(y_train_cerber) == 0:
            print(f"Test Ratio: {format_split_ratio(test_ratio)}")
            print(f"No '{target_class}' records in the training split; skipping.")
            continue

        # NOTE(review): the model is fitted on a single class, so predict() can
        # only return that class and overall accuracy equals the class's share
        # of the test set. If one-vs-rest detection was intended, train on all
        # records with binarized labels instead - TODO confirm.
        bnb_classifier = BernoulliNB()
        bnb_classifier.fit(x_train_cerber, y_train_cerber)
        y_pred = bnb_classifier.predict(x_test)

        # TP, TN, FP, FN for each class label
        results = {label: confusion_counts(y_test, y_pred, label) for label in class_labels}

        # Number of records of each family used for testing
        family_records_testing = {
            label: int(np.count_nonzero(y_test == label)) for label in class_labels
        }

        # Overall accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Accuracy restricted to test records whose true label is the target class
        cerber_indices = np.where(y_test == target_class)[0]
        if len(cerber_indices) > 0:
            accuracy1 = np.sum(y_pred[cerber_indices] == target_class) / len(cerber_indices)
        else:
            accuracy1 = 0

        # Print test ratio and number of records of each family used for testing
        print(f"Test Ratio: {format_split_ratio(test_ratio)}")
        print("Number of Records of Each Family Used for Testing:")
        for label, count in family_records_testing.items():
            print(label + ":", count)

        # Print accuracies and TN, TP, FP, FN for each class label
        print("Overall Accuracy:", accuracy)
        print(f"Accuracy for '{target_class}' class only:", accuracy1)
        print("Number of Testing Records:", num_testing_records)
        for label, metrics in results.items():
            print(label + ":")
            print("\tTrue Positives:", metrics["TP"])
            print("\tTrue Negatives:", metrics["TN"])
            print("\tFalse Positives:", metrics["FP"])
            print("\tFalse Negatives:", metrics["FN"])


Test Ratio: 10:90
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.18
Accuracy for 'Cerber' class only: 1.0
Number of Testing Records: 200
Benign:
	True Positives: 0
	True Negatives: 185
	False Positives: 0
	False Negatives: 15
Reveton:
	True Positives: 0
	True Negatives: 156
	False Positives: 0
	False Negatives: 44
Cerber:
	True Positives: 36
	True Negatives: 0
	False Positives: 164
	False Negatives: 0
teslacrypt:
	True Positives: 0
	True Negatives: 169
	False Positives: 0
	False Negatives: 31
Locky:
	True Positives: 0
	True Negatives: 162
	False Positives: 0
	False Negatives: 38
Yakes:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
Test Ratio: 20:80
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.18
Accuracy for 'Cerber' class only: 1.0
Number of Testing Records: 200
Ben

In [None]:
'''Bernoulli Naive Bayes Classifier for Locky Class Only'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB # Importing Bernoulli Naive Bayes
import pandas as pd
import numpy as np

# Helpers live in this cell so that it can be run on its own.
def format_split_ratio(test_ratio):
    """Return the ``"test:train"`` percentage label for a fractional test ratio.

    The percentages are rounded, not truncated: in binary floating point
    ``(1 - 0.8) * 100`` is slightly below 20, so ``int()`` would produce
    ``80:19`` (and ``90:9`` for 0.9) instead of ``80:20`` / ``90:10``.

    Args:
        test_ratio: Fraction in ``[0, 1]`` reported as the test share.

    Returns:
        A string such as ``"10:90"``; the two parts always sum to 100.
    """
    test_pct = round(test_ratio * 100)
    return f"{test_pct}:{100 - test_pct}"


def confusion_counts(y_true, y_pred, label):
    """Return one-vs-rest confusion counts for ``label``.

    Args:
        y_true: 1-D sequence of actual class labels.
        y_pred: 1-D sequence of predicted class labels, aligned with ``y_true``.
        label: The class treated as the positive class.

    Returns:
        A dict with integer entries ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.
    """
    actual = np.asarray(y_true) == label
    predicted = np.asarray(y_pred) == label
    return {
        "TP": int(np.count_nonzero(actual & predicted)),
        "TN": int(np.count_nonzero(~actual & ~predicted)),
        "FP": int(np.count_nonzero(~actual & predicted)),
        "FN": int(np.count_nonzero(actual & ~predicted)),
    }


# The guard is true both in a notebook cell and when run as a script; it only
# keeps the experiment from running when this code is imported as a module.
if __name__ == "__main__":
    # Load Dataset: every column but the last is a feature, the last is the label.
    df = pd.read_csv("/content/Dataset.txt", header=None)
    data = df.values
    x = data[:, :-1].astype(float)  # Convert features to numeric values
    y = data[:, -1].astype(str)

    # Class labels and their corresponding original sample counts.
    # Only the keys are used below; the counts are kept for reference.
    class_labels = {
        "Benign": 438,
        "Reveton": 948,
        "Cerber": 897,
        "teslacrypt": 914,
        "Locky": 944,
        "Yakes": 925,
    }

    # The single family this cell trains on.
    target_class = "Locky"

    # Number of records for testing (fixed for every ratio)
    num_testing_records = 200

    # Iterate over different test ratios
    for test_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # Training size shrinks as the ratio grows; the test size stays fixed.
        num_training_records = int((1 - test_ratio) * len(y)) - num_testing_records

        # NOTE(review): with a fixed random_state and test_size the test split
        # looks identical for every ratio; only the training size varies.
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            train_size=num_training_records,
            test_size=num_testing_records,
            random_state=42,
        )

        # Keep only the target family for training
        train_mask = y_train == target_class
        x_train_locky = x_train[train_mask]
        y_train_locky = y_train[train_mask]

        # Fitting on an empty array raises, so skip ratios without target records.
        if len(y_train_locky) == 0:
            print(f"Test Ratio: {format_split_ratio(test_ratio)}")
            print(f"No '{target_class}' records in the training split; skipping.")
            continue

        # NOTE(review): the model is fitted on a single class, so predict() can
        # only return that class and overall accuracy equals the class's share
        # of the test set. If one-vs-rest detection was intended, train on all
        # records with binarized labels instead - TODO confirm.
        bnb_classifier = BernoulliNB()
        bnb_classifier.fit(x_train_locky, y_train_locky)
        y_pred = bnb_classifier.predict(x_test)

        # TP, TN, FP, FN for each class label
        results = {label: confusion_counts(y_test, y_pred, label) for label in class_labels}

        # Number of records of each family used for testing
        family_records_testing = {
            label: int(np.count_nonzero(y_test == label)) for label in class_labels
        }

        # Overall accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Accuracy restricted to test records whose true label is the target class
        locky_indices = np.where(y_test == target_class)[0]
        if len(locky_indices) > 0:
            accuracy1 = np.sum(y_pred[locky_indices] == target_class) / len(locky_indices)
        else:
            accuracy1 = 0

        # Print test ratio and number of records of each family used for testing
        print(f"Test Ratio: {format_split_ratio(test_ratio)}")
        print("Number of Records of Each Family Used for Testing:")
        for label, count in family_records_testing.items():
            print(label + ":", count)

        # Print accuracies and TN, TP, FP, FN for each class label
        print("Overall Accuracy:", accuracy)
        print(f"Accuracy for '{target_class}' class only:", accuracy1)
        print("Number of Testing Records:", num_testing_records)
        for label, metrics in results.items():
            print(label + ":")
            print("\tTrue Positives:", metrics["TP"])
            print("\tTrue Negatives:", metrics["TN"])
            print("\tFalse Positives:", metrics["FP"])
            print("\tFalse Negatives:", metrics["FN"])


Test Ratio: 10:90
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.19
Accuracy for 'Locky' class only: 1.0
Number of Testing Records: 200
Benign:
	True Positives: 0
	True Negatives: 185
	False Positives: 0
	False Negatives: 15
Reveton:
	True Positives: 0
	True Negatives: 156
	False Positives: 0
	False Negatives: 44
Cerber:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
teslacrypt:
	True Positives: 0
	True Negatives: 169
	False Positives: 0
	False Negatives: 31
Locky:
	True Positives: 38
	True Negatives: 0
	False Positives: 162
	False Negatives: 0
Yakes:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
Test Ratio: 20:80
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.19
Accuracy for 'Locky' class only: 1.0
Number of Testing Records: 200
Benig

In [None]:
'''Bernoulli Naive Bayes Classifier for Reveton Class Only'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB # Importing Bernoulli Naive Bayes
import pandas as pd
import numpy as np

# Helpers live in this cell so that it can be run on its own.
def format_split_ratio(test_ratio):
    """Return the ``"test:train"`` percentage label for a fractional test ratio.

    The percentages are rounded, not truncated: in binary floating point
    ``(1 - 0.8) * 100`` is slightly below 20, so ``int()`` would produce
    ``80:19`` (and ``90:9`` for 0.9) instead of ``80:20`` / ``90:10``.

    Args:
        test_ratio: Fraction in ``[0, 1]`` reported as the test share.

    Returns:
        A string such as ``"10:90"``; the two parts always sum to 100.
    """
    test_pct = round(test_ratio * 100)
    return f"{test_pct}:{100 - test_pct}"


def confusion_counts(y_true, y_pred, label):
    """Return one-vs-rest confusion counts for ``label``.

    Args:
        y_true: 1-D sequence of actual class labels.
        y_pred: 1-D sequence of predicted class labels, aligned with ``y_true``.
        label: The class treated as the positive class.

    Returns:
        A dict with integer entries ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.
    """
    actual = np.asarray(y_true) == label
    predicted = np.asarray(y_pred) == label
    return {
        "TP": int(np.count_nonzero(actual & predicted)),
        "TN": int(np.count_nonzero(~actual & ~predicted)),
        "FP": int(np.count_nonzero(~actual & predicted)),
        "FN": int(np.count_nonzero(actual & ~predicted)),
    }


# The guard is true both in a notebook cell and when run as a script; it only
# keeps the experiment from running when this code is imported as a module.
if __name__ == "__main__":
    # Load Dataset: every column but the last is a feature, the last is the label.
    df = pd.read_csv("/content/Dataset.txt", header=None)
    data = df.values
    x = data[:, :-1].astype(float)  # Convert features to numeric values
    y = data[:, -1].astype(str)

    # Class labels and their corresponding original sample counts.
    # Only the keys are used below; the counts are kept for reference.
    class_labels = {
        "Benign": 438,
        "Reveton": 948,
        "Cerber": 897,
        "teslacrypt": 914,
        "Locky": 944,
        "Yakes": 925,
    }

    # The single family this cell trains on.
    target_class = "Reveton"

    # Number of records for testing (fixed for every ratio)
    num_testing_records = 200

    # Iterate over different test ratios
    for test_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # Training size shrinks as the ratio grows; the test size stays fixed.
        num_training_records = int((1 - test_ratio) * len(y)) - num_testing_records

        # NOTE(review): with a fixed random_state and test_size the test split
        # looks identical for every ratio; only the training size varies.
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            train_size=num_training_records,
            test_size=num_testing_records,
            random_state=42,
        )

        # Keep only the target family for training
        train_mask = y_train == target_class
        x_train_reveton = x_train[train_mask]
        y_train_reveton = y_train[train_mask]

        # Fitting on an empty array raises, so skip ratios without target records.
        if len(y_train_reveton) == 0:
            print(f"Test Ratio: {format_split_ratio(test_ratio)}")
            print(f"No '{target_class}' records in the training split; skipping.")
            continue

        # NOTE(review): the model is fitted on a single class, so predict() can
        # only return that class and overall accuracy equals the class's share
        # of the test set. If one-vs-rest detection was intended, train on all
        # records with binarized labels instead - TODO confirm.
        bnb_classifier = BernoulliNB()
        bnb_classifier.fit(x_train_reveton, y_train_reveton)
        y_pred = bnb_classifier.predict(x_test)

        # TP, TN, FP, FN for each class label
        results = {label: confusion_counts(y_test, y_pred, label) for label in class_labels}

        # Number of records of each family used for testing
        family_records_testing = {
            label: int(np.count_nonzero(y_test == label)) for label in class_labels
        }

        # Overall accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Accuracy restricted to test records whose true label is the target class
        reveton_indices = np.where(y_test == target_class)[0]
        if len(reveton_indices) > 0:
            accuracy1 = np.sum(y_pred[reveton_indices] == target_class) / len(reveton_indices)
        else:
            accuracy1 = 0

        # Print test ratio and number of records of each family used for testing
        print(f"Test Ratio: {format_split_ratio(test_ratio)}")
        print("Number of Records of Each Family Used for Testing:")
        for label, count in family_records_testing.items():
            print(label + ":", count)

        # Print accuracies and TN, TP, FP, FN for each class label
        print("Overall Accuracy:", accuracy)
        print(f"Accuracy for '{target_class}' class only:", accuracy1)
        print("Number of Testing Records:", num_testing_records)
        for label, metrics in results.items():
            print(label + ":")
            print("\tTrue Positives:", metrics["TP"])
            print("\tTrue Negatives:", metrics["TN"])
            print("\tFalse Positives:", metrics["FP"])
            print("\tFalse Negatives:", metrics["FN"])


Test Ratio: 10:90
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.22
Accuracy for 'Reveton' class only: 1.0
Number of Testing Records: 200
Benign:
	True Positives: 0
	True Negatives: 185
	False Positives: 0
	False Negatives: 15
Reveton:
	True Positives: 44
	True Negatives: 0
	False Positives: 156
	False Negatives: 0
Cerber:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
teslacrypt:
	True Positives: 0
	True Negatives: 169
	False Positives: 0
	False Negatives: 31
Locky:
	True Positives: 0
	True Negatives: 162
	False Positives: 0
	False Negatives: 38
Yakes:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
Test Ratio: 20:80
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.22
Accuracy for 'Reveton' class only: 1.0
Number of Testing Records: 200
B

In [None]:
'''Bernoulli Naive Bayes Classifier for teslacrypt Class Only'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB # Importing Bernoulli Naive Bayes
import pandas as pd
import numpy as np

# Helpers live in this cell so that it can be run on its own.
def format_split_ratio(test_ratio):
    """Return the ``"test:train"`` percentage label for a fractional test ratio.

    The percentages are rounded, not truncated: in binary floating point
    ``(1 - 0.8) * 100`` is slightly below 20, so ``int()`` would produce
    ``80:19`` (and ``90:9`` for 0.9) instead of ``80:20`` / ``90:10``.

    Args:
        test_ratio: Fraction in ``[0, 1]`` reported as the test share.

    Returns:
        A string such as ``"10:90"``; the two parts always sum to 100.
    """
    test_pct = round(test_ratio * 100)
    return f"{test_pct}:{100 - test_pct}"


def confusion_counts(y_true, y_pred, label):
    """Return one-vs-rest confusion counts for ``label``.

    Args:
        y_true: 1-D sequence of actual class labels.
        y_pred: 1-D sequence of predicted class labels, aligned with ``y_true``.
        label: The class treated as the positive class.

    Returns:
        A dict with integer entries ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.
    """
    actual = np.asarray(y_true) == label
    predicted = np.asarray(y_pred) == label
    return {
        "TP": int(np.count_nonzero(actual & predicted)),
        "TN": int(np.count_nonzero(~actual & ~predicted)),
        "FP": int(np.count_nonzero(~actual & predicted)),
        "FN": int(np.count_nonzero(actual & ~predicted)),
    }


# The guard is true both in a notebook cell and when run as a script; it only
# keeps the experiment from running when this code is imported as a module.
if __name__ == "__main__":
    # Load Dataset: every column but the last is a feature, the last is the label.
    df = pd.read_csv("/content/Dataset.txt", header=None)
    data = df.values
    x = data[:, :-1].astype(float)  # Convert features to numeric values
    y = data[:, -1].astype(str)

    # Class labels and their corresponding original sample counts.
    # Only the keys are used below; the counts are kept for reference.
    class_labels = {
        "Benign": 438,
        "Reveton": 948,
        "Cerber": 897,
        "teslacrypt": 914,
        "Locky": 944,
        "Yakes": 925,
    }

    # The single family this cell trains on (lower-case, as in class_labels).
    target_class = "teslacrypt"

    # Number of records for testing (fixed for every ratio)
    num_testing_records = 200

    # Iterate over different test ratios
    for test_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # Training size shrinks as the ratio grows; the test size stays fixed.
        num_training_records = int((1 - test_ratio) * len(y)) - num_testing_records

        # NOTE(review): with a fixed random_state and test_size the test split
        # looks identical for every ratio; only the training size varies.
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            train_size=num_training_records,
            test_size=num_testing_records,
            random_state=42,
        )

        # Keep only the target family for training
        train_mask = y_train == target_class
        x_train_teslacrypt = x_train[train_mask]
        y_train_teslacrypt = y_train[train_mask]

        # Fitting on an empty array raises, so skip ratios without target records.
        if len(y_train_teslacrypt) == 0:
            print(f"Test Ratio: {format_split_ratio(test_ratio)}")
            print(f"No '{target_class}' records in the training split; skipping.")
            continue

        # NOTE(review): the model is fitted on a single class, so predict() can
        # only return that class and overall accuracy equals the class's share
        # of the test set. If one-vs-rest detection was intended, train on all
        # records with binarized labels instead - TODO confirm.
        bnb_classifier = BernoulliNB()
        bnb_classifier.fit(x_train_teslacrypt, y_train_teslacrypt)
        y_pred = bnb_classifier.predict(x_test)

        # TP, TN, FP, FN for each class label
        results = {label: confusion_counts(y_test, y_pred, label) for label in class_labels}

        # Number of records of each family used for testing
        family_records_testing = {
            label: int(np.count_nonzero(y_test == label)) for label in class_labels
        }

        # Overall accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Accuracy restricted to test records whose true label is the target class
        teslacrypt_indices = np.where(y_test == target_class)[0]
        if len(teslacrypt_indices) > 0:
            accuracy1 = np.sum(y_pred[teslacrypt_indices] == target_class) / len(teslacrypt_indices)
        else:
            accuracy1 = 0

        # Print test ratio and number of records of each family used for testing
        print(f"Test Ratio: {format_split_ratio(test_ratio)}")
        print("Number of Records of Each Family Used for Testing:")
        for label, count in family_records_testing.items():
            print(label + ":", count)

        # Print accuracies and TN, TP, FP, FN for each class label
        print("Overall Accuracy:", accuracy)
        print(f"Accuracy for '{target_class}' class only:", accuracy1)
        print("Number of Testing Records:", num_testing_records)
        for label, metrics in results.items():
            print(label + ":")
            print("\tTrue Positives:", metrics["TP"])
            print("\tTrue Negatives:", metrics["TN"])
            print("\tFalse Positives:", metrics["FP"])
            print("\tFalse Negatives:", metrics["FN"])


Test Ratio: 10:90
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.155
Accuracy for 'teslacrypt' class only: 1.0
Number of Testing Records: 200
Benign:
	True Positives: 0
	True Negatives: 185
	False Positives: 0
	False Negatives: 15
Reveton:
	True Positives: 0
	True Negatives: 156
	False Positives: 0
	False Negatives: 44
Cerber:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
teslacrypt:
	True Positives: 31
	True Negatives: 0
	False Positives: 169
	False Negatives: 0
Locky:
	True Positives: 0
	True Negatives: 162
	False Positives: 0
	False Negatives: 38
Yakes:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
Test Ratio: 20:80
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.155
Accuracy for 'teslacrypt' class only: 1.0
Number of Testing Record

In [None]:
'''Bernoulli Naive Bayes Classifier for Yakes Class Only'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB # Importing Bernoulli Naive Bayes
import pandas as pd
import numpy as np

# Helpers live in this cell so that it can be run on its own.
def format_split_ratio(test_ratio):
    """Return the ``"test:train"`` percentage label for a fractional test ratio.

    The percentages are rounded, not truncated: in binary floating point
    ``(1 - 0.8) * 100`` is slightly below 20, so ``int()`` would produce
    ``80:19`` (and ``90:9`` for 0.9) instead of ``80:20`` / ``90:10``.

    Args:
        test_ratio: Fraction in ``[0, 1]`` reported as the test share.

    Returns:
        A string such as ``"10:90"``; the two parts always sum to 100.
    """
    test_pct = round(test_ratio * 100)
    return f"{test_pct}:{100 - test_pct}"


def confusion_counts(y_true, y_pred, label):
    """Return one-vs-rest confusion counts for ``label``.

    Args:
        y_true: 1-D sequence of actual class labels.
        y_pred: 1-D sequence of predicted class labels, aligned with ``y_true``.
        label: The class treated as the positive class.

    Returns:
        A dict with integer entries ``"TP"``, ``"TN"``, ``"FP"`` and ``"FN"``.
    """
    actual = np.asarray(y_true) == label
    predicted = np.asarray(y_pred) == label
    return {
        "TP": int(np.count_nonzero(actual & predicted)),
        "TN": int(np.count_nonzero(~actual & ~predicted)),
        "FP": int(np.count_nonzero(~actual & predicted)),
        "FN": int(np.count_nonzero(actual & ~predicted)),
    }


# The guard is true both in a notebook cell and when run as a script; it only
# keeps the experiment from running when this code is imported as a module.
if __name__ == "__main__":
    # Load Dataset: every column but the last is a feature, the last is the label.
    df = pd.read_csv("/content/Dataset.txt", header=None)
    data = df.values
    x = data[:, :-1].astype(float)  # Convert features to numeric values
    y = data[:, -1].astype(str)

    # Class labels and their corresponding original sample counts.
    # Only the keys are used below; the counts are kept for reference.
    class_labels = {
        "Benign": 438,
        "Reveton": 948,
        "Cerber": 897,
        "teslacrypt": 914,
        "Locky": 944,
        "Yakes": 925,
    }

    # The single family this cell trains on.
    target_class = "Yakes"

    # Number of records for testing (fixed for every ratio)
    num_testing_records = 200

    # Iterate over different test ratios
    for test_ratio in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        # Training size shrinks as the ratio grows; the test size stays fixed.
        num_training_records = int((1 - test_ratio) * len(y)) - num_testing_records

        # NOTE(review): with a fixed random_state and test_size the test split
        # looks identical for every ratio; only the training size varies.
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            train_size=num_training_records,
            test_size=num_testing_records,
            random_state=42,
        )

        # Keep only the target family for training
        train_mask = y_train == target_class
        x_train_yakes = x_train[train_mask]
        y_train_yakes = y_train[train_mask]

        # Fitting on an empty array raises, so skip ratios without target records.
        if len(y_train_yakes) == 0:
            print(f"Test Ratio: {format_split_ratio(test_ratio)}")
            print(f"No '{target_class}' records in the training split; skipping.")
            continue

        # NOTE(review): the model is fitted on a single class, so predict() can
        # only return that class and overall accuracy equals the class's share
        # of the test set. If one-vs-rest detection was intended, train on all
        # records with binarized labels instead - TODO confirm.
        bnb_classifier = BernoulliNB()
        bnb_classifier.fit(x_train_yakes, y_train_yakes)
        y_pred = bnb_classifier.predict(x_test)

        # TP, TN, FP, FN for each class label
        results = {label: confusion_counts(y_test, y_pred, label) for label in class_labels}

        # Number of records of each family used for testing
        family_records_testing = {
            label: int(np.count_nonzero(y_test == label)) for label in class_labels
        }

        # Overall accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Accuracy restricted to test records whose true label is the target class
        yakes_indices = np.where(y_test == target_class)[0]
        if len(yakes_indices) > 0:
            accuracy1 = np.sum(y_pred[yakes_indices] == target_class) / len(yakes_indices)
        else:
            accuracy1 = 0

        # Print test ratio and number of records of each family used for testing
        print(f"Test Ratio: {format_split_ratio(test_ratio)}")
        print("Number of Records of Each Family Used for Testing:")
        for label, count in family_records_testing.items():
            print(label + ":", count)

        # Print accuracies and TN, TP, FP, FN for each class label
        print("Overall Accuracy:", accuracy)
        print(f"Accuracy for '{target_class}' class only:", accuracy1)
        print("Number of Testing Records:", num_testing_records)
        for label, metrics in results.items():
            print(label + ":")
            print("\tTrue Positives:", metrics["TP"])
            print("\tTrue Negatives:", metrics["TN"])
            print("\tFalse Positives:", metrics["FP"])
            print("\tFalse Negatives:", metrics["FN"])


Test Ratio: 10:90
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.18
Accuracy for 'Yakes' class only: 1.0
Number of Testing Records: 200
Benign:
	True Positives: 0
	True Negatives: 185
	False Positives: 0
	False Negatives: 15
Reveton:
	True Positives: 0
	True Negatives: 156
	False Positives: 0
	False Negatives: 44
Cerber:
	True Positives: 0
	True Negatives: 164
	False Positives: 0
	False Negatives: 36
teslacrypt:
	True Positives: 0
	True Negatives: 169
	False Positives: 0
	False Negatives: 31
Locky:
	True Positives: 0
	True Negatives: 162
	False Positives: 0
	False Negatives: 38
Yakes:
	True Positives: 36
	True Negatives: 0
	False Positives: 164
	False Negatives: 0
Test Ratio: 20:80
Number of Records of Each Family Used for Testing:
Benign: 15
Reveton: 44
Cerber: 36
teslacrypt: 31
Locky: 38
Yakes: 36
Overall Accuracy: 0.18
Accuracy for 'Yakes' class only: 1.0
Number of Testing Records: 200
Benig