In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from scipy import stats
import matplotlib.pyplot as plt
import itertools


In [None]:

# Location of the input CSV.
# NOTE(review): absolute path on a local drive — adjust before running elsewhere.
input_filename = "E:/Dataset/Computer Emergency Response Team (CERT) r6.2/r6.2/r6.2/login_transformed.csv"

# Load the whole CSV into the working DataFrame used by every later cell
df_read = pd.read_csv(filepath_or_buffer=input_filename)


In [None]:

# Encode the categorical features: every object-dtype column is replaced by
# integer codes. The fitted encoders are kept in `encs`, keyed by column name.
encs = dict()
for column in df_read.columns:
    if df_read[column].dtype == "object":
        encs[column] = LabelEncoder()
        df_read[column] = encs[column].fit_transform(df_read[column])

# Create the Isolation Forest shared by every fit further down.
# NOTE(review): no random_state is passed, so scores differ between runs.
clf = IsolationForest(n_estimators=200, max_samples=300)

# Transform the hour into cyclical (sin/cos) features on a 24-unit period,
# so that the end of the cycle sits next to its start.
# NOTE(review): assumes `hour` is expressed on a 0-24 scale — confirm.
df_read['hour_sin'] = np.sin(df_read.hour*(2.*np.pi/24))
df_read['hour_cos'] = np.cos(df_read.hour*(2.*np.pi/24))

# Drop the original time feature.
# `columns=` is used instead of a positional axis (`drop('hour', 1)`), which
# pandas >= 2.0 rejects with a TypeError.
df_read = df_read.drop(columns='hour')

# Prepare the training data: everything except the `threat` label
train_data = df_read.drop(columns='threat')

# Define the function to create unique datasets
def createUniqueDataframe(features):
    """Score one feature subset.

    Takes an independent copy of the `features` columns of the module-level
    `train_data` frame and hands it to `calculateAnomalyScore`.

    Parameters
    ----------
    features : list
        Column labels to select from `train_data`.
    """
    calculateAnomalyScore(train_data[features].copy())

# Define the function to combine features
def combineFeatures():
    """Evaluate every non-empty combination of the `train_data` columns.

    Subsets are visited by increasing size (1 column, then 2, ... up to all
    columns), in the order produced by `itertools.combinations`, and each one
    is passed to `createUniqueDataframe` as a list of column labels.
    """
    column_names = train_data.columns
    subset_sizes = range(1, len(column_names) + 1)

    # Lazily chain the combinations of each size into one stream of subsets
    all_subsets = itertools.chain.from_iterable(
        itertools.combinations(column_names, size) for size in subset_sizes
    )
    for feature_subset in all_subsets:
        createUniqueDataframe(list(feature_subset))

# Define the function to calculate anomaly scores
def calculateAnomalyScore(dataset):
    """Fit the shared Isolation Forest on `dataset` and log how well it separates threats.

    The module-level `clf` is (re)fitted on `dataset`, and its decision-function
    values are stored in `df_read['anomaly_score']`, overwriting any previous
    scores. A two-sample Kolmogorov-Smirnov test then compares the scores of
    rows with `threat == 1` against rows with `threat == 0`. One line of the
    form `<feature list>;<KS statistic>;<p-value>` is appended to the output
    file.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns to train on. Its rows must line up one-to-one with
        the rows of `df_read`, because the scores are assigned back to it.

    Returns
    -------
    None
    """
    outputfile = "LabelEncoded-KS-non-cyclical.txt"

    # Train the model
    clf.fit(dataset)

    # Calculate the anomaly score (lower values mean more anomalous)
    anomaly_score = clf.decision_function(dataset)

    # Add the anomaly score to the data frame
    df_read['anomaly_score'] = anomaly_score

    non_threat_rows = df_read.loc[df_read.threat==0]    # rows labelled as non-threats
    threat_rows = df_read.loc[df_read.threat==1]        # rows labelled as threats

    features = dataset.columns.tolist()
    (ks_stat, pval) = stats.ks_2samp(threat_rows.anomaly_score, non_threat_rows.anomaly_score)

    # The context manager closes the file even if the write raises
    with open(outputfile, "a") as output_file:
        output_file.write(str(features)+";"+str(ks_stat)+";"+str(pval)+"\n")

# Run the exhaustive sweep: one Isolation Forest fit per non-empty subset of
# the training columns (2**n - 1 fits for n columns), each appending one line
# to the output file. The last subset evaluated is the full column set, so
# df_read['anomaly_score'] ends up holding the scores from that final fit.
combineFeatures()


In [None]:

# Plot the distribution of the scores for non-threats.
# `anomaly_score` holds whatever the most recent calculateAnomalyScore call wrote.
fig, ax = plt.subplots(figsize=(8, 4), dpi=600, facecolor='w', edgecolor='k')
# density=True normalises the histogram so that its area is 1; the y-axis is
# therefore a probability density, not a percentage.
normal = ax.hist(df_read.loc[df_read.threat==0].anomaly_score, bins=50, density=True)
ax.set_xlabel('Anomaly score')
ax.set_ylabel('Density')
ax.set_title("Distribution of anomaly score for non threats")
plt.show()


In [None]:

# Plot the distribution of the scores for threats.
# `anomaly_score` holds whatever the most recent calculateAnomalyScore call wrote.
fig, ax = plt.subplots(figsize=(8, 4), dpi=600, facecolor='w', edgecolor='k')
# density=True normalises the histogram so that its area is 1; the y-axis is
# therefore a probability density, not a percentage.
normal = ax.hist(df_read.loc[df_read.threat==1].anomaly_score, bins=50, density=True)
ax.set_xlabel('Anomaly score')
ax.set_ylabel('Density')
ax.set_title("Distribution of anomaly score for threats")
plt.show()


In [None]:

# Two-sample Kolmogorov-Smirnov test on the anomaly scores currently stored
# in df_read: rows labelled as threats versus rows labelled as non-threats.
avg_count_1 = df_read[df_read["threat"] == 1]    # rows labelled as threats
avg_count_0 = df_read[df_read["threat"] == 0]    # rows labelled as non-threats
ks_stat, pval = stats.ks_2samp(
    avg_count_1["anomaly_score"],
    avg_count_0["anomaly_score"],
)
print(f"KS statistic: {ks_stat}\np-value: {pval}")
