In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [2]:
# df_good = pd.read_csv('good_samples_cleaned.csv')
# df_bad = pd.read_csv('bad_samples_cleaned.csv')
# df = pd.concat([df_good, df_bad], ignore_index=True, sort=False)

df = pd.read_csv('good_samples_cleaned.csv')

df.head(5)

Unnamed: 0,LogName,Date,Time,ComputerName,SourceName,TaskCategory,EventType,Keywords,OpCode,EventCode,Type,RecordNumber,Message
0,Application,2023-09-22,00:10:15,CLIENT-PC,Microsoft-Windows-Security-SPP,,4,Classic,,16384,4,3267,Successfully scheduled Software Protection ser...
1,System,2023-09-22,00:10:01,CLIENT-PC,Microsoft-Windows-Kernel-General,,4,,Info,16,4,2279,The access history in hive \??\C:\Users\Commun...
2,Application,2023-09-22,00:09:45,CLIENT-PC,SecurityCenter,,4,Classic,Info,15,4,3266,Updated Windows Defender status successfully t...
3,System,2023-09-22,00:09:23,CLIENT-PC,Microsoft-Windows-Kernel-General,,4,,Info,16,4,2278,The access history in hive \??\C:\Users\Commun...
4,Application,2023-09-22,00:09:20,CLIENT-PC,Microsoft-Windows-Security-SPP,,2,Classic,,8198,2,3265,License Activation (slui.exe) failed with the ...


In [3]:
# Explicitly specify date and time formats
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')
df['Hour'] = df['Time'].dt.hour
df['Minute'] = df['Time'].dt.minute
df['Second'] = df['Time'].dt.second

df.drop(['Date','Time'], axis=1, inplace=True)

In [4]:
# Initialize a LabelEncoder
label_encoder = LabelEncoder()

# Encode a specific column (e.g., 'SourceName')
df['LogName'] = label_encoder.fit_transform(df['LogName'])
df['Keywords'] = label_encoder.fit_transform(df['Keywords'])
df['TaskCategory'] = label_encoder.fit_transform(df['TaskCategory'])
df['SourceName'] = label_encoder.fit_transform(df['SourceName'])
df['OpCode'] = label_encoder.fit_transform(df['OpCode'])
df['ComputerName'] = label_encoder.fit_transform(df['OpCode'])

In [5]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=1000, stop_words='english')

# Fit and transform the 'Message' column
message_bow = vectorizer.fit_transform(df['Message'])

# Convert the bag of words representation to a DataFrame
message_df = pd.DataFrame(message_bow.toarray(), columns=vectorizer.get_feature_names_out())

# Add the BoW representation as new columns in your original DataFrame
df = pd.concat([df, message_df], axis=1)

df.drop(['Message'], axis=1, inplace=True)

In [6]:
df.tail(5)

Unnamed: 0,LogName,ComputerName,SourceName,TaskCategory,EventType,Keywords,OpCode,EventCode,Type,RecordNumber,...,wsearch,x64,xboxapp_8wekyb3d8bbwe,xboxgamingoverlay_8wekyb3d8bbwe,xboxidentityprovider_8wekyb3d8bbwe,yourphone_8wekyb3d8bbwe,zone,zunemusic_11,zunemusic_8wekyb3d8bbwe,zunevideo_8wekyb3d8bbwe
346,3,1,13,10,4,6,1,27,4,2131,...,0,0,0,0,0,0,0,0,0,0
347,3,1,13,9,4,6,1,25,4,2130,...,0,0,0,0,0,0,0,0,0,0
348,3,1,13,8,4,6,1,20,4,2129,...,0,0,0,0,0,0,0,0,0,0
349,3,1,13,15,4,6,1,153,4,2128,...,0,0,0,0,0,0,0,0,0,0
350,3,1,14,0,4,6,1,12,4,2127,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X = df

# Create and train the Isolation Forest model
model = IsolationForest(contamination=0.05, random_state=42)  # You can adjust the contamination parameter
model.fit(X)

In [8]:
# Predict anomaly scores for each data point
anomaly_scores = model.decision_function(X)

# Add the anomaly scores as a new column in your DataFrame
df['AnomalyScore'] = anomaly_scores

# Identify anomalies based on a threshold (you can adjust this threshold)
threshold = -0.2
anomalies = df[df['AnomalyScore'] < threshold]

# Extract the original columns for the anomalies
#anomalies = anomalies[df]

In [9]:
df['AnomalyScore'] = anomaly_scores

In [10]:
df[df['AnomalyScore'] < -0.185]

Unnamed: 0,LogName,ComputerName,SourceName,TaskCategory,EventType,Keywords,OpCode,EventCode,Type,RecordNumber,...,x64,xboxapp_8wekyb3d8bbwe,xboxgamingoverlay_8wekyb3d8bbwe,xboxidentityprovider_8wekyb3d8bbwe,yourphone_8wekyb3d8bbwe,zone,zunemusic_11,zunemusic_8wekyb3d8bbwe,zunevideo_8wekyb3d8bbwe,AnomalyScore


In [11]:
with open('isolation_forest_model.pkl', 'wb') as fout:
    pickle.dump((vectorizer, model), fout)

In [12]:
df.shape

(351, 522)