In [None]:
import os
import sys
from typing import List, Tuple, Dict
module_path = os.path.abspath(os.path.join('../library/'))
if module_path not in sys.path:
    sys.path.append(module_path)

import analysis_utils
from anomalous_user import AnomalousUser

# Notebook-level switch. NOTE(review): nothing in the cells visible here reads
# this flag; presumably it is consumed by a helper or another notebook --
# confirm before removing it.
turn_leavers_into_binary = True

# Training the model
This notebook aggregates all the feature sets into a dataset that is ready for training. The model is then trained and tested against a specified dataset to capture results/anomalies.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from datetime import datetime, timezone
import matplotlib.pyplot as plt
import pandas as pd

# --- Inputs ------------------------------------------------------------------
# CSV holding the engineered features; it must contain the 'actor' and 'day'
# columns, both of which are used further down.
input_file = '../data/v3-ml-complete-features-20250426.csv'
# CSV used to look up the e-mail address of an actor.
users_file = '../data/allorgs_useremail_20240510.csv'

# --- Outputs -----------------------------------------------------------------
anomaly_ranked_results_md = "../data/anomalies-ranked-20250426.md"
anomaly_ranked_results_html = "../data/anomalies-ranked-20250426.html"

# Determines whether to exclude UEBA: when True every 'zscore_*' column is
# removed from the feature set before modelling.
run_as_global_only = False

# Load the features and turn the 'day' column into pandas datetimes.
df_features = pd.read_csv(input_file)
df_features['day'] = pd.to_datetime(df_features['day'])

if run_as_global_only:
    is_zscore_column = df_features.columns.str.startswith('zscore_')
    df_features = df_features.loc[:, ~is_zscore_column]

df_features.head(5)

In [None]:
# Drop rows that are exact duplicates and report the row count on either side.
rows_before = len(df_features)
print(f"Number of rows before removing duplicates: {rows_before}")

df_features = df_features.drop_duplicates()

rows_after = len(df_features)
print(f"Number of rows after removing duplicates: {rows_after}")

In [None]:
# Date range covered by the feature set.
min_day = df_features['day'].min()
max_day = df_features['day'].max()
print(f'Minimum day: {min_day}')
print(f'Maximum day: {max_day}')

# Number of calendar days spanned, counting both the first and the last day.
num_days = (max_day - min_day).days + 1
print(f'Number of days in the dataset: {num_days}')

# The results file is stamped with the last day of the data set.  The stamp
# must include the month ("%Y%m%d", matching the other dated files used by
# this notebook); the previous "%Y%d" dropped it, so e.g. 26 March and
# 26 April both produced "results-202526.csv".
target_file = f'../data/results-{max_day.strftime("%Y%m%d")}.csv'
print(f"Target file: {target_file}")

# Row count per day, in chronological order (displayed as the cell output).
print("Number of items per day:")
df_features['day'].value_counts().sort_index()

In [None]:
# Walk backwards from the most recent day, one day at a time, and report how
# many rows (and columns) the feature set holds for each of them.
for days_back in range(num_days):
    day_to_test = max_day - pd.DateOffset(days=days_back)
    rows_for_day = df_features[df_features['day'] == day_to_test]

    print(f"Day to test: {day_to_test}")
    print(f"Day shape: {rows_for_day.shape}")
    print("\n")


## Train the model

### Test the contamination value

In [None]:
contamination_test_value = 0.1

# The model is fitted on the feature columns only: 'actor' and 'day' identify
# a row but are not features.
df_input = df_features.drop(columns=['actor', 'day'])

# Fit an isolation forest with the trial contamination value.
clf = IsolationForest(random_state=0, contamination=contamination_test_value)
clf.fit(df_input)

# One score per input row; the lower the score, the more abnormal the row.
df_anomaly_values = clf.decision_function(df_input)
print(f"Anomaly values: {len(df_anomaly_values)}")

# Attach the scores to a copy of the features, ordered by day.
df_c01results = df_features.copy()
df_c01results['anomaly_score'] = df_anomaly_values
df_c01results = df_c01results.sort_values(by='day')

min_anomaly_score = df_c01results['anomaly_score'].min()

# Score at or below which a row is treated as an anomaly (picked from the chart).
anomaly_score_threshold: float = -0.38 if run_as_global_only else -0.32

plt.figure(figsize=(10, 6))
plt.scatter(df_c01results['anomaly_score'], df_c01results['day'], alpha=0.5)
plt.title('Scatter Plot of Anomaly Score vs Day')
plt.xlabel('Anomaly Score')
plt.ylabel('Day')
plt.grid(True)

# Mark the threshold with a vertical line.
plt.axvline(x=anomaly_score_threshold, color='r', linestyle='--', label='Threshold')

# Shade everything left of the threshold.  The left edge is clamped so the
# shaded band never ends up on the right-hand (normal) side of the threshold
# when no score falls below it.
shade_left_edge = min(min_anomaly_score, anomaly_score_threshold) - 0.01
plt.axvspan(shade_left_edge, anomaly_score_threshold, color='red', alpha=0.1)

# Without a legend the 'Threshold' label above is never displayed.
plt.legend()
plt.show()

# --- Disabled experiment, kept for reference ----------------------------------
# It used to live in a trailing string literal, which the notebook echoed as
# the cell output.  NOTE(review): despite the "MAD-z" name the formula below is
# a plain z-score (mean / standard deviation), not a median-absolute-deviation
# score.
#
# # get the MAD-z scores
# df_c01results['mad_z'] = (df_c01results['anomaly_score'] - df_c01results['anomaly_score'].mean()) / df_c01results['anomaly_score'].std()
# df_c01results['mad_z'] = df_c01results['mad_z'].abs()
# df_c01results['mad_z'] = df_c01results['mad_z'].round(2)
#
# # get the mad_z threshold that results in 26 anomalies
# mad_z_threshold = 0.0
# while len(df_c01results[df_c01results['mad_z'] > mad_z_threshold]) > 26:
#     mad_z_threshold += 0.01
# print(f"mad_z threshold: {mad_z_threshold}")
# print(f"Number of anomalies with mad_z > {mad_z_threshold}: {len(df_c01results[df_c01results['mad_z'] > mad_z_threshold])}")


In [None]:
contamination_test_value = 0.1

# Split the data into five periods: everything before 2025, one set for each of
# January, February and March 2025, and everything from April 2025 onwards.
# Every boundary is given as a date string, so the first comparison no longer
# requires the 'day' column to be timezone-aware (a tz-aware Timestamp cannot
# be compared with a tz-naive column).
df_input_1 = df_features[df_features['day'] < '2025-01-01']
df_input_2 = df_features[(df_features['day'] >= '2025-01-01') & (df_features['day'] < '2025-02-01')]
df_input_3 = df_features[(df_features['day'] >= '2025-02-01') & (df_features['day'] < '2025-03-01')]
df_input_4 = df_features[(df_features['day'] >= '2025-03-01') & (df_features['day'] < '2025-04-01')]
df_input_5 = df_features[df_features['day'] >= '2025-04-01']

# Score at or below which a row is treated as an anomaly (same for every period).
anomaly_score_threshold: float = -0.38 if run_as_global_only else -0.32

# Fit and plot one model per period.  Period-local names are used on purpose:
# df_input, clf and df_c01results from the full-data-set run must survive this
# cell, because the contamination calculation that follows reads df_c01results
# and relates it to the size of the whole data set.
for set_number, df_set in enumerate([df_input_1, df_input_2, df_input_3, df_input_4, df_input_5], start=1):
    df_period_input = df_set.drop(columns=['actor', 'day'])

    print(f"Data set {set_number}")
    print(f"Shape: {df_period_input.shape}")

    # The estimator cannot be fitted on zero rows.
    if df_set.empty:
        print("No rows in this period - skipped")
        continue

    period_clf = IsolationForest(random_state=0, contamination=contamination_test_value)
    period_clf.fit(df_period_input)

    period_scores = period_clf.decision_function(df_period_input)
    print(f"Anomaly values: {len(period_scores)}")

    # Attach the scores to a copy of this period's rows, ordered by day.
    df_period_results = df_set.copy()
    df_period_results['anomaly_score'] = period_scores
    df_period_results = df_period_results.sort_values(by='day')

    period_min_score = df_period_results['anomaly_score'].min()

    plt.figure(figsize=(10, 6))
    plt.scatter(df_period_results['anomaly_score'], df_period_results['day'], alpha=0.5)
    plt.title('Scatter Plot of Anomaly Score vs Day')
    plt.xlabel('Anomaly Score')
    plt.ylabel('Day')
    plt.grid(True)

    # Mark the threshold with a vertical line.
    plt.axvline(x=anomaly_score_threshold, color='r', linestyle='--', label='Threshold')

    # Light red box from the threshold down to just below the lowest score.
    plt.axvspan(anomaly_score_threshold, period_min_score - 0.01, color='red', alpha=0.1)
    plt.show()

### Calculate the contamination value based on the chart

In [None]:
# Scored rows whose anomaly score is at or below the threshold.
anomalies_below_threshold = df_c01results[df_c01results['anomaly_score'] <= anomaly_score_threshold]
num_anomalies_below_threshold: int = len(anomalies_below_threshold)
num_anomalies: int = len(df_c01results)  # number of scored rows
num_total_rows: int = len(df_features)

# Guard the divisions so an empty data set surfaces through the explicit check
# at the end of the cell rather than as a bare ZeroDivisionError.
perc_anomalies_below_threshold: float = (
    num_anomalies_below_threshold / num_anomalies if num_anomalies else 0.0
)
contamination_value: float = (
    num_anomalies_below_threshold / num_total_rows if num_total_rows else 0.0
)

print(f"Number of anomalies with score < {anomaly_score_threshold}: {num_anomalies_below_threshold} of {num_anomalies}")
print(f"Percentage of anomalies with score < {anomaly_score_threshold}: {perc_anomalies_below_threshold * 100:.2f}%")
# Report the real row count (this used to print len(df_c01results) / contamination_test_value).
print(f"Total number of rows: {num_total_rows}")
print(f"Contamination value calculated: {contamination_value:.6f}")

# IsolationForest only accepts a numeric contamination in the range (0, 0.5];
# fail here with a clear message instead of inside the training cell.
assert 0.0 < contamination_value <= 0.5, (
    f"contamination_value={contamination_value} is outside (0, 0.5]: "
    f"{num_anomalies_below_threshold} of {num_total_rows} rows are at or below "
    f"the threshold {anomaly_score_threshold}; adjust anomaly_score_threshold"
)


### Train

In [None]:
# Settings of the final model: fixed seed plus the contamination derived from
# the threshold analysis.
model_options = {'random_state': 0, 'contamination': contamination_value}

# Model inputs are the feature columns only; 'actor' and 'day' are dropped.
df_input = df_features.drop(columns=['actor', 'day'])

# Build the isolation forest and fit it on the complete data set.
clf = IsolationForest(**model_options)
clf.fit(df_input)


### Apply the Model

In [None]:
def append_email(df, users_file):
    """Add an 'email' column to ``df`` by looking each actor up in a CSV file.

    Every row of ``users_file`` contributes one entry: the first field is the
    actor id and the second-to-last field is the e-mail address.  When an id
    occurs more than once the last occurrence wins.  Rows with fewer than two
    fields (for example a trailing blank line) are skipped.

    Args:
        df: DataFrame with an 'actor' column.  It is modified in place.
        users_file: Path of the CSV file holding the actor / e-mail pairs.

    Returns:
        None.  Actors missing from the file get NaN in the 'email' column.
    """
    # Imported here so the function does not rely on the notebook's import cell.
    import csv

    actor_to_email = {}
    # csv.reader copes with quoted fields (including embedded commas), which a
    # plain str.split(',') does not.
    with open(users_file, 'r', newline='') as file:
        for fields in csv.reader(file):
            if len(fields) < 2:
                continue
            actor_to_email[fields[0]] = fields[-2]

    df['email'] = df['actor'].map(actor_to_email)


def test_dataset(df, clf, day_to_test) -> list[tuple[str, datetime, float, str]]:

    df_test = df[df['day'] == day_to_test].copy()
    df_test_results = df_test.copy(deep=False)

    df_test = df_test.drop(columns=['actor', 'day'])

    # get the anomaly scores for each row in the dataset
    df_test_results["anomaly_score"] = clf.decision_function(df_test)
    df_test_results["anomaly"] = clf.predict(df_test)

    # filter the dataset where anomaly is -1 and sort by anomaly scores
    df_anomaly = df_test_results[df_test_results['anomaly'] == -1].sort_values(by='anomaly_score', ascending=True)

    append_email(df_anomaly, users_file)

    columns_to_move = ['anomaly_score', 'anomaly', 'email']

    columns = list(df_anomaly.columns)
    for col in columns_to_move:
        columns.remove(col)

    columns.insert(2, columns_to_move[0])
    columns.insert(3, columns_to_move[1])
    columns.insert(4, columns_to_move[2])

    df_anomaly = df_anomaly[columns]
    df_anomaly.drop(columns=['anomaly'], inplace=True)

    anomalies = []
    for index, row in df_anomaly.iterrows():
        anomalies.append((row['actor'], row['day'], row['anomaly_score'], row['email']))

    return anomalies


# Score every day covered by the data set, newest first, and pool the
# (actor, day, anomaly_score, email) tuples found for each of them.
df_anomalies = []
for days_back in range(num_days):
    day_to_test = max_day - pd.DateOffset(days=days_back)
    print(f"Testing day: {day_to_test}...")
    df_anomalies += test_dataset(df_features, clf, day_to_test)

df_anomalies

In [None]:
# Look-up tables keyed by (actor, day).  Should the same key occur more than
# once in df_anomalies, the last entry wins.
score_by_key = {}
email_by_key = {}
for actor, day, anomaly_score, email in df_anomalies:
    score_by_key[(actor, day)] = anomaly_score
    email_by_key[(actor, day)] = email

# Work on a copy so df_features itself stays free of result columns.
df_results = df_features.copy()

# One dictionary look-up per row instead of one boolean mask over the whole
# frame per anomaly.  Rows that were not flagged get NaN.
row_keys = list(zip(df_results['actor'], df_results['day']))
df_results['anomaly_score'] = pd.Series(
    [score_by_key.get(key, np.nan) for key in row_keys],
    index=df_results.index,
    dtype=float,
)
# The e-mail used to be left as NaN for every row (its assignment was
# commented out), so the AnomalousUser objects and the report further down
# never received an address.  NOTE(review): the address is now also part of
# the CSV export of df_results - confirm that is acceptable.
df_results['email'] = pd.Series(
    [email_by_key.get(key, np.nan) for key in row_keys],
    index=df_results.index,
    dtype=object,
)

# Keep only the rows that were flagged as anomalies.
df_results = df_results[df_results['anomaly_score'].notna()]

# Most anomalous first; ties are ordered from the most recent day backwards.
df_results = df_results.sort_values(by=['anomaly_score', 'day'], ascending=[True, False])

# Show the ten most anomalous rows (identifying columns only).
df_results[['actor', 'day', 'anomaly_score', 'email']].head(10)

In [None]:
# Persist the flagged rows (DataFrame index included) to the dated results file.
df_results.to_csv(path_or_buf=target_file)

# Analyze the results

In [None]:
# Summarise df_results and keep only the 95th / 99th percentile of every
# described column (one row per column after the transpose).
# NOTE(review): the percentiles are taken over df_results (the flagged rows),
# whereas the original comment spoke of df_features - confirm which population
# is intended.
summary_columns_to_discard = ['count', 'mean', 'std', 'min', 'max', '50%']
df_stat_cmp = (
    df_results
    .describe(percentiles=[0.95, 0.99])
    .transpose()
    .drop(columns=summary_columns_to_discard)
)

# Leave out every feature whose name contains 'zscore'.
has_zscore_name = df_stat_cmp.index.str.contains('zscore')
df_stat_cmp = df_stat_cmp[~has_zscore_name]

df_stat_cmp

In [None]:
# Columns that identify or score a row; they are not inspected as features.
meta_columns = ['actor', 'day', 'anomaly_score', 'email']
# Columns reported whenever their value is positive.
fixed_columns = ['leaver_action', 'is_weekend']

ausers: List[AnomalousUser] = []

# Build one AnomalousUser per flagged row and attach every feature of that row
# that stands out.
for _, row in df_results.iterrows():
    auser = AnomalousUser(row['actor'], row['day'], row['anomaly_score'], row['email'])
    ausers.append(auser)

    for col in df_results.columns:
        if col in meta_columns:
            continue

        value = row[col]

        # 'zscore' features use a fixed threshold of 3.0.
        if "zscore" in col:
            if value >= 3.0:
                auser.add_anomaly(col, value, AnomalousUser.AnomalyType.ZSCORE, 3.0)
            continue

        if col in fixed_columns and value > 0:
            auser.add_anomaly(col, value, AnomalousUser.AnomalyType.FIXED, 1)
            continue

        # df_stat_cmp holds the 95% value at position 0 and the 99% value at
        # position 1; the stricter level is tested first.
        percentiles = df_stat_cmp.loc[col].values
        if value >= percentiles[1] and value > 0:
            auser.add_anomaly(col, value, AnomalousUser.AnomalyType.NINETYNINE, percentiles[1])
        elif value >= percentiles[0] and value > 0:
            auser.add_anomaly(col, value, AnomalousUser.AnomalyType.NINETYFIVE, percentiles[0])

# Bucket the users by their 'day' attribute.
ausers_groups: Dict[str, List[AnomalousUser]] = dict()
for auser in ausers:
    ausers_groups.setdefault(auser.day, []).append(auser)

# Report the number of anomalies per day, in ascending day order.
for day, day_users in sorted(ausers_groups.items(), key=lambda item: item[0]):
    print(f"Day: {day} - Number of anomalies: {len(day_users)}")

# Determine most anomalous features

## Cutoff analysis logic

In [None]:
from scipy import stats

# Percentile cutoffs to evaluate.
cutoff_test = [95.0, 96.0, 97.0, 98.0, 99.0, 99.1, 99.2, 99.3, 99.4, 99.5, 99.6, 99.7, 99.8, 99.9, 99.99]

# Every column of df_results except the identifying / score columns is a feature.
meta_columns = ['actor', 'day', 'anomaly_score', 'email']
feature_columns = [col for col in df_results.columns if col not in meta_columns]

# Per cutoff: running total of features at or above it (turned into an average
# per anomaly below) and the smallest count seen for a single anomaly.  The
# minimum starts at the number of features, the largest count a row can reach,
# instead of an arbitrary constant.
cutoff_values = [0] * len(cutoff_test)
cutoff_mins = [len(feature_columns)] * len(cutoff_test)

# One (actor, day, anomaly_score, email, feature_ranks) tuple per anomaly.
ranked_results = []

for index, row in df_results.iterrows():

    # (feature name, percentile rank within df_features, raw value) per feature.
    feature_ranks = [
        (col, stats.percentileofscore(df_features[col], row[col]), row[col])
        for col in feature_columns
    ]

    # Highest percentile first, with every 'zscore' feature moved behind the
    # other features; ties keep their column order.
    feature_ranks.sort(key=lambda rank: ('zscore' in rank[0], -rank[1]))

    ranked_results.append((row['actor'], row['day'], row['anomaly_score'], row['email'], feature_ranks))

    for i, cutoff in enumerate(cutoff_test):
        # Number of features of this anomaly at or above the cutoff.
        cutoff_counts = sum(1 for _, percentile_rank, _ in feature_ranks if percentile_rank >= cutoff)
        cutoff_values[i] += cutoff_counts
        cutoff_mins[i] = min(cutoff_mins[i], cutoff_counts)

# Average number of qualifying features per anomaly; guarded so an empty
# df_results does not raise ZeroDivisionError.
num_result_rows = df_results.shape[0]
cutoff_values = [val / num_result_rows if num_result_rows else 0.0 for val in cutoff_values]

# Print the average and the minimum for each cutoff.
for i, cutoff in enumerate(cutoff_test):
    print(f"{cutoff} cutoff_avg: {cutoff_values[i]: 0.2f}, cutoff_min: {cutoff_mins[i]}")


## Cutoff results

Cutoff analysis was performed on each of the 180 days to determine the optimal cutoff, minimizing the number of results to investigate once anomalies have been identified.

## Filter analysis

In [None]:
"""
Create a markdown file that shows the list of anomalies, grouped by the user in order of anomaly score (lowest to higest).
"""

# Minimum percentile rank (on the 0-100 scale) a feature value must reach to be
# listed in the report and to be counted as a significant feature.
cutoff_key = 99.0

def create_markdown_file(ranked_results, cutoff_key, filename, first_day=None, last_day=None):
    """Write the anomalies to a markdown report, one section per actor.

    Args:
        ranked_results: Iterable of ``(actor, day, anomaly_score, email,
            feature_ranks)`` tuples, where ``feature_ranks`` is a list of
            ``(feature name, percentile rank, value)`` tuples.
        cutoff_key: Minimum percentile rank for a feature to be listed.
            'is_weekend' is additionally listed whenever its value is positive.
        filename: Path of the markdown file to create (overwritten if present).
        first_day: First day of the reported period; defaults to the
            notebook-level ``min_day``.
        last_day: Last day of the reported period; defaults to the
            notebook-level ``max_day``.

    Returns:
        None.  The report is written to ``filename``.
    """
    # Fall back to the date range computed earlier in the notebook.
    first_day = min_day if first_day is None else first_day
    last_day = max_day if last_day is None else last_day

    def format_value(value):
        """Render a value, cutting plain decimals after two decimal places."""
        text = str(value)
        # Scientific notation ("1.5e-05") is left untouched: cutting it after
        # the decimal point would drop the exponent and corrupt the number.
        if '.' in text and 'e' not in text.lower():
            return text[:text.find('.') + 3]
        return text

    with open(filename, 'w') as f:
        f.write(f"# Anomalous Users: {first_day.strftime('%Y-%m-%d')} to {last_day.strftime('%Y-%m-%d')}\n")

        num_unique_actors = len({result[0] for result in ranked_results})
        # Both the first and the last day belong to the period, hence the +1.
        num_report_days = (last_day - first_day).days + 1
        f.write(f"This is {num_report_days} days with {num_unique_actors} actor performing {len(ranked_results)} anomalies.\n")

        # Group by actor; within an actor the lowest anomaly score comes first.
        ranked_results = sorted(ranked_results, key=lambda x: (x[0], x[2]))

        last_actor = None
        for actor, day, anomaly_score, email, feature_ranks in ranked_results:
            # A new actor starts a new section with its own table header.
            if actor != last_actor:
                last_actor = actor

                f.write("\n")
                f.write(f"## {actor}, {email}\n")
                f.write("| Day | Anomaly Score | Feature | % Rank | Value |\n")
                f.write("|-----|---------------|---------|--------|-------|\n")

            # Day and score are printed on the first listed feature only.
            new_anomaly = True
            for col, percentile_rank, value in feature_ranks:
                if percentile_rank >= cutoff_key or (col in ['is_weekend'] and value > 0):
                    val_str = format_value(value)
                    if new_anomaly:
                        f.write(f"| {day} | {anomaly_score:0.6f} | {col} | {percentile_rank:.2f} | {val_str} |\n")
                        new_anomaly = False
                    else:
                        f.write(f"| | | {col} | {percentile_rank:.2f} | {val_str} |\n")

# Write the markdown report for the chosen cutoff.
create_markdown_file(ranked_results, cutoff_key, anomaly_ranked_results_md)

# Echo the generated markdown into the cell output.
with open(anomaly_ranked_results_md, 'r') as report_file:
    report_text = report_file.read()
print(report_text)

# Render the markdown to HTML with the external `pandoc` command; the exit
# status returned by os.system is shown as the cell result.
pandoc_command = f"pandoc {anomaly_ranked_results_md} -o {anomaly_ranked_results_html}"
os.system(pandoc_command)

In [None]:
# Count, per feature, how many anomalies list it as significant: percentile
# rank at or above cutoff_key, or a positive 'is_weekend' value.
significant_features: Dict[str, int] = dict()
for result in ranked_results:
    feature_ranks = result[4]
    for col, percentile_rank, value in feature_ranks:
        is_significant = percentile_rank >= cutoff_key or (col in ['is_weekend'] and value > 0)
        if not is_significant:
            continue
        significant_features[col] = significant_features.get(col, 0) + 1

# Most frequent feature first.
significant_features_list = sorted(
    significant_features.items(),
    key=lambda item: item[1],
    reverse=True,
)

print(f"Total features: {len(significant_features)}")
for feature_name, occurrences in significant_features_list:
    print(f"{feature_name}: {occurrences}")