# Gathering and Generating Features
This notebook is used to aggregate all the feature sets into a dataset that's ready for training. 

In [None]:
import pandas as pd
import numpy as np
import sys, os
from sklearn.ensemble import IsolationForest
from datetime import datetime, timezone

# import module from parent directory
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from library.analysis_utils import AnomalyV3Attrs

# Notebook display setting: render up to 180 rows when showing a frame.
pd.set_option('display.max_rows', 180)

# Source feature file and destination for the completed feature set.
input_file = '../data/v3-ml-features-20250508.csv'
target_file = '../data/v3-ml-complete-features-20250508.csv'

# NOTE(review): not referenced by the cells visible here; presumably the
# contamination setting for the imported IsolationForest -- confirm.
contamination_value = 0.009

# Load the dataset and turn the 'day' column into datetimes in one chained step.
df = pd.read_csv(input_file).assign(day=lambda frame: pd.to_datetime(frame['day']))

df.head(5)

In [None]:
# Keep only the columns listed in AnomalyV3Attrs.BASE_FEATURES_TO_USE and preview the result.
df = df[AnomalyV3Attrs.BASE_FEATURES_TO_USE]
df.head(5)

# NOTE(review): per the original author, new_repo_git_clone, new_repo_git_push, new_repo_download,
# new_repo_workflow_run and active_read_repos_accessed are excluded -- presumably by being absent
# from BASE_FEATURES_TO_USE; confirm in library.analysis_utils.

In [None]:
# Determine the date span covered by the dataset once and show it explicitly
# (a bare expression is only rendered when it is the last line of a cell).
first_day = df['day'].min()
last_day = df['day'].max()
print(f"Date range: {first_day} to {last_day}")

# Count the rows of every day in a single pass instead of re-filtering the
# whole frame once per day.
rows_per_day = df['day'].value_counts()

# Walk every calendar day in the span so that days without any rows show up as 0.
for day in pd.date_range(start=first_day, end=last_day):
    print(f"{day.strftime('%Y%m%d')}: {rows_per_day.get(day, 0)}")

In [None]:
# Snapshot the selected base features, then summarise every described column by
# its min, median, upper percentiles and max (one row per feature).
df_basefeatures: pd.DataFrame = df.copy()

summary_percentiles = [0.5, 0.75, 0.95, 0.99]
summary_rows = ['min', '50%', '75%', '95%', '99%', 'max']
description_frame = df_basefeatures.describe(percentiles=summary_percentiles).loc[summary_rows].T
display(description_frame)

display(df_basefeatures.shape)

## Create additional features

Additional features are generated from how each user's behavior varies over time (deviation from that user's own history), rather than from comparing a single day against all others.

In [None]:
# Start from the configured list of columns that never get a deviation feature.
columns_to_zscore_exclude = set(AnomalyV3Attrs.FEATURES_TO_ZSCORE_EXCLUDE)

# Also exclude every column whose maximum is 1 or less: such values are already
# significant on an individual day, so no deviation feature is needed for them.
bool_cols = description_frame['max'] <= 1
tiny_columns = bool_cols[bool_cols].index.tolist()
columns_to_zscore_exclude.update(tiny_columns)

print(f"Exclude columns: {columns_to_zscore_exclude}")

In [None]:
# Build the per-actor statistics table in one chain:
#   1. drop the columns excluded from deviation features,
#   2. drop 'day', which is not relevant for this calculation,
#   3. compute the mean and standard deviation of every remaining column per actor,
#   4. replace every NaN in the result with 0.0 (std is NaN for an actor with a single row).
df_deviations = (
    df_basefeatures
    .drop(columns=columns_to_zscore_exclude)
    .drop(columns=['day'])
    .groupby('actor')
    .agg(['mean', 'std'])
    .fillna(0.0)
)

df_deviations

In [None]:
import math

# Work on a copy of the base features; the zscore_ columns are added to this frame below.
df_fulldayset = df_basefeatures.copy()

# compute one actor-relative z-score for a single row/column pair
def create_actor_zscore_column(column, row, df_deviations):
    """Return the absolute z-score of ``row[column]`` against the actor's own statistics.

    Args:
        column: name of the feature column to score.
        row: mapping-like row providing the ``'actor'`` and ``column`` values.
        df_deviations: frame indexed by actor that holds ``(column, 'mean')``
            and ``(column, 'std')`` columns.

    Returns:
        ``abs((value - mean) / std)``, or ``0.0`` when the actor's std is
        numerically zero. A NaN result is reported with ``print`` and then
        returned unchanged.
    """
    actor = row['actor']
    mean = df_deviations.loc[actor, (column, 'mean')]
    std = df_deviations.loc[actor, (column, 'std')]
    value = row[column]

    # no spread for this actor: score the value as "no deviation"
    if math.isclose(std, 0, abs_tol=1e-9):
        return 0.0

    zscore = abs((value - mean) / std)
    if math.isnan(zscore):
        print(f"Nan values: actor {actor}, column {column}, value {value}, mean {mean}, std {std}")
    return zscore

# Add one 'zscore_<feature>' column to df_fulldayset for every feature that has
# per-actor statistics in df_deviations (level 0 of its column MultiIndex).
# NOTE(review): this is a row-wise apply per feature and will be slow on large
# frames; a vectorised per-actor computation would be considerably faster.
dev_columns = df_deviations.columns.get_level_values(0).unique().tolist()
for column in sorted(dev_columns):
    print(f"Processing column: {column}")
    zscore_name = 'zscore_' + column
    df_fulldayset[zscore_name] = df_fulldayset.apply(
        lambda row: create_actor_zscore_column(column, row, df_deviations),
        axis=1,
    )

print(f"Shape of df_fulldayset: {df_fulldayset.shape}")

In [None]:
# Show the (rows, columns) of the frame now that the zscore_ columns have been added.
df_fulldayset.shape

In [None]:
# Copy the frame that now holds the base features plus the zscore_ columns and
# summarise every described column (min, median, upper percentiles, max).
df_fullfeatures = df_fulldayset.copy()

summary_percentiles = [0.5, 0.75, 0.95, 0.99]
summary_rows = ['min', '50%', '75%', '95%', '99%', 'max']
full_description = df_fullfeatures.describe(percentiles=summary_percentiles).loc[summary_rows].T
display(full_description)

# Do any data analysis

In [None]:
print("Check for any rows with NaN values...")

# Show every row that has at least one missing value (transposed, so each
# offending row is rendered as a column).
rows_with_nan = df_fullfeatures.isnull().any(axis=1)
display(df_fullfeatures[rows_with_nan].T)

# Keep a NaN-free copy of the feature set.
df_fullfeatures7d = df_fullfeatures.dropna()

## Write out the complete feature set

In [None]:
# Persist the NaN-free frame built by the NaN check (df_fullfeatures7d).
# Writing df_fullfeatures here would discard that clean-up and put the rows
# with missing values back into the training file.
df_fullfeatures7d.to_csv(target_file, index=False)